# Imports

In [55]:
import pandas as pd
import numpy as np

import unicodedata
import string

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

from IPython.core.display import display, HTML

# Data

In [56]:
df = pd.read_csv('data/postings.csv')

In [57]:
resume = None

with open('data/Liveproject Resume.txt') as file_handle:
    resume = file_handle.read()

In [58]:
df.head()

Unnamed: 0,job_posting,title,body,bullets
0,3157fcef3ee474da_fccid.html,"Data Scientist - Mountain View, CA","Data Scientist - Mountain View, CA\nGroundTrut...","('Help senior members of the team to explore, ..."
1,b423ca22a6e2c10f_fccid.html,"Data Scientist - Seattle, WA","Data Scientist - Seattle, WA\nA Bachelor or Ma...",('A Bachelor or Masters Degree in a highly qua...
2,a559b6630c13783d_fccid.html,"Junior Data Scientist - College Park, MD 20740","Junior Data Scientist - College Park, MD 20740...",('Degree: Bachelor’s degree in business analyt...
3,f579e807b5804620_fccid.html,"Data Scientist - New York, NY","Data Scientist - New York, NY\nDescription\nDS...","('Languages: Python, PySpark, SQL', 'Data Tool..."
4,13c9ffc0bcb07c8d_fccid.html,"(Entry-Level) Data Scientist - Chicago, IL","(Entry-Level) Data Scientist - Chicago, IL\nDa...",('Be the go-to person for Data ingest and stor...


In [59]:
df.shape

(547, 4)

# Data Cleaning

In [60]:
stop_words = stopwords.words('english')

In [61]:
def clean_text(text):

    p_text = text.translate(str.maketrans('', '', string.punctuation))
    p_text = p_text.translate(str.maketrans('', '', string.digits))
    p_text = unicodedata.normalize('NFKD', p_text)\
                        .encode('ascii', 'ignore')\
                        .decode('utf-8', 'ignore')\
                        .replace('\n\t', ' ')\
                        .replace('\n', ' ')\
                        .replace('  ', ' ')\
                        .replace('   ', ' ')\
                        .rstrip()\
                        .lower()

    p_text = ' '.join([word for word in word_tokenize(p_text) if word not in stop_words])
    return p_text

In [66]:
df['requirements'] = df.bullets.values

In [68]:
df.requirements = df.requirements.apply(lambda x: x[1:-1])

In [73]:
df.requirements

0      help senior members team explore develop produ...
1      bachelor masters degree highly quantitative fi...
2      degree bachelors degree business analytics dat...
3      languages python pyspark sql data tools spark ...
4      goto person data ingest storage across cloud h...
                             ...                        
542    applicable degrees computer information system...
543    experience python knowledge following numpy sc...
544                                                     
545    independently develop advanced analytics predi...
546    experienced handling large data sets using sql...
Name: requirements, Length: 547, dtype: object

In [74]:
df.requirements = df.requirements.apply(clean_text)

In [75]:
df.requirements.head(n=1).values

array(['help senior members team explore develop productionize optimize machine learning algorithms pipelines use hadoop spark amazon athena daily basis explore petabytes data dive deep rich set location data derive insights build product prototypes collaborate peer data scientists engineers product managers closely master degree computer science statistics mathematics engineering phd plus experience statistics machine learning fluency python significant experience sql relational databases nosql data stores familiarity open source machine learning libraries scikitlearn spark mllib experience amazon web services plus excellent communication skills'],
      dtype=object)

In [76]:
print(resume)

﻿Good Student
Data Scientist
	  

Good Student
123 Fake Street
Some City, QT 12345
123.456.7890
no_reply@fakesite.com
	ㅡ
Skills
	  

Python, Pandas, machine learning, natural language processing
	ㅡ
Experience
	  

Manning / Data Analyst
Oct 2019 - PRESENT,  REMOTE
Analyzed and visualized vast amounts of data using Pandas, Python, and Matplotlib.
	ㅡ
Education
	  

Berkeley / B.S. Mathematics
August 2015 - May 2019,  BERKELEY, CA
Graduated summa cum laude.

	ㅡ
Awards
	  

Tau Beta Pi Honors Society



In [77]:
resume_clean = clean_text(resume)

In [78]:
print(resume_clean)

good student data scientist good student fake street city qt noreplyfakesitecom skills python pandas machine learning natural language processing experience manning data analyst oct present remote analyzed visualized vast amounts data using pandas python matplotlib education berkeley bs mathematics august may berkeley ca graduated summa cum laude awards tau beta pi honors society


# NLP

In [79]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [80]:
all_text = [resume_clean] + list(df.requirements.values)

In [81]:
tfidf_matrix = tfidf_vectorizer.fit_transform(all_text)

In [82]:
tfidf_matrix.shape

(548, 6916)

In [83]:
tfidf_matrix

<548x6916 sparse matrix of type '<class 'numpy.float64'>'
	with 68149 stored elements in Compressed Sparse Row format>

In [84]:
tfidf_matrix[1:]

<547x6916 sparse matrix of type '<class 'numpy.float64'>'
	with 68104 stored elements in Compressed Sparse Row format>

In [85]:
tfidf_np_array = tfidf_matrix.toarray()

In [86]:
tfidf_np_array.shape

(548, 6916)

In [87]:
resume_vector = tfidf_np_array[0]

In [88]:
resume_vector

array([0., 0., 0., ..., 0., 0., 0.])

In [89]:
resume_non_zero_indicese = np.flatnonzero(resume_vector)

In [90]:
resume_non_zero_indicese

array([ 262,  276,  291,  498,  536,  622,  627,  727,  762,  925, 1411,
       1458, 1920, 2207, 2303, 2633, 2661, 2841, 3376, 3395, 3425, 3613,
       3669, 3722, 3732, 4055, 4183, 4255, 4432, 4597, 4767, 4829, 5015,
       5033, 5255, 5527, 5737, 5779, 6000, 6021, 6077, 6194, 6627, 6674,
       6741])

In [95]:
tfidf_vectorizer.get_feature_names()[6741]

'visualized'

In [96]:
resume_vector[439]

0.0

In [97]:
cosine_similarities = tfidf_np_array @ resume_vector

In [98]:
cosine_similarities.shape

(548,)

In [104]:
np.argsort(cosine_similarities)

array([312, 258, 528, 425, 398, 362, 330, 434,  84, 514, 214, 211, 357,
       510, 207, 202, 122, 189, 183, 126, 179, 390, 345, 149,  43, 283,
       331,  18, 308, 291, 545, 537, 208,  10, 269, 243, 540,  61, 153,
       109, 314, 170, 154, 127, 286, 431, 396, 176, 288, 421, 499, 435,
       192, 530, 216,  32, 132, 199, 526, 278, 428, 247, 422,  31, 203,
       501, 474, 296, 473, 190, 493, 439, 365,  95, 488, 152, 504,  94,
        89,  13, 295, 105, 492, 352, 250, 238, 502, 134, 478, 182,   6,
       407, 534, 299, 417, 336, 452,  16, 517, 173, 438, 527, 316, 356,
        67, 332,   3,  99, 271, 108, 253, 209, 523, 531, 371, 423, 244,
       226,  42,  23, 412, 104,  49, 401, 451, 321, 342,  85, 264, 542,
       317, 475, 338, 369, 213, 456, 197,  37, 327, 393, 280,  22, 379,
       268, 339, 270, 267, 370, 382,  57,  38, 449, 160, 377, 521, 525,
        46, 164, 385,  87, 381, 307,  73, 260, 430, 500,  58,  81,   8,
       123, 470, 348,  25, 227, 375,  53, 329, 350,  50, 290, 12

In [105]:
sorted_postings = np.argsort(cosine_similarities)[::-1][1:]

In [106]:
print(df[df.index == sorted_postings[0]].job_posting.values)

['5ad1f895aff1a7bd_fccid.html']


In [109]:
sorted_postings[-1]

312

## The Highest Match

In [110]:
with open('data/html_job_postings/'+df[df.index == sorted_postings[0]].job_posting.values[0]) as jpost:
    display(HTML(jpost.read()))

## The Lowest Match

In [111]:
with open('data/html_job_postings/'+df[df.index == sorted_postings[-1]].job_posting.values[0]) as jpost:
    display(HTML(jpost.read()))