# Objective
Narrow down the set of job postings to those that are most similar to our resume in preparation for further analysis.

# Prepare corpus 

1. Import and remove the resume text of escape sequences.
2. Import the job posting dataset and clean the job requirement column.
3. Combine the job requirement column and the resume text to create a text corpus. 

In [1]:
import pathlib
import pandas as pd
import numpy as np

cur_path = pathlib.Path()
data_path = cur_path.joinpath('output/data.csv')
resume_path = cur_path.joinpath('data/LiveProject Resume.txt')

In [2]:
with open(resume_path) as r:
    resume = r.read()

import re
resume = re.sub('\\t|\\n|\\ufeff|ㅡ', ' ', resume) 
resume

' Good Student Data Scientist      Good Student 123 Fake Street Some City, QT 12345 123.456.7890 no_reply@fakesite.com    Skills      Python, Pandas, machine learning, natural language processing    Experience      Manning / Data Analyst Oct 2019 - PRESENT,  REMOTE Analyzed and visualized vast amounts of data using Pandas, Python, and Matplotlib.    Education      Berkeley / B.S. Mathematics August 2015 - May 2019,  BERKELEY, CA Graduated summa cum laude.     Awards      Tau Beta Pi Honors Society '

In [3]:
df = pd.read_csv(data_path)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 754 entries, 0 to 753
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   html      754 non-null    object
 1   position  754 non-null    object
 2   location  754 non-null    object
 3   company   110 non-null    object
 4   jobreq    753 non-null    object
dtypes: object(5)
memory usage: 29.6+ KB
None


In [4]:
# remove observation with null value for 'jobreq'
df.dropna(axis = 0, subset = ['jobreq'], inplace = True)
df.reset_index(inplace = True, drop = True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   html      753 non-null    object
 1   position  753 non-null    object
 2   location  753 non-null    object
 3   company   110 non-null    object
 4   jobreq    753 non-null    object
dtypes: object(5)
memory usage: 29.5+ KB
None


In [5]:
"""
Attach resume to df['jobreq'] and name the pd.Series 'corpus'.
The last entry is the resume text.
"""
corpus = df['jobreq'].append(pd.Series(resume)).reset_index(drop = True)
print(corpus)

0      Quantitative Analyst (State Street Bank and Tr...
1      Help senior members of the team to explore, de...
2      A Bachelor or Masters Degree in a highly quant...
3      Join a small team creating a proprietary NLU s...
4      Degree: Bachelor’s degree in business analytic...
                             ...                        
749    Applicable degree(s): Computer Information Sys...
750    Experience with Python Knowledge of some of th...
751    Independently develop advanced analytics and p...
752    Experienced in handling large data sets using ...
753     Good Student Data Scientist      Good Student...
Length: 754, dtype: object


# Compute document frequency of words appearing in the resume

Compute what proportion of job postings share the words that appeared in the resume text. 

1. Set the parameters of `sklearn.TfidfVectorizer` to 
    - stop commonly used English words (`stop_words = "english"`),
    - pick up words that are shared in at least 4 job postings but not appearing in more than 80% of the job postings (`min_df = 4, max_df = 0.8`),
    - and detect unigrams and bigrams (`ngram_range = (1, 2)`),
    - with ivnerse df enabled (`use_idf = True`).
1. Obtain inverse document frequenciese (idf) of the words used in the resume. 
2. Rank the words from the smallest to largest idf where smaller idf means the word is shared across large number of job postings and vice versa.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer 

vectorizer = TfidfVectorizer(stop_words = "english", min_df = 4, max_df = 0.8, ngram_range = (1, 2)) 
tfidf_matrix = vectorizer.fit_transform(corpus) 

tfidf_np_matrix = tfidf_matrix.toarray() # convert to numpy array
print(tfidf_np_matrix)
print(tfidf_np_matrix.shape)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(754, 8305)


In [7]:
from collections import namedtuple

feature_extraction_result = namedtuple('corpus_features', 
                            ['csr_matrix', 'np_matrix', 'features', 'params'])

def resume_words_df():
    
    # Fit vectorizer
    vectorizer = TfidfVectorizer(stop_words = "english", min_df = 4, max_df = 0.8, 
                                 ngram_range = (1, 2), use_idf = True) 
    tfidf_matrix = vectorizer.fit_transform(corpus) 
    
    tfidf_np_matrix = tfidf_matrix.toarray()
    print("{0} documents and {1} features".format(*tfidf_np_matrix.shape))
    
    
    # Get the words in the resume
    words = vectorizer.get_feature_names()
    
    resume_vector = tfidf_np_matrix[-1] # the last document/row is the resume 
    nonzero_indices = np.flatnonzero(resume_vector)
    resume_words = [words[i] for i in nonzero_indices]
    print("Words in the resume:\n {}".format(resume_words))
    
    
    resume_df = {
        "words": resume_words,
        "tf-idf": resume_vector[nonzero_indices],
        "idf": vectorizer.idf_[nonzero_indices]
    }
    print(pd.DataFrame(resume_df).sort_values('tf-idf', ascending = True))
    
    return feature_extraction_result(csr_matrix = tfidf_matrix,
                          np_matrix = tfidf_np_matrix, 
                           features = words, params = vectorizer.get_params())

corpus_results = resume_words_df()   

754 documents and 8305 features
Words in the resume:
 ['2015', '2019', 'amounts', 'amounts data', 'analyst', 'analyzed', 'august', 'berkeley', 'ca', 'city', 'com', 'data analyst', 'data scientist', 'data using', 'education', 'good', 'language', 'language processing', 'learning', 'learning natural', 'machine', 'machine learning', 'mathematics', 'matplotlib', 'natural', 'natural language', 'pandas', 'pi', 'present', 'processing', 'processing experience', 'python', 'python matplotlib', 'python pandas', 'remote', 'scientist', 'skills', 'skills python', 'street', 'student', 'using', 'vast', 'vast amounts']
                    words    tf-idf       idf
36                 skills  0.041270  1.350074
18               learning  0.043909  1.436402
20                machine  0.045848  1.499849
21       machine learning  0.046049  1.506420
40                  using  0.050527  1.652908
22            mathematics  0.060350  1.974229
16               language  0.066639  2.179980
29             processi

In [8]:
import pickle 

with open('output/corpus_results.pickle', 'wb') as handle:
    pickle.dump(corpus_results.csr_matrix, handle, protocol = pickle.HIGHEST_PROTOCOL)
    pickle.dump(corpus_results.features, handle, protocol = pickle.HIGHEST_PROTOCOL)
    pickle.dump(corpus_results.params, handle, protocol = pickle.HIGHEST_PROTOCOL)

# Compute similarity between job postings and resume

I want to know which job posting is most similar to the resume.

1. Compute cosine similarity between `resume_vector` and `tfidf_np_matrix` with `sklearn.cosine_similarity` where larger value means higher similarity. 
3. Rank job postings from largest to smallest cosine and identify the top 10 job postings as the most similar job postings to our resume. 


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

cosines =  cosine_similarity(corpus_results.csr_matrix, corpus_results.csr_matrix[-1])
print("Cosine similarity of the document with the resume:\n {}".format(cosines))

Cosine similarity of the document with the resume:
 [[7.70543969e-02]
 [2.20957618e-02]
 [2.88940811e-02]
 [5.73653045e-02]
 [1.20006243e-03]
 [3.22765968e-02]
 [1.23749580e-02]
 [2.01166393e-02]
 [1.00612441e-02]
 [1.70852095e-02]
 [1.15410672e-02]
 [1.87172132e-02]
 [0.00000000e+00]
 [1.29208739e-02]
 [1.23197716e-02]
 [2.21875893e-02]
 [1.75037654e-02]
 [4.95756045e-03]
 [1.86904024e-02]
 [9.60793087e-03]
 [4.47603951e-02]
 [4.16510590e-02]
 [6.25255795e-03]
 [8.27446804e-02]
 [3.57619744e-02]
 [5.62390718e-03]
 [4.99634315e-02]
 [8.87546812e-02]
 [1.53551812e-02]
 [3.36359935e-03]
 [3.79537258e-02]
 [1.59438786e-02]
 [1.16182776e-02]
 [2.06866070e-02]
 [2.67607722e-02]
 [1.35904021e-02]
 [8.09604960e-03]
 [3.14733361e-02]
 [3.49645779e-02]
 [5.56740825e-02]
 [6.02485007e-03]
 [3.50948856e-02]
 [9.44247398e-03]
 [2.05166649e-02]
 [1.11890901e-02]
 [7.71451091e-02]
 [3.83244595e-02]
 [6.21776635e-03]
 [5.06255810e-03]
 [1.21123851e-02]
 [3.49067025e-02]
 [2.34075248e-02]
 [3.54094690

In [10]:
rank_descending = np.argsort(cosines, axis = 0).flatten()# rank from smallest to largest
print(rank_descending)

[194 415 304 382 741 109 508 107 277 313 226  12 406 101 547 729 250 127
 144  59 375 309 195 687   4 512  65  77 743 249 615 472 524 117 445 384
 437 443 715 202 170 593 140 674 740 260  29 646 635 557  52 466 131 376
 337 302 719 419 595 611 169 216 145 173 272 103 177  17  48 369 612 279
 357 723 353 749 585 657 262 270 332 676 110 248  25 717 598 727 438 295
 344 359  40 315  47  22 370 171 401 392 289 237 388 653 661  75 148 153
  62 133 162 348 299  53 731 293 444 543 287 330 555 113  78  66  36 163
 256 156 372 535 122 233 473 385 517 711 620 303 592  55 673 509 515 350
 474 701 112 493 128 460 142 569 640 423 563  42 476 625 105 281  19 575
 475 212 730 457  74 664   8 576  93 538 709 590  54  96 268 454 599 276
 686 355 589 616 120 253 282 690 429 614 688 424 681 286  44 531 748 441
 631 274  84 305 205  10 720 502  32 198 503 209 700  56 572 227 334 594
 203 154  49 586 522 628  14 516   6 130 667 134 403 496 627 231 240 696
 338 225 549  13 691 431 386  89 751 732 485  35 60

# Visualize cosine similarity of job postings to the resume 

In [11]:
import matplotlib.pyplot as plt

plt.figure(figsize = (8, 6))
plt.plot(cosines[rank_descending][::-1][1:])
plt.axvline(x = 100, linestyle = "--", color = "orange")
plt.title("Cosine similarity between the resume and job postings from largest to smallest")
plt.show()

<Figure size 800x600 with 1 Axes>

In [12]:
top100_most_similar_indice = rank_descending[::-1][1:101]
top100_most_similar = df.iloc[top100_most_similar_indice]
top100_most_similar['cosine_similarity_to_resume'] = cosines[top100_most_similar_indice]
print(top100_most_similar.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 352 to 366
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   html                         100 non-null    object 
 1   position                     100 non-null    object 
 2   location                     100 non-null    object 
 3   company                      15 non-null     object 
 4   jobreq                       100 non-null    object 
 5   cosine_similarity_to_resume  100 non-null    float64
dtypes: float64(1), object(5)
memory usage: 5.5+ KB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
top100_most_similar.head()

Unnamed: 0,html,position,location,company,jobreq,cosine_similarity_to_resume
352,data/html_job_postings/4b13ffbd35f56cf5_fccid....,machine learning / data scientist internship (...,"San Diego, CA",San Diego,How to build models at scale using vast amount...,0.136087
360,data/html_job_postings/17a320fb30c7c7ab_fccid....,machine learning postdoctoral fellow,"San Francisco Bay Area, CA",,Design and develop distributed machine learnin...,0.094365
491,data/html_job_postings/72435f49f9f550ef_fccid....,data analyst,"San Francisco, CA",,"Interpret data, analyze results and provide on...",0.090284
251,data/html_job_postings/ef7046b5961f249e_fccid....,data science summer institute student intern,"Livermore, CA 94550",,"Students will work with scientists, engineers ...",0.089635
27,data/html_job_postings/7ba056b7c3fbf8e7_fccid....,data scientist,"New York, NY 10011",,Conduct research on some of the world’s most i...,0.088755


In [14]:
top100_most_similar.to_csv("output/top100_most_similar.csv", index = False)