In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import gensim
from gensim import corpora, models, similarities

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords

from pymongo import MongoClient
from time import time
from collections import Counter, defaultdict
from textblob import TextBlob
import pandas as pd
import numpy as np
import operator
import re
import os
import pickle

# Load Resume

In [2]:
# Read the whole resume file, then trim leading/trailing whitespace.
with open('BryantBiggs_mod.txt', 'r') as resume_file:
    raw_resume = resume_file.read()
new_resume = raw_resume.strip()

In [3]:
# Display the resume text exactly as it was loaded from the file.
new_resume

'BRYANT J. BIGGS\n                          3134 Avalon Way           *            Bloomingdale, New Jersey            *            bryantbiggs@gmail.com\n\nEDUCATION\nPurdue University - West Lafayette, IN\nBachelor of Science - M.E.T.\nReceived - December 19, 2009\t\t\t\nProgram GPA - 3.28/4.00\n\nArizona State University - Tempe, AZ\nBachelor of Science - Software Eng.\nExpected - February, 2017\t\t\nProgram GPA - 4.00/4.00\n\t    \n\nEXPERIENCE\nDATA SCIENTIST\n         Metis - New York City, NY. June, 2016 - Current\nAn immersive 12 week data science boot camp to learn and apply concepts such as data science, data mining, supervised (SVM, decision trees, random forests, KNN) and unsupervised (NLP, KMeans) machine learning, statistical inference, Bayesian statistics, regression analysis, data visualization, and cloud computing.\nTechnologies used/taught:\n  ? Python - Scikit-learn, SciPy, NumPy, Pandas, BeautifulSoup, Selenium, Matplotlib, Flask, Django\n  ? Javascript - D3.js, jQu

# City, State Abbreviation List

In [4]:
# Load the two pickled word lists that are later appended to the stop words.
# NOTE: pickle.load can execute arbitrary code - only use trusted files here.
with open(r'pkl/cities.pkl', 'rb') as cities_file:
    cities = pickle.load(cities_file)

with open(r'pkl/abbr.pkl', 'rb') as abbr_file:
    abbr = pickle.load(abbr_file)

# Clean Text

In [5]:
# Replace punctuation / special characters with a space so that the words on
# either side remain separate tokens (e.g. "used/taught" -> "used taught").
new_resume = re.sub(r'[-/*&%+();?.,@#$^]', ' ', new_resume)

# Turn every whitespace character into a plain space *before* the
# non-alphanumeric strip below. Previously only '\n' and '\t' were converted;
# the non-breaking space '\xa0' (and '\r', '\f', '\v') were deleted outright,
# which glued the neighbouring words together into one bogus token.
# '\xa0' is listed explicitly so it is also caught when '\s' is ASCII-only.
new_resume = re.sub(r'[\s\xa0]', ' ', new_resume)

# Drop everything that is still not an ASCII letter, digit or space.
new_resume = re.sub(r'[^0-9a-zA-Z ]+', '', new_resume)

# Collapse runs of spaces left behind by the substitutions above.
new_resume = re.sub(' +', ' ', new_resume)

In [6]:
# Display the resume text after the cleaning substitutions.
new_resume

'BRYANT J BIGGS 3134 Avalon Way Bloomingdale New Jersey bryantbiggs gmail com EDUCATION Purdue University West Lafayette IN Bachelor of Science M E T Received December 19 2009 Program GPA 3 28 4 00 Arizona State University Tempe AZ Bachelor of Science Software Eng Expected February 2017 Program GPA 4 00 4 00 EXPERIENCE DATA SCIENTIST Metis New York City NY June 2016 Current An immersive 12 week data science boot camp to learn and apply concepts such as data science data mining supervised SVM decision trees random forests KNN and unsupervised NLP KMeans machine learning statistical inference Bayesian statistics regression analysis data visualization and cloud computing Technologies used taught Python Scikit learn SciPy NumPy Pandas BeautifulSoup Selenium Matplotlib Flask Django Javascript D3 js jQuery SQL and NoSQL databases PostgreSQL MongoDB Apache Hadoop MapReduce and Spark APPLICATION ENGINEER Saint Gobain Performance Plastics Wayne NJ June 2014 Current Product design for PTFE based

# Remove StopWords

In [7]:
# Build the stop-word list once up front so it is not recomputed per token:
# NLTK's English stop words plus 'tot' and the loaded city / abbreviation lists.
cachedStopWords = stopwords.words("english")
for extra_words in (['tot'], cities, abbr):
    cachedStopWords.extend(extra_words)

# Round-trip through a set to drop duplicate entries.
cachedStopWords = list(set(cachedStopWords))

In [8]:
# `cachedStopWords` is a list, so `tok not in cachedStopWords` scans the whole
# list for every token. Build a set once for constant-time membership tests
# (its entries are hashable - the list itself was produced from a set).
stopword_lookup = set(cachedStopWords)

# convert all text to lower case, split into tokens and drop the stop words
res_tokens = [tok for tok in new_resume.lower().split()
              if tok not in stopword_lookup]

# downstream cells expect the filtered resume as one space-separated string
res_text = ' '.join(res_tokens)

# number of tokens that survived the stop-word filter
print(len(res_tokens))

300


In [9]:
# Display the lower-cased resume text with stop words removed.
res_text

'j 3134 bryantbiggs gmail com education bachelor e received december 19 2009 program gpa 3 28 4 00 bachelor software eng expected february 2017 program gpa 4 00 4 00 experience data scientist metis 2016 current immersive 12 week data boot learn apply concepts data data mining supervised svm decision trees forests knn unsupervised nlp kmeans machine learning statistical inference bayesian statistics regression analysis data visualization computing technologies used taught python scikit learn scipy numpy pandas beautifulsoup selenium matplotlib flask django javascript d3 js jquery sql nosql databases postgresql mongodb hadoop mapreduce spark application engineer gobain performance plastics nj 2014 current product ptfe based bearings used primarily automotive performance bicycle applications product application development initial prototype launch within global engineering team development applications assist engineering drawing document creation automated script create based recommendati

# More Cleaning (Ugly - Fix Me)

In [10]:
# Ordered (find, replace) pairs used to repair fused/misspelled tokens and to
# fold inflected forms onto one base term. They are applied top to bottom.
# NOTE(review): str.replace works on substrings, not whole words, so e.g.
# 'managers' becomes 'manages' - kept as-is to preserve existing behaviour.
term_normalizations = [
    # fused / misspelled tokens
    ('aacademic', 'academic'),
    ('aaccounts', 'account'),
    ('aaerobic', 'aerobic'),
    ('aademy', 'academy'),
    ('aanalyst', 'analyst'),
    ('aanalyzed', 'analyst'),
    # inflected forms -> base term
    ('reports', 'report'),
    ('reporting', 'report'),
    ('engineering', 'engineer'),
    ('engineers', 'engineer'),
    ('services', 'service'),
    ('servicing', 'service'),
    ('systems', 'system'),
    ('abandoned', 'abandon'),
    ('abandoner', 'abandon'),
    ('abandoning', 'abandon'),
    ('abandonment', 'abandon'),
    ('abandons', 'abandon'),
    ('management', 'manage'),
    ('manager', 'manage'),
    ('development', 'develop'),
    ('developer', 'develop'),
]

for found_term, base_term in term_normalizations:
    res_text = res_text.replace(found_term, base_term)

# token count after normalisation, followed by the text itself
print(len(res_text.split()))
res_text

300


'j 3134 bryantbiggs gmail com education bachelor e received december 19 2009 program gpa 3 28 4 00 bachelor software eng expected february 2017 program gpa 4 00 4 00 experience data scientist metis 2016 current immersive 12 week data boot learn apply concepts data data mining supervised svm decision trees forests knn unsupervised nlp kmeans machine learning statistical inference bayesian statistics regression analysis data visualization computing technologies used taught python scikit learn scipy numpy pandas beautifulsoup selenium matplotlib flask django javascript d3 js jquery sql nosql databases postgresql mongodb hadoop mapreduce spark application engineer gobain performance plastics nj 2014 current product ptfe based bearings used primarily automotive performance bicycle applications product application develop initial prototype launch within global engineer team develop applications assist engineer drawing document creation automated script create based recommendation document cu

# Word Counts

In [11]:
def get_wordcount(text_list, min_ct=3, most_common=30, get_all=False):
    '''
    Count the tokens in text_list and keep only the frequent ones.

    text_list   => iterable of hashable tokens to count
    min_ct      => minimum number of occurrences a token needs to be kept
    most_common => number of top tokens returned when get_all is False
    get_all     => when True, return every kept token instead of the top ones

    returns => items view of (token, count) pairs if get_all is True,
               otherwise a list of (token, count) pairs, highest count first
    '''
    kept = Counter()
    for token, n_seen in Counter(text_list).items():
        # tokens seen fewer than min_ct times are dropped
        if n_seen >= min_ct:
            kept[token] = n_seen

    if get_all:
        # return all
        return kept.items()

    # limit wordcounts for visualization
    return kept.most_common(most_common)

### Extract Noun Phrases

In [12]:
# Wrap the normalised resume text in a TextBlob and pull out its noun phrases.
resume_blob = TextBlob(res_text)
res_nouns = resume_blob.noun_phrases
res_nouns

WordList(['bryantbiggs gmail com education bachelor e', 'program gpa', 'bachelor software eng', 'program gpa', 'experience data scientist metis', 'current immersive', 'week data boot', 'concepts data data', 'svm decision trees forests knn', 'nlp kmeans machine learning', 'statistical inference bayesian statistics regression analysis data visualization', 'python scikit', 'scipy numpy pandas beautifulsoup selenium matplotlib flask django javascript d3 js jquery sql nosql databases postgresql mongodb hadoop mapreduce', 'application engineer gobain performance plastics nj', 'current product ptfe', 'automotive performance bicycle applications product application', 'initial prototype launch', 'global engineer team', 'engineer drawing document creation', 'recommendation document customers material usage calculator inventor', 'script automate detail drawings macro', '3d parametric models', 'internal external tool', 'multiple access database', 'hr team review employee performance metrics career

### Merge Noun Phrases Back to Text String

In [13]:
# Merge the extracted noun phrases into a single space-separated string.
#
# `res_nouns` is rebound here from a collection of phrases to a str. Without
# the guard, running this cell a second time would iterate over that string
# character by character and produce 'b r y a n t ...'. The guard makes the
# cell safe to re-run.
if not isinstance(res_nouns, str):
    # snapshot of the individual phrases
    temp = list(res_nouns)
    # a single join replaces the repeated `+=` concatenation; unlike the old
    # loop it leaves no trailing space (irrelevant to the later .split())
    res_nouns = ' '.join(temp)
res_nouns

'bryantbiggs gmail com education bachelor e program gpa bachelor software eng program gpa experience data scientist metis current immersive week data boot concepts data data svm decision trees forests knn nlp kmeans machine learning statistical inference bayesian statistics regression analysis data visualization python scikit scipy numpy pandas beautifulsoup selenium matplotlib flask django javascript d3 js jquery sql nosql databases postgresql mongodb hadoop mapreduce application engineer gobain performance plastics nj current product ptfe automotive performance bicycle applications product application initial prototype launch global engineer team engineer drawing document creation recommendation document customers material usage calculator inventor script automate detail drawings macro 3d parametric models internal external tool multiple access database hr team review employee performance metrics career growth product engineer wyman december process models heat treatment fixtures fin

### Tokenize the documents, remove stop words and words that only appear once

In [14]:
# remove common words and tokenize
# `stoplist` used to be built but never applied; it is now actually used.
# With the NLTK English stop words already removed upstream this filter is a
# no-op, but it keeps the cell consistent with its own description.
stoplist = set('for a of the and to in'.split())
texts = [word for word in res_nouns.split() if word not in stoplist]

# count how often each token occurs
frequency = Counter(texts)

# remove words that appear only once (keep tokens seen at least twice)
texts = [token for token in texts if frequency[token] > 1]

In [15]:
# Join the filtered tokens back into one space-separated string.
str_text = ' '.join(texts)

# Load Transformation Interface

In [16]:
# Load the serialized sparse corpus (Matrix Market file) when it is present;
# otherwise only report that it is missing and leave `corpus` untouched.
corpus_path = 'pkl/resume_token.mm'
if not os.path.exists(corpus_path):
    print('Sparse matrix NOT FOUND')
else:
    corpus = corpora.MmCorpus(corpus_path)
    print("Sparse matrix LOADED as 'corpus'")

Sparse matrix LOADED as 'corpus'


In [17]:
# Load the saved token dictionary when it is present; otherwise only report
# that it is missing and leave `dictionary` untouched.
dictionary_path = 'pkl/resume_token.dict'
if not os.path.exists(dictionary_path):
    print('Tokenized dictionary NOT FOUND')
else:
    dictionary = corpora.Dictionary.load(dictionary_path)
    print("Tokenized dictionary LOADED as 'dictionary'")

Tokenized dictionary LOADED as 'dictionary'


# Load Trained Model

In [18]:
# Load the previously trained LSI model from disk.
lsi_mdl = models.LsiModel.load('pkl/lsi_mdl.lsi')

In [19]:
# Convert the resume tokens to a bag-of-words vector with the loaded dictionary.
vec_bow = dictionary.doc2bow(texts)
vec_lsi = lsi_mdl[vec_bow] # convert the query to LSI space
# Printed as a list of (topic_index, weight) pairs.
print(vec_lsi)

[(0, 1.1400808987070936), (1, 0.66462049853676375), (2, -0.086946665471172441), (3, -0.20132470490451279), (4, -0.057053573572383075), (5, -0.15715645736246531), (6, -0.053012905460766535), (7, 0.085334837310623649), (8, 0.20384899948247764), (9, -0.094462004934014243), (10, 0.33054847773367119), (11, 0.33293221685077229), (12, -0.20589040353110491), (13, 0.0022485856318101823), (14, 0.21482950705197201), (15, 0.12650552541640864), (16, -0.075179024473273307), (17, 0.054494555422042207), (18, -0.072124052126883609), (19, 0.10908108256295219), (20, 0.053431851560994788), (21, 0.077374581814657784), (22, -0.28209884396980905), (23, 0.053128462536337302), (24, 0.19514019183231066), (25, -0.13951986929400334), (26, 0.26026167235649178), (27, -0.1806338176541899), (28, -0.3651136949128761), (29, 0.12832464161078413), (30, 0.28239321318959942), (31, 0.041740281442491797), (32, -0.45030988505653258), (33, 0.14682592905643618), (34, -0.11707318057360952), (35, 0.16515846290020197), (36, 0.0762

In [20]:
# Build a similarity index over the whole corpus projected through the LSI model.
index = similarities.MatrixSimilarity(lsi_mdl[corpus])

In [23]:
# score the resume's LSI vector against every document in the index
sims = index[vec_lsi]

# pair each score with its position -> (document_number, document_similarity)
# and order the pairs so the most similar documents come first
sim_lst = sorted(enumerate(sims), key=operator.itemgetter(1), reverse=True)

In [24]:
# Show five (document_number, similarity) pairs from the ranked list.
# NOTE(review): the slice starts at 1, so the highest-ranked document
# (sim_lst[0]) is skipped - presumably because it is the resume itself or a
# near-duplicate of it; confirm.
sim_lst[1:6]

[(14945, 0.80436462),
 (8007, 0.80401808),
 (9004, 0.80005157),
 (13984, 0.79582334),
 (3344, 0.77749413)]

In [25]:
# Load the pickled DataFrame that holds the text of the corpus resumes.
# NOTE: pd.read_pickle can execute arbitrary code - only load trusted files.
df = pd.read_pickle('pkl/df_stop_noun.pkl')

# Take the document number from the ranking instead of hard-coding the value
# copied from a previous run's output, which goes stale as soon as the resume,
# the model or the corpus changes. sim_lst[1] is the first entry shown by
# sim_lst[1:6].
# NOTE(review): assumes DataFrame row positions line up with corpus document
# numbers, as the original hard-coded lookup did - confirm.
top_doc_id = sim_lst[1][0]
df.iloc[top_doc_id]['resume_nouns']

'schlenker data scientist course director logit data detail data scientist experience r python current statistics machine learning packages previous experience mathematics engineer insurance value data analysis data scientist course director logit data januari direct class students course week bootcamp learning algorithms data weeks lecture material linear algebra probability statistics important methods machine learning students examples python code independent data projects data scientist nyc data september r python current algorithms software packages data scientists beautiful soup analyze data b h photo video website shiny app visualize data analysis scorecard dataset teaching assistant augustus semester calculus traditional classroom learning mathematica software curriculum mathematica calculus lab exercises mathematics mei bachelor mathematics juni vaardigheden python numpy scipy pandas matlab rstudio sql scikit linux matlab matplotlib java mathematica aanvullende informatie tech