In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from pymongo import MongoClient
from time import time
from collections import Counter
from textblob import TextBlob
import pandas as pd
import numpy as np
import re
import os
import pickle

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# default plot styling changes
import seaborn as sns
sns.set_style("white")
sns.set_context("poster", font_scale=1.25, rc={"lines.linewidth": 2.5})
sns.set_palette("Set2")
colors = sns.color_palette('Set2',12)

# Credentials

In [2]:
# Credentials are kept outside the notebook in a plain-text file holding one
# ', '-separated record per line:
#   line 1: email, indeed_pw
#   line 2: username, pia_pw
#   line 3: pub_ip, mongo_usr, mongo_usr_pw
pw_file = 'credentials/pw.txt'
# NOTE: when the file is missing this cell does nothing at all, so none of the
# names listed above are defined afterwards.
if os.path.exists(pw_file): 
    with open(pw_file, 'r') as f:
        email, indeed_pw = f.readline().strip().split(', ')
        username, pia_pw = f.readline().strip().split(', ')
        pub_ip, mongo_usr, mongo_usr_pw = f.readline().strip().split(', ')

# Connect to DB

In [3]:
# connect to ec2 mongo client
client = MongoClient('{0}:27017'.format(pub_ip))

In [4]:
# get reference to  resume_db
db = client.resume_db

In [5]:
# authenticate user for database
db.authenticate(mongo_usr, mongo_usr_pw)

True

# Pull MongoDB into Dataframe

In [6]:
def read_mongo(db, collection, query=None, no_id=True):
    '''
    Run find() against one collection and return the matches as a DataFrame.

    db: mongodb already connected and authenticated
    collection: name of the desired collection in db
    query: query filter handed to find(); None (the default) matches everything
    no_id: drop mongo's _id column (True) or keep it (False)
    return => pandas dataframe (empty when nothing matched)
    '''
    # None stands in for "no filter" so that no mutable {} default is shared
    # between calls
    if query is None:
        query = {}

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))

    # Delete the _id; an empty result has no columns at all, so only drop the
    # column when it is actually there instead of raising KeyError
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

In [7]:
t_start = time()

# load database data into dataframe
df = read_mongo(db, 'originals')

print('Time to load data: {0}s'.format(time() - t_start))

Time to load data: 7.36710000038147s


# City, State Abbreviation List

In [8]:
# Unpickle the city list stored next to the notebook and report its size
with open(r'pkl/cities.pkl', 'rb') as infile:
    cities = pickle.load(infile)

print(len(cities))

10066


In [9]:
# Unpickle the state-abbreviation list stored next to the notebook and report its size
with open(r'pkl/abbr.pkl', 'rb') as infile:
    abbr = pickle.load(infile)

print(len(abbr))

50


# Second Pass - Clean Text

In [10]:
# Keep only the raw text, as an independent copy: adding a column to a bare
# column selection of the loaded frame triggers pandas' SettingWithCopyWarning.
df = df[['resume_text']].copy()

# Strip every run of characters that is not an ASCII letter, digit or space.
# regex=True is spelled out because the pattern is a regular expression and
# pandas >= 2.0 would otherwise treat it as a literal string and strip nothing.
df['resume_stopped'] = df['resume_text'].str.replace(r'[^0-9a-zA-Z ]+', '', regex=True)
df.head(3)

Unnamed: 0,resume_text,resume_stopped
0,"Petros Gazazyan North Hollywood, CA Werkervari...",Petros Gazazyan North Hollywood CA Werkervarin...
1,"Travis London Java Software Engineer Tucson, A...",Travis London Java Software Engineer Tucson AZ...
2,"Stephen A. Kraft Mechanical Engineer Seattle, ...",Stephen A Kraft Mechanical Engineer Seattle WA...


# Remove StopWords

In [13]:
# Build the stop-word list once up front: NLTK's english list, the extra token
# 'tot', and the city / state-abbreviation lists loaded above.
cachedStopWords = stopwords.words("english")
for extra_words in (['tot'], cities, abbr):
    cachedStopWords.extend(extra_words)

# drop duplicates (the order of the resulting list is not meaningful)
cachedStopWords = list(set(cachedStopWords))

In [14]:
# convert all text to lower case and separate into list
df['resume_stopped'] = df['resume_stopped'].str.lower().str.split()

# remove stopwords
# Testing membership against the stop-word *list* scans it for every token;
# a set gives constant-time lookups and the same result.
stop_set = set(cachedStopWords)
df['resume_stopped'] = df['resume_stopped'].apply(
    lambda tokens: ' '.join([tok for tok in tokens if tok not in stop_set]))

# Save Dataframe to Pickle

In [15]:
df.to_pickle('pkl/df_stop.pkl')
df = None

# Read Dataframe from Pickle (RESTART)

In [16]:
df = pd.read_pickle('pkl/df_stop.pkl')
df.head(3)

Unnamed: 0,resume_text,resume_stopped
0,"Petros Gazazyan North Hollywood, CA Werkervari...",petros gazazyan werkervaring engineer structur...
1,"Travis London Java Software Engineer Tucson, A...",java software engineer bereid overal naartoe t...
2,"Stephen A. Kraft Mechanical Engineer Seattle, ...",kraft mechanical engineer bereid overal naarto...


# Total Word Count

In [17]:
text_ct = ' '.join(df['resume_text'].tolist()).split()
len(text_ct)

12974248

In [18]:
stop_ct = ' '.join(df['resume_stopped'].tolist()).split()
len(stop_ct)

8630319

In [125]:
# Hand-picked normalisations for the stop-word-filtered text, applied one after
# the other in the order listed. Each entry is (text to find, replacement).
# NOTE(review): matching is on substrings, not whole words, so for example
# 'managers' ends up as 'manages' - confirm that is acceptable.
normalisations = [
    # typos with a doubled leading 'a'
    ('aacademic', 'academic'),
    ('aaccounts', 'account'),
    ('aaerobic', 'aerobic'),
    ('aademy', 'academy'),
    ('aanalyst', 'analyst'),
    ('aanalyzed', 'analyst'),
    # collapse word families onto one form
    ('reports', 'report'),
    ('reporting', 'report'),
    ('engineering', 'engineer'),
    ('engineers', 'engineer'),
    ('services', 'service'),
    ('servicing', 'service'),
    ('systems', 'system'),
    ('abandoned', 'abandon'),
    ('abandoner', 'abandon'),
    ('abandoning', 'abandon'),
    ('abandonment', 'abandon'),
    ('abandons', 'abandon'),
    ('management', 'manage'),
    ('manager', 'manage'),
    ('development', 'develop'),
    ('developer', 'develop'),
]

for old, new in normalisations:
    df['resume_stopped'] = df['resume_stopped'].str.replace(old, new)

In [126]:
_ = '''
from nltk.corpus import words

unique_wrds = sorted(set([x for x in stop_ct if x.isalpha()]))
print(len(unique_wrds))
non_en = []

for wd in unique_wrds[:200]:
    if wd not in words.words():
        non_en.append(wd)
'''

# Stems (RESTART)

In [127]:
# Porter stems are cached on disk: compute them only when no pickle exists yet
if not os.path.isfile('pkl/port_stem.pkl'):
    # no cache: stem every word of the stop-word-filtered resumes
    text = ' '.join(df['resume_stopped'].tolist())
    stemmer = PorterStemmer()
    port_stem = [stemmer.stem(word) for word in TextBlob(text).words]
else:
    # cache hit: reuse the previously pickled list
    with open(r'pkl/port_stem.pkl', 'rb') as infile:
        port_stem = pickle.load(infile)

In [128]:
# Lancaster stems are cached on disk: compute them only when no pickle exists yet
if not os.path.isfile('pkl/lanc_stem.pkl'):
    # no cache: stem every word of the stop-word-filtered resumes
    text = ' '.join(df['resume_stopped'].tolist())
    stemmer = LancasterStemmer()
    lanc_stem = [stemmer.stem(word) for word in TextBlob(text).words]
else:
    # cache hit: reuse the previously pickled list
    with open(r'pkl/lanc_stem.pkl', 'rb') as infile:
        lanc_stem = pickle.load(infile)

In [129]:
print(len(set(port_stem)))
print(len(set(lanc_stem)))

126305
115243


### Pickle Stemmed Words

In [130]:
def save_pkl(data, filename):
    '''
    Pickle data to "<filename>.pkl".

    data: any picklable object
    filename: target path without the .pkl extension
    '''
    target = '{0}.pkl'.format(filename)
    with open(target, 'wb') as out:
        pickle.dump(data, out)

In [131]:
save_pkl(port_stem, 'pkl/port_stem')
save_pkl(lanc_stem, 'pkl/lanc_stem')

# Word Counts

In [132]:
def get_wordcount(text_list, min_ct=3, most_common=30, get_all=False):
    '''
    Count tokens and keep only those seen at least min_ct times.

    text_list: iterable of hashable tokens (e.g. words)
    min_ct: minimum number of occurrences a token needs in order to be kept
    most_common: number of top tokens returned when get_all is False
    get_all: return every kept token (True) instead of only the top ones
    returns => list of (token, count) tuples; sorted by descending count when
               get_all is False, in Counter order when get_all is True
    '''
    # drop tokens that occur fewer than min_ct times
    word_count = Counter(
        {k: v for k, v in Counter(text_list).items() if v >= min_ct})

    if get_all:
        # a real list rather than a dict view: it can be indexed and pickled,
        # and matches the type returned by the branch below
        return list(word_count.items())

    # limit wordcounts for visualization
    return word_count.most_common(most_common)

### Porter Stemmed Word Count

In [133]:
wordct_port_stem = get_wordcount(port_stem, 3, get_all=True)

### Lancaster Stemmed Word Count

In [134]:
wordct_lanc_stem = get_wordcount(lanc_stem, 3, get_all=True)

### Un-Stemmed Word Count

In [135]:
# split() without an argument drops the empty-string tokens that split(' ')
# emits wherever a resume is empty after filtering (joining produces double
# spaces there), so '' is never counted as a word
txt = ' '.join(df['resume_stopped']).split()
wordct = get_wordcount(txt, 3, get_all=True)

### Get Labels, Counts of Word Counts

In [136]:
def label_count(word_count):
    '''
    Split (label, count) pairs into two parallel lists.

    word_count: iterable of (label, count) tuples
    returns => (labels, counts) as a tuple of two lists
    '''
    label, count = [], []
    # single pass, so one-shot iterators (generators, zip objects, ...) work
    # too instead of leaving the second list empty
    for lbl, ct in word_count:
        label.append(lbl)
        count.append(ct)
    return (label, count)

# Word Count Plots

In [137]:
def plot_bar(data_tup, title, file_name, x_max=180000):
    '''
    Save a horizontal bar chart of term frequencies as data/pics/<file_name>.png.

    data_tup: iterable of (term, count) tuples
    title: text placed in front of "Word/Term Frequency" in the chart title
    file_name: name of the png file, without extension
    x_max: value of the last x tick; 13 ticks are spread evenly over 0..x_max
    return => None (the figure is written to disk and closed)
    '''
    # make figure
    fig = plt.figure(figsize=(20, 12))
    ax = fig.add_subplot(111)

    ########## DATA ##############
    lbl, ct = label_count(data_tup)
    ##############################

    # one colour per bar
    bar_colors = sns.color_palette("BrBG", len(lbl))

    # plots
    y_pos = np.arange(len(lbl))
    ax.barh(y_pos, ct, align='center', color=bar_colors, edgecolor=bar_colors)

    ax.set_ylim(-0.5, len(lbl))

    # labels/titles
    # (no legend: the bars carry no labels, so a legend call would only warn)
    ax.set_title('{0} Word/Term Frequency'.format(title))
    ax.set_xlabel('Word/Term Count')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(lbl)
    ax.set_ylabel('Word/Term')
    ax.set_xticks(np.linspace(0, x_max, 13))

    # remove border
    ax.spines["top"].set_visible(False)
    ax.spines["bottom"].set_alpha(0.2)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_alpha(0.2)

    # write the image, creating the output folder on first use
    out_dir = 'data/pics'
    os.makedirs(out_dir, exist_ok=True)
    fig.savefig('{0}/{1}.png'.format(out_dir, file_name), bbox_inches='tight')
    plt.close(fig)

    return None

### Save as Images

In [138]:
wordct_port_stem_plt = get_wordcount(port_stem, 3, 30)
wordct_lanc_stem_plt = get_wordcount(lanc_stem, 3, 30)
wordct_plt = get_wordcount(txt, 3, 30)


plot_bar(wordct_port_stem_plt, 'Porter Stem', 'porter_bar')
plot_bar(wordct_lanc_stem_plt, 'Lancaster Stem', 'lancaster_bar')
plot_bar(wordct_plt, 'Non-Stemmed', 'non-stem_bar')



# Nouns

### Extract Noun Phrases

In [139]:
def nouns(x):
    '''Return the noun phrases TextBlob extracts from the string x.'''
    return TextBlob(x).noun_phrases

# one collection of noun phrases per resume, taken from the filtered text
df['resume_nouns'] = df['resume_stopped'].apply(nouns)

### Merge Noun Phrases Back to Text String

In [140]:
def lst_to_str(x):
    '''Join an iterable of strings into one space-separated string.'''
    return ' '.join(x)

# flatten each resume's noun phrases back into a single text string
df['resume_nouns'] = df['resume_nouns'].apply(lst_to_str)

# Save Dataframe to Pickle

In [142]:
df.to_pickle('pkl/df_stop_noun.pkl')
df = None

# Read Dataframe from Pickle (RESTART)

In [143]:
df = pd.read_pickle('pkl/df_stop_noun.pkl')
df.head(3)

Unnamed: 0,resume_text,resume_stopped,resume_nouns
0,"Petros Gazazyan North Hollywood, CA Werkervari...",petros gazazyan werkervaring engineer structur...,petros gazazyan engineer structural ttg engine...
1,"Travis London Java Software Engineer Tucson, A...",java software engineer bereid overal naartoe t...,java software engineer bereid overal naartoe t...
2,"Stephen A. Kraft Mechanical Engineer Seattle, ...",kraft mechanical engineer bereid overal naarto...,mechanical engineer bereid overal naartoe te v...


### Noun Phrased Word Count

In [144]:
# split() without an argument drops the empty-string tokens that split(' ')
# emits wherever a resume has no noun phrases (joining produces double spaces
# there), so '' is never counted as a word
noun_txt = ' '.join(df['resume_nouns']).split()
wordct_noun = get_wordcount(noun_txt, 2, get_all=True)

### Output Frequency Chart of Noun Phrases

In [145]:
wordct_noun_plt = get_wordcount(noun_txt, 2, 30)
plot_bar(wordct_noun_plt, 'Noun Phrases', 'noun_bar')



## Counter (Tuple) to Label List

In [146]:
def count_lbl_lst(cntr):
    '''
    Return only the labels of an iterable of (label, count) pairs.

    cntr: iterable of 2-tuples
    returns => list with the first element of every pair, in input order
    '''
    labels = []
    for label, _count in cntr:
        labels.append(label)
    return labels

# N-Grams Count Vectorizer

In [147]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

In [148]:
t_start = time()

# convert resume texts to a sparse matrix of token counts
ct_vect = CountVectorizer(ngram_range=(1, 4), max_df=0.90, min_df=2, max_features=n_features, stop_words='english')
ct_vect_prep = ct_vect.fit_transform(df['resume_nouns'])

print('Time: {0:.4}s'.format(time() - t_start))

Time: 99.49s


# Latent Dirichlet Allocation

In [149]:
# Fit an online LDA topic model on the n-gram count matrix.
# n_components is the current name of the topic-count argument; the older
# n_topics spelling was deprecated in scikit-learn 0.19 and later removed.
lda_mdl = LatentDirichletAllocation(n_components=n_topics, max_iter=5, learning_method='online',
                                    learning_offset=50., random_state=0)

t_start = time()

lda_mdl.fit(ct_vect_prep)

print('Time: {0:.4}s'.format(time() - t_start))

Time: 45.8s


In [150]:
save_pkl(lda_mdl, 'pkl/lda_mdl')

In [151]:
# vocabulary learned by the count vectorizer: these are the n-gram features
# the LDA model was fitted on, not the topics themselves
# NOTE(review): newer scikit-learn releases expose this as
# get_feature_names_out() - confirm against the installed version
feat_names = ct_vect.get_feature_names()

# size of the vocabulary plus its first and last ten entries
print('Length of feature set: {0}'.format(len(feat_names)))
print('Start of list: ' + ', '.join(feat_names[:10]))
print('End of list: ' + ', '.join(feat_names[-10:]))

Length of feature set: 1000
Start of list: 10g, 11g, 9i, aanvullende, aanvullende informatie, aanvullende informatie skills, aanvullende informatie technical, aanvullende informatie technical skills, ability, able
End of list: workflow, workflows, works, worldwide, wwwlinkedincom, xml, xp, year, years, years experience


# Get Top Words in Topics

In [152]:
def print_top_words(model, feature_names, top_words):
    '''
    Print the highest-weighted features of every topic of a fitted model.

    model: object exposing components_ (one row of feature weights per topic)
    feature_names: sequence mapping a feature index to its name
    top_words: number of features listed per topic
    '''
    for topic_idx, weights in enumerate(model.components_):
        print("Topic {0}:".format(topic_idx))
        # indices by ascending weight, flipped to descending, cut to the top ones
        ranked = weights.argsort()[::-1][:top_words]
        for feature_idx in ranked:
            print('\t{0}'.format(feature_names[feature_idx]))
    print()

In [153]:
print_top_words(lda_mdl, feat_names, 10)

Topic 0:
	data
	software
	java
	web
	engineer
	python
	hadoop
	application
	analysis
	technologies
Topic 1:
	project
	test
	manage
	requirements
	team
	quality
	process
	technical
	report
	software
Topic 2:
	manage
	analysis
	sales
	report
	team
	service
	project
	product
	senior
	process
Topic 3:
	engineer
	network
	support
	manage
	security
	software
	technical
	service
	hardware
	maintenance
Topic 4:
	analysis
	sas
	web
	report
	google
	data
	digital
	customer
	manage
	online
Topic 5:
	data
	service
	customer
	skills
	entry
	office
	customer service
	data entry
	manage
	information
Topic 6:
	data
	analysis
	report
	analyst
	etl
	data analysis
	sql
	bi
	intelligence
	data analyst
Topic 7:
	data
	hadoop
	hive
	java
	experience
	pig
	hdfs
	web
	application
	service
Topic 8:
	sql
	server
	database
	sql server
	report
	data
	manage
	environment
	service
	application
Topic 9:
	analysis
	scientist
	data
	laboratory
	skills
	clinical
	cell
	engineer
	lab
	te



# TF-IDF

In [154]:
from sklearn.feature_extraction.text import TfidfVectorizer

t_start = time()

tfidf_vec = TfidfVectorizer(input='content', ngram_range=(1, 3), max_df=0.9, min_df=2, 
                max_features=n_features, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

tfidf_vec_prep = tfidf_vec.fit_transform(df['resume_nouns'])

print('Time: {0:.4}s'.format(time() - t_start))

Time: 68.08s


In [155]:
# Fit a second online LDA topic model, this time on the tf-idf matrix; it
# replaces the count-based model held in lda_mdl.
# n_components is the current name of the topic-count argument; the older
# n_topics spelling was deprecated in scikit-learn 0.19 and later removed.
# NOTE(review): LDA is formulated for term counts, while this input holds
# tf-idf weights - confirm that is intentional.
lda_mdl = LatentDirichletAllocation(n_components=n_topics, max_iter=5, learning_method='online',
                                    learning_offset=50., random_state=0)

t_start = time()

lda_mdl.fit(tfidf_vec_prep)

print('Time: {0:.4}s'.format(time() - t_start))

Time: 19.28s


In [156]:
save_pkl(lda_mdl, 'pkl/lda_mdl_tfidf')

In [157]:
# vocabulary learned by the tf-idf vectorizer: these are the n-gram features
# the LDA model was fitted on, not the topics themselves
# NOTE(review): newer scikit-learn releases expose this as
# get_feature_names_out() - confirm against the installed version
feat_names = tfidf_vec.get_feature_names()

# first and last ten entries of the vocabulary
print('Start of list: ' + ', '.join(feat_names[:10]))
print('End of list: ' + ', '.join(feat_names[-10:]))

Start of list: 10g, 11g, 9i, aanvullende, aanvullende informatie, aanvullende informatie skills, aanvullende informatie technical, ability, able, academic
End of list: workflow, workflows, works, worldwide, wwwlinkedincom, xml, xp, year, years, years experience


# Get Top Words in Topics

In [158]:
print_top_words(lda_mdl, feat_names, 10)

Topic 0:
	engineer
	sales
	electrical
	applications
	product
	scenarios
	senior
	designs
	commercial
	release
Topic 1:
	analysis
	manage
	data
	report
	sales
	project
	analyst
	team
	product
	system
Topic 2:
	scientist
	laboratory
	data
	analysis
	cell
	chemistry
	lab
	environmental
	engineer
	clinical
Topic 3:
	engineer
	network
	maintenance
	system
	electrical
	experience
	project
	mechanical
	manage
	repair
Topic 4:
	engineer
	test
	equipment
	mechanical
	system
	quality
	analysis
	validation
	machine
	production
Topic 5:
	engineer
	equipment
	electrical
	network
	mechanical
	technician
	maintenance
	system
	repair
	installation
Topic 6:
	engineer
	electronics
	fund
	system
	project
	experience
	maintenance
	computer
	test
	communication
Topic 7:
	hadoop
	hive
	pig
	java
	hdfs
	hbase
	sqoop
	mapreduce
	data
	oozie
Topic 8:
	data
	sql
	system
	server
	manage
	database
	project
	software
	analysis
	report
Topic 9:
	entry
	data entry
	data
	customer
	service
	customer service
	skills
	