# Vector Spaces

In [67]:
import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import gensim
from gensim import corpora, models, similarities
from nltk.corpus import stopwords

from collections import defaultdict
from pprint import pprint
from six import iteritems
import os

import numpy as np
import pandas as pd
import scipy.sparse

## Load Processed Dataframe

In [68]:
df = pd.read_json('data/md_contents.json')
df.head()

Unnamed: 0,file_contents
0,What is this Python project Describe features ...
1,Contributing Your contributions are always wel...
2,Awesome Python Awesome A curated list of aweso...
3,Contribution Guidelines Before opening any iss...
4,Contributor Code of Conduct As contributors an...


## Convert Series to List of Strings

In [69]:
contents = df['file_contents'].tolist()
contents[:1]

['What is this Python project Describe features What s the difference between this Python project and similar ones Enumerate comparisons Anyone who agrees with this pull request could vote for it by adding a to it and usually the maintainer will merge it when votes reach']

# From Strings to Vectors

### Tokenize the documents, remove stop words and words that only appear once

In [194]:
# remove common words and tokenize
stoplist = set(stopwords.words('english'))

texts = [[word.lower() for word in content.split()if word.lower() not in stoplist] for content in contents]

# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# remove words that occur less than n times
texts = [[token for token in text if frequency[token] > 3] for text in texts]
len(texts)

6283

### Save Token Count Dictionary to File

In [195]:
dictionary = corpora.Dictionary(texts)

# store the dictionary, for future reference
dictionary.save('data/text_token.dict')
print(dictionary)

Dictionary(24712 unique tokens: ['connector', 'mattdesl', 'hdf', 'codrops', 'pgdata']...)


### Convert Tokenized Resumes to Vectors

In [196]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('data/text_token.mm', corpus)  # store to disk, for later use
for c in corpus[:1]:
    print(c)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)]


# Transformation Interface

In [197]:
# load tokenized dictionary
if (os.path.exists('data/text_token.dict')):
    dictionary = corpora.Dictionary.load('data/text_token.dict')
    print('Tokenized dictionary LOADED as \'dictionary\'')
else:
    print('Tokenized dictionary NOT FOUND')

Tokenized dictionary LOADED as 'dictionary'


In [198]:
# load sparse vector matrix
if (os.path.exists('data/text_token.mm')):
    corpus = corpora.MmCorpus('data/text_token.mm')
    print('Sparse matrix LOADED as \'corpus\'')
else:
    print('Sparse matrix NOT FOUND')

Sparse matrix LOADED as 'corpus'


### TF-IDF Transformation

In [201]:
# step 1 -- initialize a model
tfidf_mdl = models.TfidfModel(corpus) 

Calling `model[corpus]` only creates a wrapper around the old corpus document stream – actual conversions are done on-the-fly, during document iteration. We cannot convert the entire corpus at the time of calling corpus_transformed = model[corpus], because that would mean storing the result in main memory, and that contradicts gensim’s objective of memory-indepedence. If you will be iterating over the transformed corpus_transformed multiple times, and the transformation is costly, serialize the resulting corpus to disk first and continue using that.

In [202]:
# step 2 -- use the model to transform vectors
corpus_tfidf = tfidf_mdl[corpus]
print(len(corpus_tfidf))

# view one resume
for doc in corpus_tfidf[:1]:
    print(doc)

6283
[(0, 0.18246551926719032), (1, 0.2270398016185427), (2, 0.19115802241083493), (3, 0.32424049625629037), (4, 0.2890335590555953), (5, 0.3215947393368704), (6, 0.19924725453495884), (7, 0.11147295903409228), (8, 0.39548953276357063), (9, 0.1313466860729142), (10, 0.21634460975497669), (11, 0.27052564249680583), (12, 0.14138878072706235), (13, 0.11714476712623356), (14, 0.1413094453050044), (15, 0.14304380102152006), (16, 0.15635387384423827), (17, 0.20734585403870268), (18, 0.17836280430264478), (19, 0.22853002270124395), (20, 0.12297155708721454)]


In [209]:
from sklearn.feature_extraction.text import TfidfVectorizer
n_features = 1500

tfidf_vec = TfidfVectorizer(input='content', ngram_range=(1, 3), max_df=0.85, min_df=0.05, 
                max_features=n_features, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

tfidf_vec_prep = tfidf_vec.fit_transform(resumes)

In [210]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1, n_jobs=-1)

km_mdl = km.fit_predict(tfidf_vec_prep)

In [211]:
len(km_mdl)

6375

In [212]:
# Determine your k range
k_range = range(1,20)

# fit the kmeans model for each n_clusters = k
k_means_var = [KMeans(n_clusters=k).fit(tfidf_vec_prep) for k in k_range]

# pull out the cluster centers for each model
centroids = [X.cluster_centers_ for X in k_means_var]

In [213]:
from scipy.spatial.distance import cdist, pdist

# calculate the euclidean distance from each point to each cluster center
k_euclid = [cdist(tfidf_vec_prep.toarray(), cent, 'euclidean') for cent in centroids]
dist = [np.min(ke, axis=1) for ke in k_euclid]

# total within-cluster sum of squares
wcss = [sum(d**2) for d in dist]

# the total sum of squares
tss = sum(pdist(tfidf_vec_prep.toarray())**2)/tfidf_vec_prep.shape[1]

# the between-cluster sum of squares
bss = tss - wcss

In [214]:
import seaborn as sns
sns.set_style("white")
sns.set_context("poster", font_scale=1.25, rc={"lines.linewidth": 2.5})
sns.set_palette("Set2")
colors = sns.color_palette("BrBG", 5)

In [215]:
# make figure
fig = plt.figure(figsize=(20,12))
ax = fig.add_subplot(111)

# color
colors = sns.color_palette("BrBG", 5)

# plots
ax.plot(K, avgWithinSS, marker='o', color=colors[-1], alpha=0.5)

# labels/titles
plt.legend(loc="best")
plt.title('Elbow for K-Means')
plt.xlabel('Number of Clusters')
plt.ylabel('Avg. Within-Cluster Sum of Squares')

# remove border
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)

# show grid
ax.xaxis.grid(True, alpha=0.2) 
ax.yaxis.grid(True, alpha=0.2) 

# plot that biddy
plt.savefig('data/{0}.png'.format('KMeans_elbow'), bbox_inches='tight')
plt.close(fig)



In [216]:
import numpy as np
from scipy.cluster.vq import kmeans
from scipy.spatial.distance import cdist,pdist
from sklearn import datasets
from sklearn.decomposition import RandomizedPCA
from matplotlib import pyplot as plt
from matplotlib import cm

# perform PCA dimensionality reduction
pca = RandomizedPCA(n_components=2).fit(tfidf_vec_prep.toarray())
X = pca.transform(tfidf_vec_prep.toarray())

##### cluster data into K=1..20 clusters #####
K_MAX = 20
KK = range(1,K_MAX+1)

KM = [kmeans(X,k) for k in KK]
centroids = [cent for (cent,var) in KM]
D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D,axis=1) for D in D_k]
dist = [np.min(D,axis=1) for D in D_k]

tot_withinss = [sum(d**2) for d in dist]  # Total within-cluster sum of squares
totss = sum(pdist(X)**2)/X.shape[0]       # The total sum of squares
betweenss = totss - tot_withinss          # The between-cluster sum of squares

##### plots #####
kIdx = 4        # K=10
clr = cm.spectral( np.linspace(0,1,10) ).tolist()
mrk = 'os^p<dvh8>+x.'

In [217]:
# make figure
fig = plt.figure(figsize=(20,12))
ax = fig.add_subplot(111)

# color
colors = sns.color_palette("BrBG", 5)

# plots
ax.plot(KK, betweenss/totss*100, marker='o', color=colors[-1], alpha=0.5)
ax.plot(KK[kIdx], betweenss[kIdx]/totss*100, marker='o', markersize=25, color=colors[0], alpha=0.5)

# labels/titles
plt.legend(loc="best")
plt.title('Elbow for KMeans Clustering')
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of variance explained (%)')

ax.set_xlim((-0.1,20.5))
ax.set_ylim((-0.5,100))

# remove border
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)

# show grid
ax.xaxis.grid(True, alpha=0.2) 
ax.yaxis.grid(True, alpha=0.2) 

# plot that biddy
plt.savefig('data/{0}.png'.format('KMeans_elbow_var'), bbox_inches='tight')
plt.close(fig)



In [218]:
# make figure
fig = plt.figure(figsize=(20,12))
ax = fig.add_subplot(111)

# plots
for i in range(kIdx+1):
    ind = (cIdx[kIdx]==i)
    ax.scatter(X[ind,0],X[ind,1], s=65, c=colors[i], marker=mrk[i], 
               label='Cluster {0}'.format(i), alpha=1)

# labels/titles
plt.legend(loc='upper right')
plt.title('K={0} Clusters'.format(KK[kIdx]))

#ax.set_xlim((-.5,.5))
#ax.set_ylim((-.3,.81))

# remove border
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)

# show grid
ax.xaxis.grid(True, alpha=0.2) 
ax.yaxis.grid(True, alpha=0.2) 

# plot that biddy
plt.savefig('data/{0}.png'.format('KMeans_{0}_clusters'.format(KK[kIdx])), bbox_inches='tight')
plt.close(fig)

# Latent Semantic Indexing Topics

In [62]:
num_topics = 100

# initialize an LSI transformation
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
corpus_lsi = lsi[corpus_tfidf]

In [63]:
# the topics are printed to log
a = lsi.print_topics(8)
a[0]

(0,
 '0.857*"|" + 0.187*"docker" + 0.078*"container" + 0.075*"metrics" + 0.071*"=" + 0.063*">>>" + 0.054*"<" + 0.054*">" + 0.054*"false" + 0.052*"<br"')

In [19]:
for doc in corpus_lsi[800]: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    pass
    #print(doc)

# Model Save & Load

In [20]:
lsi.save('pkl/lsi_mdl.lsi')
lsi = models.LsiModel.load('pkl/lsi_mdl.lsi')

# LDA Topics

In [64]:
lda_mdl = models.LdaModel(corpus, id2word=dictionary, num_topics=20)

In [65]:
lda_mdl.top_topics
pprint(lda_mdl.print_topics(10))

[(9,
  '0.061*brooklyn + 0.015*the + 0.009*apache + 0.007*file + 0.007*license + '
  '0.006*= + 0.006*data + 0.005*example + 0.005*this + 0.005*use'),
 (6,
  '0.230*| + 0.065*aurora + 0.028*> + 0.022*<br + 0.017*f + 0.007*mesos + '
  '0.007*s + 0.006*orderedlist + 0.005*users + 0.004*scheduler'),
 (3,
  '0.014*git + 0.009*apache + 0.008*task + 0.007*branch + 0.007*new + '
  '0.007*code + 0.007*project + 0.007*the + 0.006*create + 0.006*if'),
 (18,
  '0.068*% + 0.009*data + 0.007*the + 0.007*use + 0.007*> + 0.006*using + '
  '0.005*site + 0.005*$ + 0.005*html + 0.005*this'),
 (4,
  '0.023*> + 0.017*br + 0.011*% + 0.009*= + 0.008*dec + 0.008*web + 0.007*py + '
  '0.007*mysql + 0.007*bind + 0.007*nginx'),
 (14,
  '0.016*– + 0.012*false + 0.011*default + 0.010*the + 0.010*| + 0.009*server '
  '+ 0.008*type + 0.008*docker + 0.008*example + 0.007*='),
 (15,
  '0.087*< + 0.072*td> + 0.030*tr> + 0.026*<tr> + 0.017*<td> + 0.013*= + '
  '0.010*> + 0.009*the + 0.007*<td><a + 0.007*class='),
 (19,

In [116]:
print(corpus)

MmCorpus(17049 documents, 42606 features, 3131686 non-zero entries)


In [117]:
doc = df.iloc[0]['resume_nouns']
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space
print(vec_lsi)

[(0, 2.374975010869965), (1, 0.51728887522253952), (2, -0.058935199530753268), (3, 0.3578493537974749), (4, 1.560417648600577), (5, -1.9931029846659232), (6, 0.58697139609914861), (7, 1.437193124041608), (8, -0.38633595032575146), (9, -2.3068352804125016), (10, 0.77482570234627612), (11, -0.66082521176920128), (12, -2.0221618401059822), (13, 1.3229424544863675), (14, -0.29408524037515837), (15, -1.0569710323996966), (16, 1.110889840043604), (17, 1.3434022602282594), (18, -0.095802335904933394), (19, -0.80089048085959047), (20, -0.64832039201675884), (21, 1.35059095621303), (22, 0.36313071163680766), (23, 0.23008512654094881), (24, -1.4704302056681957), (25, -0.51110545886820391), (26, 1.5065962351771218), (27, -0.85864630999976976), (28, -0.27005311330166226), (29, 1.3357001963834654), (30, 0.11920370036201439), (31, 0.20935482520268536), (32, 0.58140672694418549), (33, 0.86476990150558442), (34, 0.21906262257842274), (35, 1.2623527033747142), (36, 0.47122700487966684), (37, 0.14754992

# Cosine Similarity

In [118]:
index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it

In [120]:
index.save('pkl/resume_stopped.index')
index = similarities.MatrixSimilarity.load('pkl/resume_stopped.index')

In [138]:
sims = index[vec_lsi] # perform a similarity query against the corpus

# (document_number, document_similarity)
sim_lst = list(enumerate(sims))

In [139]:
import operator
sim_lst.sort(key=operator.itemgetter(1), reverse=True)

In [141]:
# comparing resumes within resumes
sim_lst[1:6]

[(537, 0.9609338),
 (468, 0.95680636),
 (39, 0.95674884),
 (189, 0.95360476),
 (737, 0.94994313)]

In [144]:
' '.join(texts[0])

'engineer structural engineer december nonstructural equipment hospitals accordance asce cbc local codes extensive knowledge experience engineer programs enercalc etabs hilti profis remodel buildings beams columns foundations physical work remodel ensure work civil engineer student worker department public works september engineer meet publics needs transportation infrastructure project engineer project manages geographic presentation data gis system engineer report documents fund multimillion projects microsoft word access multiple projects coordination disaster reimbursement civil engineer'