# Vector Spaces

In [1]:
import numpy as np
import pandas as pd

import gensim
from gensim import corpora, models, similarities

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords

import re
import os
import json
from textblob import TextBlob
from collections import Counter, OrderedDict, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

## Load Processed Dataframe

In [2]:
df_1 = pd.read_json('data/md_contents.json')
df_2 = pd.read_json('data/rst_contents.json')
print(len(df_1))
df_1.head()

8495


Unnamed: 0,file_contents
0,What is this Python project Describe features ...
1,Contributing Your contributions are always wel...
2,Awesome Python Awesome A curated list of aweso...
3,Contribution Guidelines Before opening any iss...
4,Contributor Code of Conduct As contributors an...


In [3]:
print(len(df_2))
df_2.head()

19374


Unnamed: 0,file_contents
0,Requests is written and maintained by Kenneth ...
1,changelog Release History Bugfixes Fixed a bug...
2,If you are planning to submit a pull request t...
3,api Developer Interface module requests This p...
4,faq Frequently Asked Questions This part of th...


In [4]:
frames = [df_1, df_2]

df = pd.concat(frames)
print(len(df))
df.head()

27869


Unnamed: 0,file_contents
0,What is this Python project Describe features ...
1,Contributing Your contributions are always wel...
2,Awesome Python Awesome A curated list of aweso...
3,Contribution Guidelines Before opening any iss...
4,Contributor Code of Conduct As contributors an...


# Extract just nouns/noun phrases

In [5]:
def only_nouns(sentence):
    sentence = nltk.word_tokenize(sentence)
    sent = pos_tag(sentence)
    return ' '.join([s[0].lower() for s in sent if s[1] in ['NN', 'NNP']])

In [6]:
nouns = lambda x: only_nouns(x)

df['file_nouns'] = df['file_contents']
df['file_nouns'] = df['file_nouns'].apply(nouns)

In [7]:
df.head()

Unnamed: 0,file_contents,file_nouns
0,What is this Python project Describe features ...,python project describe difference python proj...
1,Contributing Your contributions are always wel...,guidelines add link pull request add link proj...
2,Awesome Python Awesome A curated list of aweso...,awesome python awesome a list python framework...
3,Contribution Guidelines Before opening any iss...,contribution guidelines read contributor s gui...
4,Contributor Code of Conduct As contributors an...,contributor code conduct project interest comm...


## Convert Series to List of Strings

In [8]:
contents = df['file_nouns'].tolist()
contents[1]

'guidelines add link pull request add link project name a description period keep add section section description add section title table contents search yours don t mention python description check spelling grammar remove whitespace send pull request reason library'

In [9]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

In [10]:
def tokenize(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    
    # filter words shorter than 2 chars and words containing chars other than letters
    for token in tokens:
        if len(token) > 2:
            if re.search('[a-zA-Z]', token):
                if token not in stopwords:
                    filtered_tokens.append(token)
    return filtered_tokens

In [11]:
vocab_tokenized = []
for doc in contents:
    doc_tokens = tokenize(doc)
    vocab_tokenized.extend(doc_tokens)

In [12]:
print(len(vocab_tokenized))
print(len(set(vocab_tokenized)))

4740829
118803


In [13]:
vocab_frame = pd.DataFrame({'words': vocab_tokenized}, index = vocab_tokenized)
print('{0} words in tokenized corpus'.format(len(vocab_tokenized)))

4740829 words in tokenized corpus


In [14]:
vocab_frame.head()

Unnamed: 0,words
python,python
project,project
describe,describe
difference,difference
python,python


In [15]:
wc = Counter(vocab_tokenized)

y = OrderedDict(wc.most_common()[::-1])
len(y)

118803

In [16]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.98, max_features=1500, min_df=0.01, stop_words='english',
                                 use_idf=True, tokenizer=tokenize, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(contents)

print(tfidf_matrix.shape)

CPU times: user 1min 29s, sys: 1.48 s, total: 1min 31s
Wall time: 1min 31s
(27869, 1132)


In [17]:
terms = tfidf_vectorizer.get_feature_names()

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [19]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=5)
%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

CPU times: user 38.8 s, sys: 281 ms, total: 39 s
Wall time: 39.3 s


In [20]:
from sklearn.externals import joblib

#uncomment the below to save your model 
joblib.dump(km,  'pkl/doc_cluster.pkl')

km = joblib.load('pkl/doc_cluster.pkl')
clusters = km.labels_.tolist()

In [21]:
data = { 'contents': contents, 'cluster': clusters }
frame = pd.DataFrame(data, index = [clusters] , columns = ['contents', 'cluster'])

In [22]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 
order_centroids

array([[1083,  346,  469, ...,  248,  678,  739],
       [  66,  497,  657, ...,  631,  630,    0],
       [ 786,  154,  385, ..., 1084, 1086,   67],
       [ 377, 1085, 1083, ...,  917,  464,  371],
       [ 868,   67,  869, ...,  615,  295,  976]])

In [23]:
print("Top terms per cluster:")
print()

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(5):
    print("Cluster {0}:".format(i), end='')
    for ind in order_centroids[i, :8]: #replace 8 with n words per cluster
        print(' %s ' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print('\n')

Top terms per cluster:

Cluster 0: b'vim' , b'event' , b'host' , b'dynamicdata' , b'attributes' , b'vmodl' , b'device' , b'api' ,

Cluster 1: b'automodule' , b'inheritance' , b'module' , b'module' , b'mod' , b'package' , b'import' , b'autoclass' ,

Cluster 2: b'python' , b'code' , b'file' , b'class' , b'docker' , b'example' , b'image' , b'import' ,

Cluster 3: b'fault' , b'vim' , b'vim' , b'str' , b'str' , b'host' , b'attributes' , b'machine' ,

Cluster 4: b'salt' , b'automodule' , b'salt' , b'automodule' , b'module' , b'module' , b'cloud' , b'master' ,



# From Strings to Vectors

### Tokenize the documents, remove stop words and words that only appear once

In [24]:
# remove common words and tokenize
from nltk.corpus import stopwords
stoplist = set(stopwords.words('english'))

In [25]:
texts = [[word.lower() for word in content.split()if word.lower() not in stoplist] for content in contents]

# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# remove words that occur less than n times
texts = [[token for token in text if frequency[token] > 3] for text in texts]
len(texts)

27869

### Save Token Count Dictionary to File

In [26]:
dictionary = corpora.Dictionary(texts)

# store the dictionary, for future reference
dictionary.save('data/text_token.dict')
print(dictionary)

Dictionary(41107 unique tokens: ['giza', 'schofield', 'sshfp', 'composewith', 'kwlist']...)


### Convert Tokenized Resumes to Vectors

In [27]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('data/text_token.mm', corpus)  # store to disk, for later use
for c in corpus[:1]:
    print(c)

[(0, 2), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]


# Transformation Interface

In [28]:
# load tokenized dictionary
if (os.path.exists('data/text_token.dict')):
    dictionary = corpora.Dictionary.load('data/text_token.dict')
    print('Tokenized dictionary LOADED as \'dictionary\'')
else:
    print('Tokenized dictionary NOT FOUND')

Tokenized dictionary LOADED as 'dictionary'


In [29]:
# load sparse vector matrix
if (os.path.exists('data/text_token.mm')):
    corpus = corpora.MmCorpus('data/text_token.mm')
    print('Sparse matrix LOADED as \'corpus\'')
else:
    print('Sparse matrix NOT FOUND')

Sparse matrix LOADED as 'corpus'


### TF-IDF Transformation

In [30]:
# step 1 -- initialize a model
tfidf_mdl = models.TfidfModel(corpus) 

Calling `model[corpus]` only creates a wrapper around the old corpus document stream – actual conversions are done on-the-fly, during document iteration. We cannot convert the entire corpus at the time of calling corpus_transformed = model[corpus], because that would mean storing the result in main memory, and that contradicts gensim’s objective of memory-indepedence. If you will be iterating over the transformed corpus_transformed multiple times, and the transformation is costly, serialize the resulting corpus to disk first and continue using that.

In [31]:
# step 2 -- use the model to transform vectors
corpus_tfidf = tfidf_mdl[corpus]
print(len(corpus_tfidf))

# view one resume
for doc in corpus_tfidf[:1]:
    print(doc)

27869
[(0, 0.19075830486731032), (1, 0.4682655254274302), (2, 0.3599113037331229), (3, 0.3871385595612281), (4, 0.28721377848257035), (5, 0.1969951695186765), (6, 0.3918408487564352), (7, 0.35733996735093254), (8, 0.24979251706536024)]


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
n_features = 1500

tfidf_vec = TfidfVectorizer(input='content', ngram_range=(1, 3), max_df=0.85, min_df=0.05, 
                max_features=n_features, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

tfidf_vec_prep = tfidf_vec.fit_transform(contents)

In [33]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1, n_jobs=-1)

km_mdl = km.fit_predict(tfidf_vec_prep)

In [34]:
len(km_mdl)

27869

In [35]:
# Determine your k range
k_range = range(1,20)

# fit the kmeans model for each n_clusters = k
k_means_var = [KMeans(n_clusters=k).fit(tfidf_vec_prep) for k in k_range]

# pull out the cluster centers for each model
centroids = [X.cluster_centers_ for X in k_means_var]

In [36]:
from scipy.spatial.distance import cdist, pdist

# calculate the euclidean distance from each point to each cluster center
k_euclid = [cdist(tfidf_vec_prep.toarray(), cent, 'euclidean') for cent in centroids]
dist = [np.min(ke, axis=1) for ke in k_euclid]

# total within-cluster sum of squares
wcss = [sum(d**2) for d in dist]

# the total sum of squares
tss = sum(pdist(tfidf_vec_prep.toarray())**2)/tfidf_vec_prep.shape[1]

# the between-cluster sum of squares
bss = tss - wcss

In [37]:
import seaborn as sns
sns.set_style("white")
sns.set_context("poster", font_scale=1.25, rc={"lines.linewidth": 2.5})
sns.set_palette("Set2")

colors = ['#fffc31', '#ff9505', '#ff3c38', '#04e762', '#662c91']
edges = ['#ffb901', '#cc5803', '#eb3968', '#00501e', '#ad66ff']

In [38]:
import numpy as np
from scipy.cluster.vq import kmeans
from scipy.spatial.distance import cdist,pdist
from sklearn import datasets
from sklearn.decomposition import RandomizedPCA
from matplotlib import pyplot as plt
from matplotlib import cm

# perform PCA dimensionality reduction
pca = RandomizedPCA(n_components=2).fit(tfidf_vec_prep.toarray())
X = pca.transform(tfidf_vec_prep.toarray())

##### cluster data into K=1..20 clusters #####
K_MAX = 20
KK = range(1,K_MAX+1)

KM = [kmeans(X,k) for k in KK]
centroids = [cent for (cent,var) in KM]
D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D,axis=1) for D in D_k]
dist = [np.min(D,axis=1) for D in D_k]

tot_withinss = [sum(d**2) for d in dist]  # Total within-cluster sum of squares
totss = sum(pdist(X)**2)/X.shape[0]       # The total sum of squares
betweenss = totss - tot_withinss          # The between-cluster sum of squares

##### plots #####
kIdx = 4        # K=10
clr = cm.spectral( np.linspace(0,1,10) ).tolist()
mrk = 'os^p<dvh8>+x.'

In [39]:
# make figure
fig = plt.figure(figsize=(20,12))
ax = fig.add_subplot(111)

# plots
ax.plot(KK, betweenss/totss*100, marker='o', color=colors[-1], alpha=0.5)
ax.plot(KK[kIdx], betweenss[kIdx]/totss*100, marker='o', markersize=25, color=colors[0], alpha=0.5)

# labels/titles
plt.legend(loc="best")
plt.title('Elbow for KMeans Clustering')
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of variance explained (%)')

ax.set_xlim((-0.1,20.5))
ax.set_ylim((-0.5,100))

# remove border
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)

# show grid
ax.xaxis.grid(True, alpha=0.2) 
ax.yaxis.grid(True, alpha=0.2) 

# plot that biddy
plt.savefig('data/{0}.png'.format('KMeans_elbow_var'), bbox_inches='tight')
plt.close(fig)



In [40]:
# make figure
fig = plt.figure(figsize=(20,12))
ax = fig.add_subplot(111)

# plots
for i in range(kIdx+1):
    ind = (cIdx[kIdx]==i)
    ax.scatter(X[ind,0],X[ind,1], s=65, c=colors[i], marker=mrk[i], 
               label='Cluster {0}'.format(i), alpha=1)

# labels/titles
plt.legend(loc='upper right')
plt.title('K={0} Clusters'.format(KK[kIdx]))

#ax.set_xlim((-.5,.5))
#ax.set_ylim((-.3,.81))

# remove border
ax.spines["top"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)

# show grid
ax.xaxis.grid(True, alpha=0.2) 
ax.yaxis.grid(True, alpha=0.2) 

# plot that biddy
plt.savefig('data/{0}.png'.format('KMeans_{0}_clusters'.format(KK[kIdx])), bbox_inches='tight')
plt.close(fig)

In [45]:
import seaborn as sns
sns.set_style("white")
sns.set_context("poster", font_scale=1.25, rc={"lines.linewidth": 2.5})
sns.set_palette("Set2")

colors = ['#fffc31', '#ff9505', '#ff3c38', '#04e762', '#662c91']
edges = ['#ffb901', '#cc5803', '#CF000F', '#00501e', '#ad66ff']
labels = ['Web', 'Programs', 'Dev Ops', 'Modules', 'Device']
#alphas = [0.0, 1, 0.0, 0.0, 0.0]
alphas = [1, 1, 1, 1, 1]

In [46]:
# perform PCA dimensionality reduction
pca = RandomizedPCA(n_components=3).fit(tfidf_vec_prep.toarray())
X = pca.transform(tfidf_vec_prep.toarray())

##### cluster data into K=1..20 clusters #####
K_MAX = 20
KK = range(1,K_MAX+1)

KM = [kmeans(X,k) for k in KK]
centroids = [cent for (cent,var) in KM]
D_k = [cdist(X, cent, 'euclidean') for cent in centroids]
cIdx = [np.argmin(D,axis=1) for D in D_k]
dist = [np.min(D,axis=1) for D in D_k]

tot_withinss = [sum(d**2) for d in dist]  # Total within-cluster sum of squares
totss = sum(pdist(X)**2)/X.shape[0]       # The total sum of squares
betweenss = totss - tot_withinss          # The between-cluster sum of squares

##### plots #####
kIdx = 4        # K=10
clr = cm.spectral( np.linspace(0,1,10) ).tolist()
mrk = 'os^p<dvh8>+x.'

In [47]:
import matplotlib.font_manager as fm

prop = fm.FontProperties(fname='/Users/bryant/Library/Fonts/BebasNeue Regular.otf')

In [51]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(18,16))
ax = fig.add_subplot(111, projection='3d')

# Get rid of the panes                          
ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0)) 
ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0)) 
ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0)) 

# Get rid of the spines                         
ax.w_xaxis.line.set_color((1.0, 1.0, 1.0, 0.0)) 
ax.w_yaxis.line.set_color((1.0, 1.0, 1.0, 0.0)) 
ax.w_zaxis.line.set_color((1.0, 1.0, 1.0, 0.0))

#ax.set_axis_off()

# plots
for i in range(kIdx+1):
    ind = (cIdx[kIdx]==i)
    ax.scatter(X[ind,0],X[ind,1], X[ind,2], s=105, c=colors[i], alpha=alphas[i], marker=mrk[i], 
               label=labels[i], edgecolor=edges[i])

# labels/titles
plt.legend(loc='lower right')

ax.set_xlim((-0.4,0.5))
ax.set_ylim((0.0,1.2))
ax.set_zlim((-0.6,0.35))


plt.savefig('data/{0}.png'.format('KMeans_{0}_3D'.format(KK[kIdx])), bbox_inches='tight', dpi=150,)
plt.close(fig)
#plt.show()

# Latent Semantic Indexing Topics

In [None]:
num_topics = 100

# initialize an LSI transformation
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
corpus_lsi = lsi[corpus_tfidf]

In [None]:
# the topics are printed to log
a = lsi.print_topics(8)
a[0]

In [None]:
for doc in corpus_lsi[800]: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    pass
    #print(doc)

# Model Save & Load

In [None]:
lsi.save('pkl/lsi_mdl.lsi')
lsi = models.LsiModel.load('pkl/lsi_mdl.lsi')

# LDA Topics

In [None]:
lda_mdl = models.LdaModel(corpus, id2word=dictionary, num_topics=20)

In [None]:
lda_mdl.top_topics
pprint(lda_mdl.print_topics(10))

In [None]:
print(corpus)

In [None]:
doc = df.iloc[0]['resume_nouns']
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow] # convert the query to LSI space
print(vec_lsi)

# Cosine Similarity

In [None]:
index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it

In [None]:
index.save('pkl/resume_stopped.index')
index = similarities.MatrixSimilarity.load('pkl/resume_stopped.index')

In [None]:
sims = index[vec_lsi] # perform a similarity query against the corpus

# (document_number, document_similarity)
sim_lst = list(enumerate(sims))

In [None]:
import operator
sim_lst.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# comparing resumes within resumes
sim_lst[1:6]

In [None]:
' '.join(texts[0])