# Vector Spaces

In [1]:
import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import gensim
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from pymongo import MongoClient

from collections import defaultdict
from pprint import pprint
from six import iteritems
import os

import numpy as np
import pandas as pd
import scipy.sparse

## Load Processed Dataframe

In [2]:
# Load the pre-processed resume DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles you
# produced yourself; presumably this one comes from an earlier preprocessing step.
df = pd.read_pickle('pkl/df_stop_noun.pkl')
# Preview the first three rows.
df.head(3)

Unnamed: 0,resume_text,resume_stopped,resume_nouns
0,"Petros Gazazyan North Hollywood, CA Werkervari...",petros gazazyan north hollywood ca werkervarin...,petros gazazyan hollywood ca design engineer s...
1,"Travis London Java Software Engineer Tucson, A...",travis london java software engineer tucson az...,travis london java software engineer tucson az...
2,"Stephen A. Kraft Mechanical Engineer Seattle, ...",stephen kraft mechanical engineer seattle wa b...,stephen kraft mechanical engineer seattle wa b...


## Convert Series to List of Strings

In [39]:
# Extract the 'resume_nouns' column as a plain Python list (one string per row).
resumes = df['resume_nouns'].tolist()
# Show the first entry as a sanity check.
resumes[:1]

['petros gazazyan hollywood ca design engineer structural ttg engineer pasadena ca december nonstructural equipment anchorage major southern california hospitals accordance asce cbc local codes extensive knowledge experience engineering programs design enercalc etabs hilti profis design remodel buildings beams columns foundations area work physical work remodel ensure work civil engineering student worker los angeles county department public works alhambra ca september publics needs transportation infrastructure project development division los angeles county engineers project managers geographic presentation data gis systems engineering reports documents fund multimillion dollar projects microsoft word excel access multiple projects bikeway coordination disaster reimbursement civil engineering california state university northridge northridge ca']

# From Strings to Vectors

### Tokenize the documents, remove stop words and rare words (fewer than 3 occurrences in the corpus)

In [5]:
# Common function words to discard. The input column looks already stop-word
# filtered upstream (TODO confirm), so this acts as a cheap safety net.
stoplist = set('for a of the and to in'.split())

# tokenize on whitespace and drop stop words
# (previously `stoplist` was defined but never applied)
texts = [[word for word in resume.split() if word not in stoplist]
         for resume in resumes]

# count how many times each token occurs across the whole corpus
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# keep only tokens that occur more than twice (i.e. at least 3 times) in the corpus
texts = [[token for token in text if frequency[token] > 2] for text in texts]

### Save Token Count Dictionary to File

In [6]:
# Build the gensim Dictionary (token <-> integer id mapping) from the filtered token lists.
dictionary = corpora.Dictionary(texts)

# store the dictionary, for future reference
dictionary.save('pkl/resume_token.dict')
# Printing shows the number of unique tokens and a small sample of them.
print(dictionary)

Dictionary(47433 unique tokens: ['zonar', 'thotampali', 'amite', 'junior', 'mclennan']...)


### Convert Tokenized Resumes to Vectors

In [7]:
# Convert every tokenized resume into a bag-of-words vector:
# a list of (token_id, count) pairs, built fully in memory.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('pkl/resume_token.mm', corpus)  # store to disk, for later use
# Print the first vector only (the slice keeps this safe if the corpus is empty).
for c in corpus[:1]:
    print(c)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 3), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 2), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 3), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 2), (69, 1), (70, 1), (71, 1), (72, 1), (73, 4), (74, 1), (75, 4), (76, 1), (77, 2), (78, 1)]


## Corpus Streaming – One Document at a Time

In [8]:
# To stream from disk instead, swap `texts` for an open file handle
# (one document per line) or iterate over individual files.
# Whatever the source, dictionary.doc2bow expects a list of words,
# e.g. line.lower().split().
class MyCorpus(object):
    """Iterable that produces one bag-of-words vector per document.

    Reads the notebook-level `texts` and `dictionary` each time it is
    iterated, so it can be looped over repeatedly.
    """

    def __iter__(self):
        # Vectors are generated lazily, one document at a time.
        yield from (dictionary.doc2bow(tokens) for tokens in texts)

In [9]:
# doesn't load the corpus into memory!
# Instantiating MyCorpus does no work by itself; vectors are produced only
# when the object is iterated.
corpus_memory_friendly = MyCorpus()

### Similarly, to construct the dictionary without loading all texts into memory

In [10]:
# Reference snippet only: the code below is kept inside a string literal, so it
# is NOT executed. It sketches how to build and prune the dictionary by
# streaming a text file line by line.
_ = '''
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))

# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist 
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]

# remove stop words and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)

# remove gaps in id sequence after words that were removed
dictionary.compactify()
print(dictionary)
'''

# Transformation Interface

In [11]:
# Restore the persisted token dictionary from disk when it is available;
# otherwise only report that it is missing.
if not os.path.exists('pkl/resume_token.dict'):
    print('Tokenized dictionary NOT FOUND')
else:
    dictionary = corpora.Dictionary.load('pkl/resume_token.dict')
    print('Tokenized dictionary LOADED as \'dictionary\'')

Tokenized dictionary LOADED as 'dictionary'


In [12]:
# Restore the serialized bag-of-words corpus from disk when it is available;
# otherwise only report that it is missing.
if not os.path.exists('pkl/resume_token.mm'):
    print('Sparse matrix NOT FOUND')
else:
    corpus = corpora.MmCorpus('pkl/resume_token.mm')
    print('Sparse matrix LOADED as \'corpus\'')

Sparse matrix LOADED as 'corpus'


### TF-IDF Transformation

In [13]:
# step 1 -- initialize a model
# The TF-IDF model is fitted on the bag-of-words corpus here.
tfidf_mdl = models.TfidfModel(corpus) 

Calling `model[corpus]` only creates a wrapper around the old corpus document stream – actual conversions are done on-the-fly, during document iteration. We cannot convert the entire corpus at the time of calling corpus_transformed = model[corpus], because that would mean storing the result in main memory, and that contradicts gensim’s objective of memory-independence. If you will be iterating over the transformed corpus_transformed multiple times, and the transformation is costly, serialize the resulting corpus to disk first and continue using that.

In [14]:
# step 2 -- use the model to transform vectors
# Indexing the model with a corpus wraps it; weights are computed as documents are read.
corpus_tfidf = tfidf_mdl[corpus]

# view one resume
# (slice to the first document so only a single TF-IDF vector is printed)
for doc in corpus_tfidf[:1]:
    print(doc)

[(0, 0.17271063699473888), (1, 0.08226865295339658), (2, 0.10686051346627509), (3, 0.09697439731236941), (4, 0.053636965423066015), (5, 0.0527571713622434), (6, 0.09466997898226183), (7, 0.058669037157036336), (8, 0.053456723392576275), (9, 0.028229150859176745), (10, 0.11683622359629327), (11, 0.17756298649335311), (12, 0.15797139105680247), (13, 0.20193867602092316), (14, 0.037039715475002524), (15, 0.04534451507790173), (16, 0.07984585576025988), (17, 0.043826208784558764), (18, 0.21606180788503232), (19, 0.09384214238200132), (20, 0.019946706258682077), (21, 0.12493293764695686), (22, 0.19873866503138288), (23, 0.0776726818433162), (24, 0.04512061094377737), (25, 0.057398616551967006), (26, 0.05231333706535211), (27, 0.12291201188606496), (28, 0.019626395159590335), (29, 0.08211146996805817), (30, 0.09806275559265301), (31, 0.06266765793155121), (32, 0.049718241584578736), (33, 0.008687422725094845), (34, 0.07898569552840123), (35, 0.08070256008242034), (36, 0.18141552217773343), (

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the vocabulary size kept by the vectorizer.
n_features = 1000

# TF-IDF over word 1- to 3-grams of the raw resume strings.
tfidf_vec = TfidfVectorizer(
    input='content',
    ngram_range=(1, 3),
    max_df=0.9,
    min_df=2,
    max_features=n_features,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

# Fit the vocabulary and transform the resumes in one pass.
tfidf_vec_prep = tfidf_vec.fit_transform(resumes)

In [45]:
from sklearn.cluster import KMeans
from sklearn import metrics

# Cluster the TF-IDF resume vectors into 8 groups.
# With n_init=1 a single random initialisation decides the result, so the seed
# is pinned to make the cluster assignment reproducible across kernel restarts.
km = KMeans(n_clusters=8, init='k-means++', max_iter=100, n_init=1, random_state=42)

# NOTE: despite its name, `km_mdl` holds the per-resume cluster labels returned
# by fit_predict; the fitted estimator itself is `km`.
km_mdl = km.fit_predict(tfidf_vec_prep)

In [55]:
from sklearn.cluster import DBSCAN 
from sklearn.preprocessing import StandardScaler

# Density-based clustering of the TF-IDF vectors using cosine distance.
# The former `random_state=None` argument was dropped: DBSCAN does not use
# random initialisation, and the keyword was removed from scikit-learn, so
# passing it raises TypeError on current releases.
dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine', algorithm='brute',
                leaf_size=30, p=None)

# NOTE: `dbscan_mdl` holds the cluster labels returned by fit_predict;
# the fitted estimator itself is `dbscan`.
dbscan_mdl = dbscan.fit_predict(tfidf_vec_prep)

In [61]:
# Indices of the core samples found by DBSCAN. The original cell ended in a
# dangling `dbscan.`, which is a SyntaxError on re-run; the saved output
# (a sorted integer index array) matches this attribute.
dbscan.core_sample_indices_

array([    0,     2,     4, ..., 17044, 17045, 17047])

# Latent Semantic Indexing Topics

In [15]:
# Number of latent dimensions (topics) for the LSI model.
num_topics = 100

# initialize an LSI transformation
# The model is trained on the TF-IDF-weighted corpus, not on raw counts.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
# Wrap the TF-IDF corpus with the LSI model (bow -> tfidf -> lsi).
corpus_lsi = lsi[corpus_tfidf]

In [16]:
# the topics are printed to log
# print_topics also returns them, so the first 2 topics appear as the cell output.
lsi.print_topics(2)

[(0,
  '0.137*"sql" + 0.129*"business" + 0.108*"analytics" + 0.104*"hadoop" + 0.100*"oracle" + 0.098*"server" + 0.092*"project" + 0.088*"sales" + 0.087*"hive" + 0.085*"database"'),
 (1,
  '0.291*"hadoop" + 0.281*"hive" + 0.203*"hdfs" + 0.200*"pig" + 0.178*"java" + 0.137*"sqoop" + 0.130*"hbase" + 0.126*"sql" + 0.117*"oracle" + 0.114*"mapreduce"')]

In [17]:
# Smoke check: fetch the document at index 800 and walk over its entries without
# displaying anything (the print below is intentionally commented out).
# NOTE(review): index 800 is hard-coded -- presumably the corpus is known to be larger.
for doc in corpus_lsi[800]: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    pass
    #print(doc)

# Model Save & Load

In [18]:
# Persist the trained LSI model to disk ...
lsi.save('pkl/lsi_mdl.lsi')
# ... and immediately load it back, rebinding `lsi` to the loaded copy (round-trip check).
lsi = models.LsiModel.load('pkl/lsi_mdl.lsi')

# LDA Topics

In [20]:
# Train a 20-topic LDA model on the raw bag-of-words corpus.
# LDA training is stochastic; the seed is pinned so the topics are reproducible
# across kernel restarts.
lda_mdl = models.LdaModel(corpus, id2word=dictionary, num_topics=20, random_state=42)

In [33]:
# Pretty-print 10 of the LDA topics with their highest-weighted words.
# (The previous bare `lda_mdl.top_topics` line only referenced the method
# without calling it, so it had no effect and has been removed.)
pprint(lda_mdl.print_topics(10))

[(6,
  '0.011*data + 0.009*design + 0.007*mn + 0.007*ok + 0.006*production + '
  '0.005*sirna + 0.005*development + 0.005*multiplex + 0.005*art + '
  '0.004*oklahoma'),
 (3,
  '0.024*data + 0.022*analysis + 0.021*university + 0.013*research + '
  '0.012*science + 0.011*r + 0.009*scientist + 0.007*statistical + '
  '0.006*python + 0.006*model'),
 (2,
  '0.020*management + 0.017*project + 0.013*process + 0.010*analysis + '
  '0.010*business + 0.010*team + 0.009*development + 0.008*system + '
  '0.008*support + 0.008*training'),
 (9,
  '0.017*support + 0.017*security + 0.016*software + 0.016*systems + '
  '0.014*data + 0.012*system + 0.009*management + 0.009*information + '
  '0.008*network + 0.008*technical'),
 (18,
  '0.043*ca + 0.015*san + 0.014*c + 0.014*engineering + 0.014*software + '
  '0.012*data + 0.012*engineer + 0.010*design + 0.008*development + '
  '0.007*california'),
 (15,
  '0.067*data + 0.022*sql + 0.013*business + 0.012*database + 0.011*oracle + '
  '0.010*etl + 0.010*de

In [32]:
# Fit a Hierarchical Dirichlet Process topic model on the bag-of-words corpus.
# Unlike the LDA cell, no number of topics is passed here.
hdp = models.hdpmodel.HdpModel(corpus, id2word=dictionary)

In [None]:
# Take 5 topics with 5 words each as unformatted (structured) data.
# The two counts are passed positionally because their keyword names differ
# between gensim releases (`topics`/`topn` vs `num_topics`/`num_words`).
hdp_topics = hdp.show_topics(5, 5, log=False, formatted=False)

In [None]:
# NOTE(review): optimal_ordering() presumably reorders the model's topics in place -- confirm
# against the gensim docs. `hdp_topics` was computed before this call, so the value
# displayed below is not recomputed here.
hdp.optimal_ordering()
hdp_topics