# Vector Spaces

In [17]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import gensim
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from pymongo import MongoClient

from collections import defaultdict
from pprint import pprint
from six import iteritems
import os

import numpy as np
import pandas as pd
import scipy.sparse

## Load Processed Dataframe

In [2]:
# Load the pre-processed resume table that was pickled to disk beforehand.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
df = pd.read_pickle('pkl/df_stop_noun.pkl')

# Preview the first three rows.
df.head(n=3)

Unnamed: 0,resume_text,resume_stopped,resume_nouns
0,"Petros Gazazyan North Hollywood, CA Werkervari...",petros gazazyan north hollywood ca werkervarin...,petros gazazyan hollywood ca design engineer s...
1,"Travis London Java Software Engineer Tucson, A...",travis london java software engineer tucson az...,travis london java software engineer tucson az...
2,"Stephen A. Kraft Mechanical Engineer Seattle, ...",stephen kraft mechanical engineer seattle wa b...,stephen kraft mechanical engineer seattle wa b...


## Convert Series to List of Strings

In [3]:
# Pull the noun-only resume text out of the dataframe as a plain Python list,
# one string per resume.
resumes = df.loc[:, 'resume_nouns'].tolist()

# Peek at the first two entries.
resumes[:2]

['petros gazazyan hollywood ca design engineer structural ttg engineer pasadena ca december nonstructural equipment anchorage major southern california hospitals accordance asce cbc local codes extensive knowledge experience engineering programs design enercalc etabs hilti profis design remodel buildings beams columns foundations area work physical work remodel ensure work civil engineering student worker los angeles county department public works alhambra ca september publics needs transportation infrastructure project development division los angeles county engineers project managers geographic presentation data gis systems engineering reports documents fund multimillion dollar projects microsoft word excel access multiple projects bikeway coordination disaster reimbursement civil engineering california state university northridge northridge ca',
 'travis london java software engineer tucson az bereid overal naartoe te verhuizen engineer contract senior software engineer tucson az se

# From Strings to Vectors

### Tokenize the documents, remove stop words and rare words (fewer than three occurrences)

In [4]:
# Tokenize each resume on whitespace and drop the common words in `stoplist`.
# (The stop list used to be built but never applied; the filter below applies it.)
stoplist = set('for a of the and to in'.split())
texts = [[word for word in resume.split() if word not in stoplist]
         for resume in resumes]

# Count how often every token occurs across the whole collection.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only tokens that occur at least `min_token_count` times in total,
# i.e. tokens seen just once or twice are discarded.
min_token_count = 3
texts = [[token for token in text if frequency[token] >= min_token_count]
         for text in texts]

### Save Token Count Dictionary to File

In [5]:
# Build the token -> integer id mapping from the tokenized resumes.
dictionary = corpora.Dictionary(documents=texts)

# Persist the mapping so later cells / sessions can reload it.
dictionary.save('pkl/resume_token.dict')

# Show a short summary of the vocabulary.
print(dictionary)

2016-08-18 11:24:47,265 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2016-08-18 11:24:51,071 : INFO : adding document #10000 to Dictionary(42148 unique tokens: ['gerwien', 'globallogic', 'coaf', 'umatilla', 'collates']...)
2016-08-18 11:24:53,322 : INFO : built Dictionary(47433 unique tokens: ['umatilla', 'collates', 'refactors', 'iaw', 'certificationsalesforcecom']...) from 17049 documents (total 6995164 corpus positions)
2016-08-18 11:24:53,323 : INFO : saving Dictionary object under pkl/resume_token.dict, separately None


Dictionary(47433 unique tokens: ['umatilla', 'collates', 'refactors', 'iaw', 'certificationsalesforcecom']...)


### Convert Tokenized Resumes to Vectors

In [19]:
# Encode every tokenized resume as a sparse bag-of-words vector
# (a list of (token_id, count) pairs).
corpus = list(map(dictionary.doc2bow, texts))

# Write the vectors to disk in Matrix Market format for later use.
corpora.MmCorpus.serialize('pkl/resume_token.mm', corpus)

# Display only the first vector as a sanity check.
for bow_vector in corpus[:1]:
    print(bow_vector)

2016-08-18 11:50:09,301 : INFO : storing corpus in Matrix Market format to pkl/resume_token.mm
2016-08-18 11:50:09,302 : INFO : saving sparse matrix to pkl/resume_token.mm
2016-08-18 11:50:09,302 : INFO : PROGRESS: saving document #0
2016-08-18 11:50:09,633 : INFO : PROGRESS: saving document #1000
2016-08-18 11:50:10,118 : INFO : PROGRESS: saving document #2000
2016-08-18 11:50:10,571 : INFO : PROGRESS: saving document #3000
2016-08-18 11:50:11,022 : INFO : PROGRESS: saving document #4000
2016-08-18 11:50:11,431 : INFO : PROGRESS: saving document #5000
2016-08-18 11:50:11,846 : INFO : PROGRESS: saving document #6000
2016-08-18 11:50:12,204 : INFO : PROGRESS: saving document #7000
2016-08-18 11:50:12,522 : INFO : PROGRESS: saving document #8000
2016-08-18 11:50:12,873 : INFO : PROGRESS: saving document #9000
2016-08-18 11:50:13,238 : INFO : PROGRESS: saving document #10000
2016-08-18 11:50:13,616 : INFO : PROGRESS: saving document #11000
2016-08-18 11:50:13,903 : INFO : PROGRESS: saving

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 3), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 3), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 2), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 2), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 1), (64, 4), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 2), (76, 2), (77, 4), (78, 1)]


## Corpus Streaming – One Document at a Time

In [7]:
class MyCorpus(object):
    """Stream bag-of-words vectors one document at a time.

    Iterating an instance yields ``dictionary.doc2bow(tokens)`` for each
    entry of the module-level ``texts``; nothing is converted until the
    iteration actually runs.

    To read from disk instead, iterate over ``open(my_file.txt)`` (one line
    of the file is one document) in place of ``texts`` -- or possibly loop
    over individual files. Either way ``dictionary.doc2bow`` wants a list of
    words, e.g. ``line.lower().split()``.
    """

    def __iter__(self):
        # Delegate to a lazy generator so vectors are produced on demand.
        yield from (dictionary.doc2bow(tokens) for tokens in texts)

In [13]:
# Creating the object does no work: bag-of-words vectors are only produced,
# one document at a time, when the object is iterated.
# NOTE(review): MyCorpus iterates the in-memory `texts` list, so the saving
# here is limited to not materializing the vectors; the tokenized texts
# themselves are already held in memory.
corpus_memory_friendly = MyCorpus()

### Similarly, to construct the dictionary without loading all texts into memory

In [14]:
# Reference snippet only: the code below is kept inside a string literal so it
# is never executed. It sketches how to build the dictionary by streaming a
# text file line by line and then filtering stop words and rare tokens.
# NOTE(review): assigning to `_` shadows IPython's "last output" variable.
_ = '''
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))

# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist 
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]

# remove stop words and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)

# remove gaps in id sequence after words that were removed
dictionary.compactify()
print(dictionary)
'''

# Transformation Interface

In [21]:
# Restore the saved token dictionary from disk when the file is present;
# otherwise only report that it is missing (`dictionary` is left untouched).
if not os.path.exists('pkl/resume_token.dict'):
    print('Tokenized dictionary NOT FOUND')
else:
    dictionary = corpora.Dictionary.load('pkl/resume_token.dict')
    print("Tokenized dictionary LOADED as 'dictionary'")

2016-08-18 11:51:39,304 : INFO : loading Dictionary object from pkl/resume_token.dict


Tokenized dictionary LOADED as 'dictionary'


In [22]:
# Re-open the serialized Matrix Market corpus when the file is present;
# otherwise only report that it is missing (`corpus` is left untouched).
if not os.path.exists('pkl/resume_token.mm'):
    print('Sparse matrix NOT FOUND')
else:
    corpus = corpora.MmCorpus('pkl/resume_token.mm')
    print("Sparse matrix LOADED as 'corpus'")

2016-08-18 11:51:40,384 : INFO : loaded corpus index from pkl/resume_token.mm.index
2016-08-18 11:51:40,385 : INFO : initializing corpus reader from pkl/resume_token.mm
2016-08-18 11:51:40,385 : INFO : accepted corpus with 17049 documents, 47433 features, 3747585 non-zero entries


Sparse matrix LOADED as 'corpus'


### TF-IDF Transformation

In [23]:
# Step 1: fit the TF-IDF model on the bag-of-words corpus.
tfidf_mdl = models.TfidfModel(corpus=corpus)

2016-08-18 11:52:50,113 : INFO : collecting document frequencies
2016-08-18 11:52:50,115 : INFO : PROGRESS: processing document #0
2016-08-18 11:52:56,897 : INFO : PROGRESS: processing document #10000
2016-08-18 11:53:00,848 : INFO : calculating IDF weights for 17049 documents and 47432 features (3747585 matrix non-zeros)


Calling `model[corpus]` only creates a wrapper around the old corpus document stream – actual conversions are done on-the-fly, during document iteration. We cannot convert the entire corpus at the time of calling `corpus_transformed = model[corpus]`, because that would mean storing the result in main memory, and that contradicts gensim’s objective of memory-independence. If you will be iterating over the transformed `corpus_transformed` multiple times, and the transformation is costly, serialize the resulting corpus to disk first and continue using that.

In [29]:
# Step 2: apply the fitted model to the corpus. Indexing the model with a
# corpus wraps it; the weights are computed while iterating.
corpus_tfidf = tfidf_mdl[corpus]

# Show the TF-IDF weights of the first resume only.
for tfidf_vector in corpus_tfidf[:1]:
    print(tfidf_vector)

[(0, 0.09697439731236943), (1, 0.01994670625868208), (2, 0.11683622359629328), (3, 0.030254916666750908), (4, 0.050117949458092725), (5, 0.008687422725094847), (6, 0.062312898778185806), (7, 0.12291201188606497), (8, 0.10929528811163892), (9, 0.21606180788503235), (10, 0.1948861293470026), (11, 0.07715334861543517), (12, 0.18141552217773346), (13, 0.03443172309251477), (14, 0.06266765793155121), (15, 0.1579713910568025), (16, 0.05363696542306602), (17, 0.034978307404207425), (18, 0.17271063699473888), (19, 0.09806275559265303), (20, 0.026715620471933557), (21, 0.09579563734550717), (22, 0.17271063699473888), (23, 0.1286770479452766), (24, 0.05708005820057115), (25, 0.1576960892928475), (26, 0.014893087963931375), (27, 0.0031983449109230845), (28, 0.1987386650313829), (29, 0.029200285539392586), (30, 0.04382620878455877), (31, 0.015930497729036763), (32, 0.07833740220461406), (33, 0.12493293764695687), (34, 0.05345672339257628), (35, 0.0527571713622434), (36, 0.09384214238200134), (37, 

# Latent Semantic Indexing Topics

In [33]:
# Number of latent topics to keep.
num_topics = 10

# Fit an LSI transformation on the TF-IDF weighted corpus.
lsi = models.LsiModel(
    corpus=corpus_tfidf,
    num_topics=num_topics,
    id2word=dictionary,
)

# Double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi.
corpus_lsi = lsi[corpus_tfidf]


2016-08-18 11:58:52,709 : INFO : using serial LSI version on this node
2016-08-18 11:58:52,710 : INFO : updating model with new documents
2016-08-18 11:59:07,640 : INFO : preparing a new chunk of documents
2016-08-18 11:59:08,595 : INFO : using 100 extra samples and 2 power iterations
2016-08-18 11:59:08,596 : INFO : 1st phase: constructing (47433, 110) action matrix
2016-08-18 11:59:09,046 : INFO : orthonormalizing (47433, 110) action matrix
2016-08-18 11:59:12,156 : INFO : 2nd phase: running dense svd on (110, 17049) matrix
2016-08-18 11:59:12,535 : INFO : computing the final decomposition
2016-08-18 11:59:12,536 : INFO : keeping 10 factors (discarding 56.308% of energy spectrum)
2016-08-18 11:59:12,567 : INFO : processed documents up to #17049
2016-08-18 11:59:12,572 : INFO : topic #0(24.717): 0.137*"sql" + 0.129*"business" + 0.108*"analytics" + 0.104*"hadoop" + 0.100*"oracle" + 0.098*"server" + 0.092*"project" + 0.088*"sales" + 0.087*"hive" + 0.085*"database"
2016-08-18 11:59:12,57

In [34]:
# Write every fitted topic to the log; the returned list is also shown as
# the cell output.
lsi.print_topics(num_topics=num_topics)

2016-08-18 11:59:12,816 : INFO : topic #0(24.717): 0.137*"sql" + 0.129*"business" + 0.108*"analytics" + 0.104*"hadoop" + 0.100*"oracle" + 0.098*"server" + 0.092*"project" + 0.088*"sales" + 0.087*"hive" + 0.085*"database"
2016-08-18 11:59:12,818 : INFO : topic #1(14.917): -0.291*"hadoop" + -0.281*"hive" + -0.203*"hdfs" + -0.200*"pig" + -0.178*"java" + -0.137*"sqoop" + -0.130*"hbase" + -0.126*"sql" + -0.117*"oracle" + -0.114*"mapreduce"
2016-08-18 11:59:12,820 : INFO : topic #2(11.035): -0.212*"hive" + -0.205*"hadoop" + 0.194*"sql" + -0.163*"hdfs" + -0.159*"pig" + -0.152*"entry" + 0.150*"etl" + 0.140*"informatica" + 0.126*"oracle" + 0.120*"server"
2016-08-18 11:59:12,821 : INFO : topic #3(10.260): 0.224*"analytics" + -0.178*"entry" + 0.147*"media" + 0.127*"google" + 0.125*"digital" + 0.117*"strategy" + 0.111*"hadoop" + -0.109*"sql" + 0.108*"market" + -0.106*"server"
2016-08-18 11:59:12,823 : INFO : topic #4(8.914): 0.235*"sas" + 0.179*"scientist" + 0.174*"laboratory" + 0.146*"research" +

[(0,
  '0.137*"sql" + 0.129*"business" + 0.108*"analytics" + 0.104*"hadoop" + 0.100*"oracle" + 0.098*"server" + 0.092*"project" + 0.088*"sales" + 0.087*"hive" + 0.085*"database"'),
 (1,
  '-0.291*"hadoop" + -0.281*"hive" + -0.203*"hdfs" + -0.200*"pig" + -0.178*"java" + -0.137*"sqoop" + -0.130*"hbase" + -0.126*"sql" + -0.117*"oracle" + -0.114*"mapreduce"'),
 (2,
  '-0.212*"hive" + -0.205*"hadoop" + 0.194*"sql" + -0.163*"hdfs" + -0.159*"pig" + -0.152*"entry" + 0.150*"etl" + 0.140*"informatica" + 0.126*"oracle" + 0.120*"server"'),
 (3,
  '0.224*"analytics" + -0.178*"entry" + 0.147*"media" + 0.127*"google" + 0.125*"digital" + 0.117*"strategy" + 0.111*"hadoop" + -0.109*"sql" + 0.108*"market" + -0.106*"server"'),
 (4,
  '0.235*"sas" + 0.179*"scientist" + 0.174*"laboratory" + 0.146*"research" + -0.140*"sales" + 0.127*"r" + 0.123*"python" + 0.122*"cell" + 0.105*"clinical" + 0.104*"statistical"'),
 (5,
  '-0.254*"network" + -0.249*"cisco" + 0.241*"sas" + -0.193*"engineer" + -0.126*"switches" + 

In [None]:
# Both the bow->tfidf and tfidf->lsi transformations are actually executed
# here, on the fly, while the wrapped corpus is iterated.
# Only a handful of documents are printed: the corpus holds thousands of
# resumes and dumping all of them would flood the notebook output.
# (The stray REPL "..." continuation prompt has been removed from the loop body.)
num_preview_docs = 5
for doc_index, doc in enumerate(corpus_lsi):
    if doc_index >= num_preview_docs:
        break
    print(doc)