# gensim demo

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
from gensim import corpora, models, similarities

## from strings to vectors

In [3]:
documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
              "The EPS user interface management system",
              "System and human system engineering testing of EPS",
              "Relation of user perceived response time to error measurement",
              "The generation of random binary unordered trees",
              "The intersection graph of paths in trees",
              "Graph minors IV Widths of trees and well quasi ordering",
              "Graph minors A survey"]

This is a tiny corpus of nine documents, each consisting of only a single sentence.


First, let’s tokenize the documents, remove common words (using a toy stoplist) as well as words that only appear once in the corpus:

In [4]:
#remove common words and tokenize
stoplist = set('for a of the to in and'.split())
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]

In [5]:
from pprint import pprint

In [6]:
pprint(texts)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]


In [7]:
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token]+=1
    
texts = [[token for token in text if frequency[token]>1] for text in texts]

In [8]:
pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


## Using bag-of-words approach:
It is advantageous to represent the questions only by their (integer) ids. The mapping between questions and ids is called a dictionary.

In [9]:
dictionary = corpora.Dictionary(texts)
dictionary.save('/home/kesj/tmp/deerwester.dict') #store this dictionary for future reference
print(dictionary)

Dictionary(12 unique tokens: ['graph', 'trees', 'computer', 'interface', 'human']...)


In [10]:
print(dictionary.token2id)

{'graph': 10, 'trees': 9, 'computer': 1, 'interface': 0, 'human': 2, 'survey': 4, 'time': 7, 'minors': 11, 'response': 3, 'user': 6, 'system': 5, 'eps': 8}


This creates an n-dimensional vector where $n$ is the number of words in my corporus.
here $n$ = 12.

In [11]:
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())

In [12]:
print(new_vec)

[(1, 1), (2, 1)]


In [13]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/home/kesj/tmp/deerwester.mm', corpus)
pprint(corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(0, 1), (5, 1), (6, 1), (8, 1)],
 [(2, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


## initialize the *transformation* using TFIDF

In [14]:
tfidf = models.TfidfModel(corpus)
vec = [(0,1),(4,1)]
print(tfidf[vec])

[(0, 0.7071067811865476), (4, 0.7071067811865476)]


In [15]:
print(tfidf[new_vec])

[(1, 0.7071067811865476), (2, 0.7071067811865476)]


In [16]:
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)

In [17]:
sims=index[tfidf[vec]]
print(list(enumerate(sims)))

[(0, 0.40824828), (1, 0.31412902), (2, 0.40376222), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.44424552)]


## Corpus Streaming -- One Document at a Time
Note that _corpus_ above resides fully in memory, as a plain Python list. In this simple example, it doesn’t matter much, but just to make things clear, let’s assume there are millions of documents in the corpus. Storing all of them in RAM won’t do. Instead, let’s assume the documents are stored in a file on disk, one document per line. Gensim only requires that a corpus must be able to return one document vector at a time:

In [18]:
class MyCorpus(object):
    def __iter__(self):
        for line in open('/san-data/shared/kesj/mycorpus.txt'):
            # assume there is one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())


The assumption that each document occupies one line in a single file is not important; you can mold the `__iter__` function to fit your input format, whatever it is. Walking directories, parsing XML, accessing network... Just parse your input to retrieve a clean list of tokens in each document, then convert the tokens via a dictionary to their ids and yield the resulting sparse vector inside `__iter__`.

#### Note I need to do this to parse the pandas dataframe text

In [19]:
corpus_memory_friendly = MyCorpus() # doesn't load the corpus into memory!

In [20]:
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x7f6897bae908>


Corpus is now an object (without any defined print method) so `print` just outputs the address of the object in memory. This isn't very useful. To see the constituent vectors, iterate over the corpus and print each document vector (one at a time).

In [21]:
for vector in corpus_memory_friendly: # load one vector into memor at a time
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(0, 1), (5, 1), (6, 1), (8, 1)]
[(2, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


Similarly to construct the dictionary without loading all texts into memory:

In [22]:
dictionary = corpora.Dictionary(line.lower().split() for line in open('/san-data/shared/kesj/mycorpus.txt'))
# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
#once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] works with python2.x
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1] #works with python3.x
dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
pprint(len(dictionary.token2id))


12


In [23]:
dictionary.token2id

{'computer': 10,
 'eps': 11,
 'graph': 2,
 'human': 5,
 'interface': 1,
 'minors': 3,
 'response': 8,
 'survey': 6,
 'system': 0,
 'time': 7,
 'trees': 4,
 'user': 9}

## I want to look at the difference between using tfidf and idf for the lda part

In [24]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#plt.hist(tfidf.dfs.values())

In [27]:
tfidf.dfs

{0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 3, 6: 3, 7: 2, 8: 2, 9: 3, 10: 3, 11: 2}

In [28]:
tfidf.idfs

{0: 2.1699250014423126,
 1: 2.1699250014423126,
 2: 2.1699250014423126,
 3: 2.1699250014423126,
 4: 2.1699250014423126,
 5: 1.5849625007211563,
 6: 1.5849625007211563,
 7: 2.1699250014423126,
 8: 2.1699250014423126,
 9: 1.5849625007211563,
 10: 1.5849625007211563,
 11: 2.1699250014423126}

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [52]:
skl_tfidfvect = TfidfVectorizer(stop_words=stoplist,min_df=2)

In [46]:
# " ".join(
#texts_str = " ".join([line for line in documents])

In [38]:
#texts_str

'Human machine interface for lab abc computer applications A survey of user opinion of computer system response time The EPS user interface management system System and human system engineering testing of EPS Relation of user perceived response time to error measurement The generation of random binary unordered trees The intersection graph of paths in trees Graph minors IV Widths of trees and well quasi ordering Graph minors A survey'

In [40]:
stoplist

{'a', 'and', 'for', 'in', 'of', 'the', 'to'}

In [53]:
sklX= skl_tfidfvect.fit_transform(documents)

In [58]:
skl_tfidfvect.vocabulary_, dictionary.token2id

({'computer': 0,
  'eps': 1,
  'graph': 2,
  'human': 3,
  'interface': 4,
  'minors': 5,
  'response': 6,
  'survey': 7,
  'system': 8,
  'time': 9,
  'trees': 10,
  'user': 11},
 {'computer': 10,
  'eps': 11,
  'graph': 2,
  'human': 5,
  'interface': 1,
  'minors': 3,
  'response': 8,
  'survey': 6,
  'system': 0,
  'time': 7,
  'trees': 4,
  'user': 9})

In [59]:
sklX.todense()

matrix([[ 0.57735027,  0.        ,  0.        ,  0.57735027,  0.57735027,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ],
        [ 0.42593857,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.42593857,  0.42593857,  0.37034129,  0.42593857,
          0.        ,  0.37034129],
        [ 0.        ,  0.53361154,  0.        ,  0.        ,  0.53361154,
          0.        ,  0.        ,  0.        ,  0.46395983,  0.        ,
          0.        ,  0.46395983],
        [ 0.        ,  0.44614767,  0.        ,  0.44614767,  0.        ,
          0.        ,  0.        ,  0.        ,  0.77582505,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.6023681 ,  0.        ,  0.        ,  0.6023681 ,
          0.        ,  0.52374168],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
      

In [68]:
skl_tfidfvect.idf_, skl_tfidfvect.vocabulary_

(array([ 2.2039728 ,  2.2039728 ,  1.91629073,  2.2039728 ,  2.2039728 ,
         2.2039728 ,  2.2039728 ,  2.2039728 ,  1.91629073,  2.2039728 ,
         1.91629073,  1.91629073]),
 {'computer': 0,
  'eps': 1,
  'graph': 2,
  'human': 3,
  'interface': 4,
  'minors': 5,
  'response': 6,
  'survey': 7,
  'system': 8,
  'time': 9,
  'trees': 10,
  'user': 11})

In [80]:
document_counts = defaultdict(int)
for doc in texts:
    for word in doc:
        document_counts[word]+=1
    
document_counts, len(texts)

(defaultdict(int,
             {'computer': 2,
              'eps': 2,
              'graph': 3,
              'human': 2,
              'interface': 2,
              'minors': 2,
              'response': 2,
              'survey': 2,
              'system': 4,
              'time': 2,
              'trees': 3,
              'user': 3}),
 9)

In [88]:
import numpy as np

[(key,np.log(9/val+1) )for key,val in document_counts.items()]

[('graph', 1.3862943611198906),
 ('trees', 1.3862943611198906),
 ('computer', 1.7047480922384253),
 ('interface', 1.7047480922384253),
 ('human', 1.7047480922384253),
 ('survey', 1.7047480922384253),
 ('time', 1.7047480922384253),
 ('minors', 1.7047480922384253),
 ('response', 1.7047480922384253),
 ('user', 1.3862943611198906),
 ('system', 1.1786549963416462),
 ('eps', 1.7047480922384253)]

In [89]:
0.166*1.7047

0.2829802

In [74]:
[(key,frequency[key[0]]/12) for key in skl_tfidfvect.vocabulary_.items()]

[(('trees', 10), 0.25),
 (('minors', 5), 0.16666666666666666),
 (('human', 3), 0.16666666666666666),
 (('interface', 4), 0.16666666666666666),
 (('response', 6), 0.16666666666666666),
 (('user', 11), 0.25),
 (('system', 8), 0.3333333333333333),
 (('graph', 2), 0.25),
 (('time', 9), 0.16666666666666666),
 (('computer', 0), 0.16666666666666666),
 (('eps', 1), 0.16666666666666666),
 (('survey', 7), 0.16666666666666666)]

In [63]:
tfidf.idfs,

{0: 2.1699250014423126,
 1: 2.1699250014423126,
 2: 2.1699250014423126,
 3: 2.1699250014423126,
 4: 2.1699250014423126,
 5: 1.5849625007211563,
 6: 1.5849625007211563,
 7: 2.1699250014423126,
 8: 2.1699250014423126,
 9: 1.5849625007211563,
 10: 1.5849625007211563,
 11: 2.1699250014423126}

In [65]:
dictionary.token2id

{'computer': 10,
 'eps': 11,
 'graph': 2,
 'human': 5,
 'interface': 1,
 'minors': 3,
 'response': 8,
 'survey': 6,
 'system': 0,
 'time': 7,
 'trees': 4,
 'user': 9}

In [90]:
from sklearn.feature_extraction.text import CountVectorizer


In [91]:
skl_countvect = CountVectorizer(stop_words=stoplist,min_df=2)


In [92]:
skl_CountX = skl_countvect.fit_transform(documents)

In [93]:
skl_CountX.todense()

matrix([[1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1],
        [0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1],
        [0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0]], dtype=int64)

In [94]:
skl_countvect.vocabulary_

{'computer': 0,
 'eps': 1,
 'graph': 2,
 'human': 3,
 'interface': 4,
 'minors': 5,
 'response': 6,
 'survey': 7,
 'system': 8,
 'time': 9,
 'trees': 10,
 'user': 11}

In [95]:
tfidf.dfs

{0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 3, 6: 3, 7: 2, 8: 2, 9: 3, 10: 3, 11: 2}

In [96]:
len(documents)

9

In [None]:
from sklearn.decomposition import 