# Gensim Tutorial

In [1]:
from gensim import corpora

# Declare the documents

In [2]:
# Toy collection of nine short titles: the first five concern
# human-computer interaction, the last four concern graphs and trees.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Remove Stop words

In [3]:
# Words that are filtered out of every document before further processing.
stoplist = set("for a the in at of and to".split())

# Tokenise each document: lower-case it, split on whitespace and keep
# only the words that are not in the stop list.
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept_words)

# Remove words which are infrequent

In [4]:
from collections import defaultdict
from pprint import pprint

# Count how many times each token occurs across the whole collection.
frequency = defaultdict(int)
for tokens in texts:
    for word in tokens:
        frequency[word] += 1

# Keep only the tokens that occur more than once overall; the order of
# documents and of the tokens inside each document is preserved.
frequent_texts = []
for tokens in texts:
    frequent_texts.append([word for word in tokens if frequency[word] > 1])
texts = frequent_texts

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


# Bag of words 

In this model, each document is represented by a single vector in which each element answers a question such as:

"How many times does the word 'system' appear in the document?"

Each question is identified by an integer id, and the mapping from questions (words) to ids is called a dictionary.


In [5]:
# Assign an integer id to every distinct token found in the filtered texts.
dictionary = corpora.Dictionary(texts)

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(dictionary)
# token2id is the token -> id mapping held by the dictionary.
print(dictionary.token2id)

Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)
{u'minors': 11, u'graph': 10, u'system': 6, u'trees': 9, u'eps': 8, u'computer': 1, u'survey': 5, u'user': 7, u'human': 2, u'time': 4, u'interface': 0, u'response': 3}


# New document

We can use the dictionary above to build a vector for a new document, as in the example below.


In [6]:
# A document that was not part of the original collection.
new_doc = "Human computer interaction"

# doc2bow takes the tokenised document and returns (token_id, count) pairs.
new_vector = dictionary.doc2bow(new_doc.lower().split())

# print() with a single argument behaves the same on Python 2 and Python 3.
print(new_vector)

[(1, 1), (2, 1)]


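The vector above means that, in the new document, the words with id=1 (computer) and id=2 (human) each appear once. The word "interaction" is not in the dictionary, so it does not appear in the vector.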

# Corpus

A corpus is a collection of documents. How do we create a collection of document vectors?
See the following example:

In [7]:
# Turn every tokenised document into its bag-of-words vector.
corpus = []
for tokens in texts:
    corpus.append(dictionary.doc2bow(tokens))

# Write the corpus to disk through MmCorpus so it can be reloaded later.
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
pprint(corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(0, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (6, 2), (8, 1)],
 [(3, 1), (4, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(5, 1), (10, 1), (11, 1)]]


Thus, the word with id=10 (graph) appears 0 times in the first 6 documents and once in each of the remaining 3 documents. This is the bag-of-words model.

# Corpus Streaming 



In [8]:
class MyCorpus(object):
    """Corpus that streams bag-of-words vectors from 'mycorpus.txt'.

    The file is read one line at a time, so the whole collection never
    has to be held in memory. Each line is treated as one document.
    Vectors are built with the module-level ``dictionary``.
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow(...)`` for every line of the file."""
        # The context manager closes the file once iteration finishes (or
        # the generator is closed early) instead of leaving that to the
        # garbage collector.
        with open('mycorpus.txt') as corpus_file:
            for line in corpus_file:
                # One document per line; tokens are separated by whitespace.
                yield dictionary.doc2bow(line.lower().split())

In [9]:
# Creating the corpus object does not read the file; documents are only
# produced when the object is iterated.
corpus_memory_friendly = MyCorpus()

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x10943a850>


In [10]:
# Iterating the corpus yields one bag-of-words vector per document.
for vector in corpus_memory_friendly:
    # print() with a single argument behaves the same on Python 2 and 3.
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(0, 1), (6, 1), (7, 1), (8, 1)]
[(2, 1), (6, 2), (8, 1)]
[(3, 1), (4, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(5, 1), (10, 1), (11, 1)]


Similarly, we can construct the dictionary without loading all of the text into memory:

In [11]:
from six import iteritems

# Build the dictionary by streaming the file one line (document) at a time.
# The context manager closes the file as soon as the dictionary has consumed
# every line, instead of leaving the handle open until garbage collection.
with open('mycorpus.txt') as corpus_file:
    dictionary = corpora.Dictionary(line.lower().split() for line in corpus_file)

# Ids of the stop words that actually occur in the dictionary.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]

# Ids of tokens whose document frequency is 1, i.e. that occur in only
# one document.
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]

# Drop both groups, then renumber the remaining ids so there are no gaps.
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
pprint(dictionary.token2id)

{u'computer': 5,
 u'eps': 4,
 u'graph': 1,
 u'human': 8,
 u'interface': 10,
 u'minors': 0,
 u'response': 11,
 u'survey': 6,
 u'system': 2,
 u'time': 9,
 u'trees': 3,
 u'user': 7}
