# Gensim Exploration

### Import all the necessary libraries

In [1]:
import gensim
from pprint import pprint

### Prepare the data

In [2]:
# Toy corpus of five short titles used throughout the notebook.
t_corpus = [
    # Titles about users and response time
    "A survey of user opinion of computer system response time",
    "Relation of user perceived response time to error measurement",
    # Titles about trees and graphs
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
]

pprint(t_corpus)

['A survey of user opinion of computer system response time',
 'Relation of user perceived response time to error measurement',
 'The generation of random binary unordered trees',
 'The intersection graph of paths in trees',
 'Graph minors IV Widths of trees and well quasi ordering']


### Prepare the stop words 

In [3]:
# Take a private, mutable copy of gensim's built-in English stop-word collection.
stoplist = {*gensim.parsing.preprocessing.STOPWORDS}
pprint(stoplist)

{'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'bill',
 'both',
 'bottom',
 'but',
 'by',
 'call',
 'can',
 'cannot',
 'cant',
 'co',
 'computer',
 'con',
 'could',
 'couldnt',
 'cry',
 'de',
 'describe',
 'detail',
 'did',
 'didn',
 'do',
 'does',
 'doesn',
 'doing',
 'don',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eg',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'etc',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifte

## Preprocess the data
Tokenize each document with gensim's `simple_preprocess` function, which converts a document into a list of lowercase tokens, and then remove the stop words from those tokens.

In [4]:
# For every document: tokenize (with accent stripping enabled via deacc=True),
# then keep only the tokens that are not stop words.
processed_corpus = [
    [
        token
        for token in gensim.utils.simple_preprocess(doc, deacc=True)
        if token not in stoplist
    ]
    for doc in t_corpus
]

pprint(processed_corpus)

[['survey', 'user', 'opinion', 'response', 'time'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'quasi', 'ordering']]


## Create the dictionary

Dictionary encapsulates the mapping between normalized words and their integer ids. It is used to determine the vocabulary size, as well as for debugging and topic printing.

In [5]:
# Assign an integer id to every distinct token in the processed corpus.
dictionary = gensim.corpora.Dictionary(documents=processed_corpus)

# Show the token -> id mapping, followed by the dictionary's one-line summary.
pprint(dictionary.token2id)
print(dictionary)

{'binary': 9,
 'error': 5,
 'generation': 10,
 'graph': 14,
 'intersection': 15,
 'iv': 17,
 'measurement': 6,
 'minors': 18,
 'opinion': 0,
 'ordering': 19,
 'paths': 16,
 'perceived': 7,
 'quasi': 20,
 'random': 11,
 'relation': 8,
 'response': 1,
 'survey': 2,
 'time': 3,
 'trees': 12,
 'unordered': 13,
 'user': 4,
 'widths': 21}
Dictionary<22 unique tokens: ['opinion', 'response', 'survey', 'time', 'user']...>


### Create the Bag of Words
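Bag of Words (BoW) is a common way of representing text data. It describes the occurrence of words within a document: each document becomes a list of `(token_id, count)` pairs, and the order of the words is discarded.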

In [6]:
# Convert every tokenized document into its list of (token_id, count) pairs.
BoW_corpus = list(map(dictionary.doc2bow, processed_corpus))
pprint(BoW_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1), (12, 1), (13, 1)],
 [(12, 1), (14, 1), (15, 1), (16, 1)],
 [(12, 1), (14, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)]]


### Create the TF-IDF model
TF-IDF stands for Term Frequency-Inverse Document Frequency. It scores the importance of a word (or "term") in a document: the weight grows with how often the term appears in that document and shrinks with the number of documents in the corpus that contain it.

In [7]:
# Fit the TF-IDF weighting on the bag-of-words corpus.
tfidf = gensim.models.TfidfModel(corpus=BoW_corpus)

In [8]:
# Weight a small ad-hoc document with the fitted TF-IDF model.
words = "trees graph".lower().split()
words_bow = dictionary.doc2bow(words)
print(tfidf[words_bow])

[(12, 0.4869354917707381), (14, 0.8734379353188121)]


### Similarity Queries
Gensim provides a simple interface for performing similarity queries using the model.

In [None]:
# Build a similarity index over the TF-IDF vectors of the whole corpus.
# num_features must be the vocabulary size (the number of token ids in
# `dictionary`), not the number of documents: token ids run up to
# len(dictionary) - 1, so a smaller value cannot hold every term.
index = gensim.similarities.SparseMatrixSimilarity(
    tfidf[BoW_corpus], num_features=len(dictionary)
)

# Tokens that are absent from the dictionary (here "system") are ignored by
# doc2bow, so only "trees" contributes to the query vector.
query_document = 'trees system'.split()
query_bow = dictionary.doc2bow(query_document)

# One similarity score per corpus document, in corpus order.
simils = index[tfidf[query_bow]]
print(list(enumerate(simils)))

In [None]:
# Rank the corpus documents from most to least similar to the query.
ranked = sorted(enumerate(simils), key=lambda pair: pair[1], reverse=True)
for doc_number, score in ranked:
    print(doc_number, score)