In [1]:
def tokenize(s):
    """Turn a raw string into a list of lower-cased tokens.

    The string is lower-cased first and then split on runs of whitespace,
    so leading/trailing whitespace never produces empty tokens.
    """
    lowered = s.lower()
    return lowered.split()

# Toy corpus: nine short document titles, one string per document.
# Every later cell (stopword filtering, dictionary, bag-of-words, tf-idf) is derived from it.
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

## Resources
[amount of training data](https://stackoverflow.com/questions/48059145/how-much-data-is-actually-required-to-train-a-doc2vec-model)

# [Core Concepts](https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html)
## preprocessed corpus: *filtering by stopwords and token frequencies*
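- This preprocessing is deliberately simplistic: it only lower-cases, splits on whitespace, drops a small hand-written stopword list, and removes tokens that occur just once in the whole corpus.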

also see [`simple_preprocess()`](https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess)

In [4]:
from collections import defaultdict

# Function words that are discarded outright.
stoplist = {*'for a of the and to in'.split(' ')}

# Stage 1: lower-case, whitespace-split, and strip stopwords from each document.
texts = [
    [w for w in doc.lower().split() if not (w in stoplist)]
    for doc in text_corpus
]

# Stage 2: corpus-wide occurrence count for every surviving token.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Stage 3: keep only the tokens that occur at least twice across the whole corpus.
clean_corpus = [[tok for tok in toks if frequency[tok] >= 2] for toks in texts]
clean_corpus

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

# gensim dictionary
[Construct word<->id mappings](https://radimrehurek.com/gensim/corpora/dictionary.html)

In [3]:
from gensim import corpora
# Build the token <-> integer-id mapping from the frequency-filtered corpus.
dictionary = corpora.Dictionary(clean_corpus)
# token2id is the token -> id direction of that mapping.
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


# transforming new docs
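A new document has to be tokenized first: `doc2bow` expects a list of tokens, not a raw string.

The result is a sparse list of `(token_id, token_count)` pairs; tokens that are not in the dictionary are left out.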

In [4]:
new_doc = "Human computer interaction"
# doc2bow turns the token list into (token_id, token_count) pairs.
# In the recorded output "interaction" yields no pair because it has no id in `dictionary`.
new_vec = dictionary.doc2bow(tokenize(new_doc))
print(new_vec)

[(0, 1), (1, 1)]


# Vectorize the corpus

In [7]:
# One bag-of-words vector per cleaned document, in corpus order.
bow_corpus = list(dictionary.doc2bow(doc_tokens) for doc_tokens in clean_corpus)
bow_corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]

# Model

"The tf-idf model transforms vectors from the bag-of-words representation to a vector space where the frequency counts are weighted according to the relative rarity of each word in the corpus."

In [6]:
from gensim import models

# Constructing TfidfModel on the bag-of-words corpus is the fitting step.
tfidf = models.TfidfModel(bow_corpus)

sample_words = tokenize("system minors")
# Indexing the fitted model with a bag-of-words vector applies the transformation:
# the result is the same sparse vector with tf-idf weights in place of raw counts.
print(tfidf[dictionary.doc2bow(sample_words)])

[(5, 0.5898341626740045), (11, 0.8075244024440723)]


# [Corpora and Vector Spaces](https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html)

In [None]:
from gensim import corpora
# NOTE(review): this rebinds `dictionary`, which an earlier cell built from `clean_corpus`.
# Here it is built from `texts` (stopwords removed, single-occurrence tokens still present),
# so its token ids may no longer match the ones behind `bow_corpus` and `tfidf`.
# Confirm this is intended, or use a distinct name.
dictionary = corpora.Dictionary(texts)
