In [1]:
#Imports
import pickle
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Binarizer
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

In [2]:
#loading the stemmed data with pickle
stem_data = pickle.load( open( "stemmed_data.p", "rb" ) )

In [3]:
#Printing first entry for inspection
stem_data[0]

'car wonder enlighten car saw dai door sport car look late earli call bricklin door small addit bumper separ rest bodi know tellm model engin spec year product car histori info funki look car mail thank'

## Bag-of-words (sklearn)

In [4]:
def output_vec(vectorizer, corpus):
    """ Takes the data, and the vectorizer as arguments.
        Fits the vectorizer to the data and transforms it into a DF"""
    
    #fits vectors to the provided corpus
    fitted_vec = vectorizer.fit_transform(corpus)
    
    #gets feature names
    features = vectorizer.get_feature_names()

    
    #creates an index for the documents in each row of the DF
    doc_nums = ['Doc{:d}'.format(idx) for idx, _ in enumerate(corpus)]
    
    #creates DF
    my_df = pd.DataFrame(data=fitted_vec.toarray(), index=doc_nums, columns=features)
    
    
    return my_df

In [5]:
#Absolute Frequencies Vectorizer
vec_abs = CountVectorizer(max_df=0.95, min_df=0.05)

#Relative Frequencies Vectorizer
vec_rel = TfidfVectorizer(max_df=0.95, min_df=0.05,use_idf=False, norm='l1')

#TF-IDF Vectorizer
vec_tfidf = TfidfVectorizer(max_df=0.95, min_df=0.05, smooth_idf=False)

### a. Vectorization
#### 1. Absolute frequency

   - This method creates a bag of words, which counts the absolute frequency of each word in the doc
        - **Advantage:** considers word importance, is interpretable
        - **Disadvantage:** hard to compare documents with different lengths

In [6]:
output_vec(vec_abs, stem_data)

Unnamed: 0,abl,accept,actual,address,advanc,ago,agre,allow,american,answer,...,won,wonder,word,work,world,write,wrong,wrote,ye,year
Doc0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
Doc1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Doc2,0,0,1,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
Doc3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
Doc4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,2,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc11309,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Doc11310,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Doc11311,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Doc11312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0


#### 2. Relative frequency
   - This method divides the absolute frequency by the length of the document
        - **Advantage:** considers document length, is interpretable
        - **Disadvantage:** words that are common among all documents are overweighed

In [7]:
output_vec(vec_rel, stem_data)

Unnamed: 0,abl,accept,actual,address,advanc,ago,agre,allow,american,answer,...,won,wonder,word,work,world,write,wrong,wrote,ye,year
Doc0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.058824,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.058824
Doc1,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.062500,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
Doc2,0.000000,0.0,0.016949,0.000000,0.016949,0.0,0.0,0.0,0.0,0.016949,...,0.0,0.016949,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
Doc3,0.000000,0.0,0.000000,0.041667,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.041667,0.00000,0.041667,0.000000,0.000000
Doc4,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.038462,0.0,0.000000,0.076923,0.038462,0.00000,0.000000,0.038462,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc11309,0.030303,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.030303
Doc11310,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.071429,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
Doc11311,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.076923,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
Doc11312,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.031250,0.03125,0.000000,0.000000,0.000000


#### 3. TF-IDF frequency
   - This method considers the occurence among all documents
        - **Advantage:** considers the occurence in al documents 
             - artificially increases importance of rare words
             - artificially decreases importance of very frequent words
        - **Disadvantage:** Difficult interpretation

In [8]:
output_vec(vec_tfidf, stem_data)

Unnamed: 0,abl,accept,actual,address,advanc,ago,agre,allow,american,answer,...,won,wonder,word,work,world,write,wrong,wrote,ye,year
Doc0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.169879,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.121627
Doc1,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.219800,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Doc2,0.000000,0.0,0.099501,0.000000,0.115139,0.0,0.0,0.0,0.0,0.110203,...,0.0,0.112887,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Doc3,0.000000,0.0,0.000000,0.234446,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.098005,0.000000,0.219991,0.000000,0.000000
Doc4,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.207745,0.0,0.000000,0.373991,0.089076,0.000000,0.000000,0.202237,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc11309,0.180338,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.134922
Doc11310,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.209417,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Doc11311,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.190660,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Doc11312,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.057656,0.128766,0.000000,0.000000,0.000000


#### Changing max_df and min_df
- Increasing the range: increases the amount of features
    - Decreasing the minimum frequency that a word need to have to be kept
    - Increasing the maximum frequency that a word can have to stat
    
- Decreasing the range: decreases the amount of features
    - Increasing the minimum frequency required for a word to be kept
    - Decreasing the maximum frequency a word can have to stay

In [9]:
#Vecotrizing: Increasing the range
vec_inc = CountVectorizer(max_df=0.99, min_df=0.02)

#Vectorizing: Decreasing the range
vec_dec = CountVectorizer(max_df=0.80, min_df=0.1)

In [11]:
increased = output_vec(vec_inc, stem_data)

decreased = output_vec(vec_dec, stem_data)

In [14]:
increased.shape # number of features increased to 821

(11314, 821)

In [15]:
decreased.shape #number of features decreased to 58

(11314, 58)

### b. Binarizer():  One-hot encoding
   - This method is a binary representation of whether the word is in the document or not
        - **Advantage:** it's easy to calculate and interpret
        - **Disadvantage:** Does not take into account importance of words



In [21]:
#absolute frequencies
absolute = output_vec(vec_abs, stem_data)

#One-hot encoding on the absolute frequencies
binarized = Binarizer().fit_transform(absolute)



Since Binarizer() doesn't have get_feature_names(), I can't use my function output_vec. So I took parts of it to modify them separately and still get a nice dataframe for inspection

In [23]:
#creates an index for the documents in each row of the DF
doc_nums = ['Doc{:d}'.format(idx) for idx, _ in enumerate(stem_data)]

#feature names according to Absolute frequencies vectorizer
features = vec_abs.get_feature_names()

#creates DF
my_df = pd.DataFrame(data=binarized, index=doc_nums, columns=features)

my_df.head()

Unnamed: 0,abl,accept,actual,address,advanc,ago,agre,allow,american,answer,...,won,wonder,word,work,world,write,wrong,wrote,ye,year
Doc0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
Doc1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Doc2,0,0,1,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
Doc3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
Doc4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,0,1,0


### c. 1st doc frequencies DataFrame

In [35]:
def fitted_doc0(vectorizer):
    """ Fits vectors to stem_data, transfors the output in an array, and selects only the first document
        Will help in the visualization"""
    fitted_vec = vectorizer.fit_transform(stem_data)
    value = fitted_vec.toarray()[0]
    return value

In [39]:
# Getting the values that will the used in the DF columns
values_abs = fitted_doc0(vec_abs)
values_rel = fitted_doc0(vec_rel)
values_tfidf = fitted_doc0(vec_tfidf)

In [156]:
# Used a dictionary to easily transform all the values to a DF indexed by the feature names
df_doc = pd.DataFrame({"Tokens":features,
                       "Absolute":values_abs, 
                        "Relative":values_rel, 
                        "TF-IDF": values_tfidf, 
                        "One-hot": binarized[0]}).set_index(keys="Tokens")


#Filtered out the words that do not appear in the first document by maintaining only those which have 1 in Binarizer
df_doc0 = df_doc[df_doc["One-hot"]==1]

In [157]:
df_doc0

Unnamed: 0_level_0,Absolute,Relative,TF-IDF,One-hot
Tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
call,1,0.058824,0.149044,1
car,5,0.294118,0.850999,1
dai,1,0.058824,0.13996,1
engin,1,0.058824,0.171575,1
info,1,0.058824,0.173199,1
know,1,0.058824,0.097303,1
look,2,0.117647,0.236634,1
mail,1,0.058824,0.139112,1
small,1,0.058824,0.176711,1
thank,1,0.058824,0.121935,1


### d. Sorting the df_doc1 DataFrame by frequencies
By sorting by different frequencies, one can see that the orders of the features changes. Te words small and info lost importance, for instance. Because TF-IDF artificially decreases importance of features that appear in many documents and increase importance of those that are rarer, and thus differentiate the documents from each other. 


In [54]:
# i. by TF-IDF Frequencies
df_doc0.sort_values(by = ["TF-IDF"], ascending=False)

Unnamed: 0,Absolute,Relative,TF-IDF,One-hot
car,5,0.294118,0.850999,1
look,2,0.117647,0.236634,1
small,1,0.058824,0.176711,1
info,1,0.058824,0.173199,1
engin,1,0.058824,0.171575,1
wonder,1,0.058824,0.169879,1
call,1,0.058824,0.149044,1
dai,1,0.058824,0.13996,1
mail,1,0.058824,0.139112,1
thank,1,0.058824,0.121935,1


In [55]:
# ii. by Absolute Frequencies
df_doc0.sort_values(by = ["Absolute"], ascending=False)

Unnamed: 0,Absolute,Relative,TF-IDF,One-hot
car,5,0.294118,0.850999,1
look,2,0.117647,0.236634,1
call,1,0.058824,0.149044,1
dai,1,0.058824,0.13996,1
engin,1,0.058824,0.171575,1
info,1,0.058824,0.173199,1
know,1,0.058824,0.097303,1
mail,1,0.058824,0.139112,1
small,1,0.058824,0.176711,1
thank,1,0.058824,0.121935,1


## Bag-of-words (gensim)

### a. Corpus

Creation of curpus with gensim. This package creates a list of list with the features in each document

In [63]:
#Creation of corpus with gensim
corpus_gen = [doc.split() for doc in stem_data]
corpus_gen

[['car',
  'wonder',
  'enlighten',
  'car',
  'saw',
  'dai',
  'door',
  'sport',
  'car',
  'look',
  'late',
  'earli',
  'call',
  'bricklin',
  'door',
  'small',
  'addit',
  'bumper',
  'separ',
  'rest',
  'bodi',
  'know',
  'tellm',
  'model',
  'engin',
  'spec',
  'year',
  'product',
  'car',
  'histori',
  'info',
  'funki',
  'look',
  'car',
  'mail',
  'thank'],
 ['clock',
  'poll',
  'final',
  'final',
  'clock',
  'report',
  'acceler',
  'clock',
  'upgrad',
  'fair',
  'number',
  'brave',
  'soul',
  'upgrad',
  'clock',
  'oscil',
  'share',
  'experi',
  'poll',
  'send',
  'brief',
  'messag',
  'detail',
  'experi',
  'procedur',
  'speed',
  'attain',
  'cpu',
  'rate',
  'speed',
  'add',
  'card',
  'adapt',
  'heat',
  'sink',
  'hour',
  'usag',
  'dai',
  'floppi',
  'disk',
  'function',
  'floppi',
  'especi',
  'request',
  'summar',
  'dai',
  'add',
  'network',
  'knowledg',
  'base',
  'clock',
  'upgrad',
  'haven',
  'answer',
  'poll',
  'tha

### b. Dictionary()

In [73]:
#Creating a Dictionary out of the corpus
id2word = Dictionary(corpus_gen)

print(id2word)

Dictionary(72291 unique tokens: ['addit', 'bodi', 'bricklin', 'bumper', 'call']...)


In [76]:
#Filtering out words that appear less than 566 times and those which appear in more than 95% of the documents
id2word.filter_extremes(no_below=566, no_above=0.95)
print(id2word)

Dictionary(241 unique tokens: ['call', 'car', 'dai', 'engin', 'info']...)


### c. Dictionary operations

In [77]:
#i. Prints the tokens as key-value pairs with their respective ids
print(id2word.token2id)

{'call': 0, 'car': 1, 'dai': 2, 'engin': 3, 'info': 4, 'know': 5, 'look': 6, 'mail': 7, 'small': 8, 'thank': 9, 'wonder': 10, 'year': 11, 'answer': 12, 'base': 13, 'card': 14, 'edu': 15, 'experi': 16, 'final': 17, 'gui': 18, 'messag': 19, 'number': 20, 'report': 21, 'send': 22, 'actual': 23, 'advanc': 24, 'anybodi': 25, 'better': 26, 'bit': 27, 'email': 28, 'expect': 29, 'feel': 30, 'good': 31, 'got': 32, 'great': 33, 'heard': 34, 'help': 35, 'life': 36, 'like': 37, 'line': 38, 'machin': 39, 'mayb': 40, 'new': 41, 'opinion': 42, 'peopl': 43, 'plai': 44, 'post': 45, 'price': 46, 'probabl': 47, 'question': 48, 'read': 49, 'real': 50, 'recent': 51, 'start': 52, 'take': 53, 'time': 54, 'us': 55, 'wai': 56, 'address': 57, 'articl': 58, 'chip': 59, 'com': 60, 'far': 61, 'inform': 62, 'person': 63, 'phone': 64, 'point': 65, 'pretti': 66, 'requir': 67, 'stuff': 68, 'system': 69, 'thing': 70, 'write': 71, 'wrote': 72, 'check': 73, 'mean': 74, 'possibl': 75, 'right': 76, 'set': 77, 'softwar': 78

In [78]:
# ii. Prints only the tokens - feature names
print(id2word.token2id.keys())

dict_keys(['call', 'car', 'dai', 'engin', 'info', 'know', 'look', 'mail', 'small', 'thank', 'wonder', 'year', 'answer', 'base', 'card', 'edu', 'experi', 'final', 'gui', 'messag', 'number', 'report', 'send', 'actual', 'advanc', 'anybodi', 'better', 'bit', 'email', 'expect', 'feel', 'good', 'got', 'great', 'heard', 'help', 'life', 'like', 'line', 'machin', 'mayb', 'new', 'opinion', 'peopl', 'plai', 'post', 'price', 'probabl', 'question', 'read', 'real', 'recent', 'start', 'take', 'time', 'us', 'wai', 'address', 'articl', 'chip', 'com', 'far', 'inform', 'person', 'phone', 'point', 'pretti', 'requir', 'stuff', 'system', 'thing', 'write', 'wrote', 'check', 'mean', 'possibl', 'right', 'set', 'softwar', 'tell', 'understand', 'world', 'ye', 'agre', 'allow', 'apr', 'believ', 'come', 'consid', 'control', 'cost', 'cours', 'exist', 'follow', 'given', 'govern', 'hand', 'hard', 'hope', 'idea', 'john', 'kill', 'make', 'need', 'non', 'power', 'reason', 'result', 'sai', 'second', 'state', 'support', 't

In [79]:
#iii. Prints the token ids and how many documents contain this token
print(id2word.dfs)

{1: 697, 10: 702, 2: 1366, 6: 2211, 0: 1116, 8: 603, 5: 3529, 3: 676, 11: 2054, 4: 652, 7: 1392, 9: 2040, 17: 577, 21: 601, 20: 1177, 16: 628, 22: 743, 19: 821, 14: 703, 13: 898, 12: 768, 18: 595, 15: 5742, 48: 1776, 52: 1231, 36: 730, 56: 2219, 41: 2934, 39: 611, 27: 1003, 40: 809, 25: 568, 29: 582, 34: 727, 46: 615, 38: 992, 37: 3854, 51: 630, 47: 1117, 32: 1224, 30: 707, 26: 1217, 33: 965, 31: 2338, 42: 1062, 43: 2554, 55: 2517, 53: 639, 50: 930, 44: 772, 23: 1099, 35: 1627, 24: 651, 28: 777, 45: 1803, 49: 1496, 54: 2828, 72: 809, 71: 6082, 58: 4991, 59: 583, 61: 959, 68: 572, 66: 654, 67: 731, 65: 1571, 57: 637, 64: 716, 62: 1112, 60: 3777, 69: 603, 70: 2072, 63: 1247, 81: 1024, 82: 776, 74: 1486, 80: 730, 78: 828, 73: 652, 76: 1856, 77: 996, 75: 1206, 79: 1213, 109: 869, 99: 955, 85: 2843, 100: 746, 105: 966, 102: 1109, 90: 588, 103: 2183, 89: 927, 95: 847, 107: 686, 104: 749, 92: 831, 110: 1359, 87: 1671, 86: 1434, 97: 849, 111: 968, 83: 662, 96: 787, 98: 771, 91: 1026, 108: 1289

In [80]:
# iv. Convert document into the bag-of-words (BoW) format = list of (token_id, token_count) tuples.
corpus1=[id2word.doc2bow(doc) for doc in corpus_gen]
corpus1

[[(0, 1),
  (1, 5),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 2),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1)],
 [(2, 2),
  (9, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 2),
  (17, 2),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1)],
 [(2, 2),
  (3, 1),
  (4, 2),
  (5, 1),
  (6, 2),
  (9, 1),
  (10, 1),
  (12, 1),
  (15, 1),
  (17, 2),
  (23, 1),
  (24, 1),
  (25, 3),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 3),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 3),
  (40, 1),
  (41, 2),
  (42, 2),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 3),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 2),
  (56, 1)],
 [(5, 1),
  (6, 1),
  (15, 2),
  (20, 1),
  (32, 1),
  (37, 1),
  (57, 1),
  (58, 1),
  (59, 2),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1)],
 [(10, 1),
  (1

In [81]:
# v. Applies relative frequencies to corpus 1
corpus2=[[(token[0],(token[1]/sum(n for _, n in doc))) for token in doc] for doc in corpus1]
corpus2

[[(0, 0.058823529411764705),
  (1, 0.29411764705882354),
  (2, 0.058823529411764705),
  (3, 0.058823529411764705),
  (4, 0.058823529411764705),
  (5, 0.058823529411764705),
  (6, 0.11764705882352941),
  (7, 0.058823529411764705),
  (8, 0.058823529411764705),
  (9, 0.058823529411764705),
  (10, 0.058823529411764705),
  (11, 0.058823529411764705)],
 [(2, 0.125),
  (9, 0.0625),
  (12, 0.0625),
  (13, 0.0625),
  (14, 0.0625),
  (15, 0.0625),
  (16, 0.125),
  (17, 0.125),
  (18, 0.0625),
  (19, 0.0625),
  (20, 0.0625),
  (21, 0.0625),
  (22, 0.0625)],
 [(2, 0.03389830508474576),
  (3, 0.01694915254237288),
  (4, 0.03389830508474576),
  (5, 0.01694915254237288),
  (6, 0.03389830508474576),
  (9, 0.01694915254237288),
  (10, 0.01694915254237288),
  (12, 0.01694915254237288),
  (15, 0.01694915254237288),
  (17, 0.03389830508474576),
  (23, 0.01694915254237288),
  (24, 0.01694915254237288),
  (25, 0.05084745762711865),
  (26, 0.01694915254237288),
  (27, 0.01694915254237288),
  (28, 0.016949152

In [82]:
# vi. One-hot encoding
corpus3=[[(token[0],1) for token in doc] for doc in corpus1]
corpus3

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1)],
 [(2, 1),
  (9, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1)],
 [(2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (9, 1),
  (10, 1),
  (12, 1),
  (15, 1),
  (17, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1)],
 [(5, 1),
  (6, 1),
  (15, 1),
  (20, 1),
  (32, 1),
  (37, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1)],
 [(10, 1),
  (1

In [83]:
# TF-IDF frequencies
tfidf=TfidfModel(dictionary=id2word, normalize=True)
corpus4=[tfidf[id2word.doc2bow(doc)] for doc in corpus_gen]
corpus4

[[(0, 0.14401571742494348),
  (1, 0.8664142855629612),
  (2, 0.13144788445203487),
  (3, 0.17518494060358739),
  (4, 0.17743248047131693),
  (5, 0.07243570690519817),
  (6, 0.20301385417692416),
  (7, 0.1302755840836711),
  (8, 0.18229006237583106),
  (9, 0.10651172292550919),
  (10, 0.1728384294443815),
  (11, 0.10608648808537256)],
 [(2, 0.3473077099950688),
  (9, 0.14071106100755207),
  (12, 0.22095361016902754),
  (13, 0.208108734014589),
  (14, 0.22821736806951118),
  (15, 0.055709197254459546),
  (16, 0.47496797260691126),
  (17, 0.4888819220105042),
  (18, 0.2419177311111727),
  (19, 0.21547221793927493),
  (20, 0.1858857791905979),
  (21, 0.24109359016572873),
  (22, 0.223671882443836)],
 [(2, 0.17729530237829488),
  (3, 0.1181436549774607),
  (4, 0.23931876429986967),
  (5, 0.04885019873951971),
  (6, 0.13691130448133904),
  (9, 0.07183085601427869),
  (10, 0.11656118216993042),
  (12, 0.11279345663547127),
  (15, 0.02843869769727173),
  (17, 0.24956678385102643),
  (23, 0.097

### d. Frequencies DataFrame

In [123]:
def get_frequencies(corpus, pos=1):
    """Gets the frequencies from the gensim dictionary. 
    Returns a list with either the frequencies(pos=1) or the token_ids (pos=0)"""
    freq = []
    for i in range(len(corpus[0])):
        freq.append(corpus[0][i][pos])
    return freq

In [146]:
#DF columns: frequencies and token info
abs_freq = get_frequencies(corpus1)
rel_freq = get_frequencies(corpus2)
bin_freq = get_frequencies(corpus3)
tfidf_freq = get_frequencies(corpus4)
token_ids = get_frequencies(corpus1, pos=0)
token_values = [id2word.id2token[i] for i in range(12)]

In [154]:
#DataFrame with all frequencies
gen_df = pd.DataFrame(data={"Tokens": token_values,
                   "Token_id":token_ids,
                   "Absolute":abs_freq,
                   "Relative":rel_freq,
                   "TF-IDF": tfidf_freq,
                   "One-hot":bin_freq}).set_index(keys="Tokens")
gen_df

Unnamed: 0_level_0,Token_id,Absolute,Relative,TF-IDF,One-hot
Tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
call,0,1,0.058824,0.144016,1
car,1,5,0.294118,0.866414,1
dai,2,1,0.058824,0.131448,1
engin,3,1,0.058824,0.175185,1
info,4,1,0.058824,0.177432,1
know,5,1,0.058824,0.072436,1
look,6,2,0.117647,0.203014,1
mail,7,1,0.058824,0.130276,1
small,8,1,0.058824,0.18229,1
thank,9,1,0.058824,0.106512,1


In [158]:
#Only difference lies in the TF-IDF. Sklearn uses natural logarithm, whereas gensim uses logarithm base 2
gen_df.merge(df_doc0, on="Tokens")

Unnamed: 0_level_0,Token_id,Absolute_x,Relative_x,TF-IDF_x,One-hot_x,Absolute_y,Relative_y,TF-IDF_y,One-hot_y
Tokens,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
call,0,1,0.058824,0.144016,1,1,0.058824,0.149044,1
car,1,5,0.294118,0.866414,1,5,0.294118,0.850999,1
dai,2,1,0.058824,0.131448,1,1,0.058824,0.13996,1
engin,3,1,0.058824,0.175185,1,1,0.058824,0.171575,1
info,4,1,0.058824,0.177432,1,1,0.058824,0.173199,1
know,5,1,0.058824,0.072436,1,1,0.058824,0.097303,1
look,6,2,0.117647,0.203014,1,2,0.117647,0.236634,1
mail,7,1,0.058824,0.130276,1,1,0.058824,0.139112,1
small,8,1,0.058824,0.18229,1,1,0.058824,0.176711,1
thank,9,1,0.058824,0.106512,1,1,0.058824,0.121935,1


## Ngrams

In [160]:
#Only 3 features! Because there are not many sequences of words appearing often in the documents
n_vec = CountVectorizer(ngram_range=(2, 2), max_df=0.95, min_df=0.05)
n_df = output_vec(n_vec,stem_data)
n_df

Unnamed: 0,articl apr,edu write,write articl
Doc0,0,0,0
Doc1,0,0,0
Doc2,0,0,0
Doc3,0,1,1
Doc4,0,0,0
...,...,...,...
Doc11309,0,0,0
Doc11310,0,0,0
Doc11311,0,0,0
Doc11312,0,0,0


In [163]:
#In over 11 thousad document, the max repetition is 4
n_df.describe()

Unnamed: 0,articl apr,edu write,write articl
count,11314.0,11314.0,11314.0
mean,0.206735,0.05312,0.125243
std,0.470038,0.230119,0.400602
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,4.0,2.0,4.0
