# Exercise 3. Text Representation Part 1

In this exercise we will derive the document representation of the preprocessed (stemmed) newsgroups dataset. We will use sklearn as well as gensim to derive the bag-of-words document representation. These two packages are the standard packages for deriving the bag-of-words document representation.

We will calculate the following representations for each package:

* Absolute frequencies
* Relative frequencies
* TF-IDF frequencies

Finally, we will derive N-grams for the dataset.

In [None]:
# Import packages
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Binarizer
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
import math

In [None]:
# Mount Google Drive under /content/drive (this import is only available on Google Colab)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import dataset
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
# The context manager closes the file handle as soon as the data has been read.
with open("/content/drive/MyDrive/TWSM_Data/Stemmed.pkl", "rb") as stem_file:
    data_stem = pickle.load(stem_file)

# Show the first (stemmed) document
print(data_stem[0])
print(' ')

car wonder enlighten car saw dai door sport car look late earli call bricklin door small addit bumper separ rest bodi know tellm model engin spec year product car histori info funki look car mail thank
 


In [None]:
# Number of documents in the stemmed dataset
len(data_stem)

11314

# 1. Bag-of-words with sklearn

Apply the sklearn transformations, removing all words that appear in fewer than min_df of the documents or in more than max_df of the documents.

In [None]:
# Document-frequency bounds shared by all text vectorizers
df_bounds = {'max_df': 0.95, 'min_df': 0.05}

# Set transformers
vec_abs = CountVectorizer(**df_bounds)  # Absolute frequency
vec_bin = Binarizer()  # One-hot, derived from the absolute counts below
vec_rel = TfidfVectorizer(use_idf=False, norm='l1', **df_bounds)  # Relative frequency
vec_tf = TfidfVectorizer(smooth_idf=False, **df_bounds)  # Tf-IDF frequency

# Transform stemmed data
corpus_sk_abs = vec_abs.fit_transform(data_stem)
corpus_sk_bin = vec_bin.fit_transform(corpus_sk_abs)
corpus_sk_rel = vec_rel.fit_transform(data_stem)
corpus_sk_tf = vec_tf.fit_transform(data_stem)

Show words in dictionary

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
# It returns a NumPy array, so convert to a plain list to keep the printed form readable.
features_tf = vec_tf.get_feature_names_out().tolist()
print('The number of features is: ',len(features_tf))
print('')
# All three vectorizers were fitted with the same document-frequency bounds
print(features_tf)
print(vec_abs.get_feature_names_out().tolist())
print(vec_rel.get_feature_names_out().tolist())

The number of features is:  240

['abl', 'accept', 'actual', 'address', 'advanc', 'ago', 'agre', 'allow', 'american', 'answer', 'anybodi', 'appreci', 'apr', 'area', 'articl', 'ask', 'assum', 'avail', 'awai', 'bad', 'base', 'believ', 'best', 'better', 'big', 'bit', 'book', 'bui', 'call', 'car', 'card', 'care', 'case', 'caus', 'chang', 'check', 'chip', 'christian', 'claim', 'close', 'com', 'come', 'complet', 'consid', 'control', 'correct', 'cost', 'cours', 'current', 'dai', 'data', 'david', 'deal', 'design', 'differ', 'discuss', 'drive', 'edu', 'effect', 'email', 'end', 'engin', 'exampl', 'exist', 'expect', 'experi', 'fact', 'far', 'fax', 'feel', 'file', 'final', 'follow', 'forc', 'free', 'game', 'gener', 'get', 'given', 'go', 'god', 'good', 'got', 'govern', 'great', 'group', 'guess', 'gui', 'hand', 'happen', 'hard', 'have', 'heard', 'help', 'high', 'home', 'hope', 'human', 'idea', 'import', 'includ', 'info', 'inform', 'interest', 'internet', 'isn', 'issu', 'john', 'kei', 'kill', 'kind',

In [None]:
# Inspect the container type returned by fit_transform
type(corpus_sk_rel)

scipy.sparse.csr.csr_matrix

In [None]:
# Binary (presence/absence) vector of the first document; only that row is made dense
first_row_bin = corpus_sk_bin[0].toarray().ravel()
first_row_bin

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])

In [None]:
# Put the frequencies of the first text in the corpus in a data frame.
# Only the first row of each sparse matrix is converted to a dense vector.
first_doc = 0
df_1 = pd.DataFrame({
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
    'keys': vec_tf.get_feature_names_out(),
    'bin': corpus_sk_bin[first_doc].toarray().ravel(),
    'abs': corpus_sk_abs[first_doc].toarray().ravel(),
    'rel': corpus_sk_rel[first_doc].toarray().ravel(),
    'tf': corpus_sk_tf[first_doc].toarray().ravel(),
})

# Show only those with existing words
df_1 = df_1[df_1.bin > 0]
print(df_1)
df_1.shape

       keys  bin  abs       rel        tf
28     call    1    1  0.058824  0.149013
29      car    1    5  0.294118  0.850820
49      dai    1    1  0.058824  0.140095
61    engin    1    1  0.058824  0.171939
101    info    1    1  0.058824  0.173232
111    know    1    1  0.058824  0.097346
124    look    1    2  0.117647  0.236706
128    mail    1    1  0.058824  0.139212
191   small    1    1  0.058824  0.176748
208   thank    1    1  0.058824  0.121997
231  wonder    1    1  0.058824  0.169843
239    year    1    1  0.058824  0.121843


(12, 5)

In [None]:
# Document frequency of "car": the number of documents that contain the word at least once.
# Summing its column of the binary matrix counts each document once, not every occurrence.
car_idx = vec_abs.vocabulary_['car']  # look the column up instead of hard-coding 29
corpus_sk_bin[:, car_idx].sum()

697

In [None]:
# idf of car computed by hand as ln(N / df) + 1
import numpy as np
n_docs, df_car = 11314, 697  # corpus size and number of documents containing "car"
np.log(n_docs / df_car) + 1

3.7870107651425773

In [None]:
# model output: the idf stored by the fitted TfidfVectorizer for "car"
car_idx = vec_tf.vocabulary_['car']  # look the index up instead of hard-coding 29
vec_tf.idf_[car_idx]

3.7870107651425773

In [None]:
# tfidf weights are normalised: the squared weights of a document sum to 1
first_tf = corpus_sk_tf[0].toarray().ravel()
sum(first_tf ** 2)

0.9999999999999997

In [None]:
# L2 norm of the unnormalised tf-idf vector (count * idf) of the first document
# NOTE(review): assumes vec_abs and vec_tf share the same column order — both were fitted
# on the same data with the same bounds
raw_tfidf = corpus_sk_abs[0].toarray().ravel() * vec_tf.idf_
nor = math.sqrt(sum(raw_tfidf ** 2))

In [None]:
# Display the normalisation constant
nor

22.255063672543734

In [None]:
# Manual tf-idf weight of "car" in the first document: count * idf / L2 norm.
# Column indices are looked up in each vectorizer's vocabulary instead of hard-coding 29.
car_count = corpus_sk_abs[0, vec_abs.vocabulary_['car']]
car_idf = vec_tf.idf_[vec_tf.vocabulary_['car']]
car_count * car_idf / nor

0.8508200247961197

In [None]:
# Ten words of the first document with the highest tf-idf weight
top_by_tf = df_1.sort_values(by='tf', ascending=False)
print(top_by_tf.head(10))

       keys  bin  abs       rel        tf
29      car    1    5  0.294118  0.850820
124    look    1    2  0.117647  0.236706
191   small    1    1  0.058824  0.176748
101    info    1    1  0.058824  0.173232
61    engin    1    1  0.058824  0.171939
231  wonder    1    1  0.058824  0.169843
28     call    1    1  0.058824  0.149013
49      dai    1    1  0.058824  0.140095
128    mail    1    1  0.058824  0.139212
208   thank    1    1  0.058824  0.121997


In [None]:
# Ten words of the first document with the highest absolute frequency
top_by_abs = df_1.sort_values(by='abs', ascending=False)
print(top_by_abs.head(10))

      keys  bin  abs       rel        tf
29     car    1    5  0.294118  0.850820
124   look    1    2  0.117647  0.236706
28    call    1    1  0.058824  0.149013
49     dai    1    1  0.058824  0.140095
61   engin    1    1  0.058824  0.171939
101   info    1    1  0.058824  0.173232
111   know    1    1  0.058824  0.097346
128   mail    1    1  0.058824  0.139212
191  small    1    1  0.058824  0.176748
208  thank    1    1  0.058824  0.121997


# 2. Bag-of-words with gensim

As opposed to sklearn, gensim requires the data to be already split into tokens.

In [None]:
# Split every document on whitespace into a list of tokens
corpus_gen = list(map(str.split, data_stem))
corpus_gen[0]

['car',
 'wonder',
 'enlighten',
 'car',
 'saw',
 'dai',
 'door',
 'sport',
 'car',
 'look',
 'late',
 'earli',
 'call',
 'bricklin',
 'door',
 'small',
 'addit',
 'bumper',
 'separ',
 'rest',
 'bodi',
 'know',
 'tellm',
 'model',
 'engin',
 'spec',
 'year',
 'product',
 'car',
 'histori',
 'info',
 'funki',
 'look',
 'car',
 'mail',
 'thank']

Gensim assigns to each token (word) found in the data a unique id. In this way, it is possible to use those ids instead of the words for all operations.

In [None]:
# Create a gensim dictionary
id2word = Dictionary(corpus_gen)

# Remove common and rare words.
# no_below is an absolute number of documents, so derive it from the corpus size and the
# same 5% lower bound used for the sklearn vectorizers instead of hard-coding 566.
min_docs = math.ceil(0.05 * len(corpus_gen))
id2word.filter_extremes(no_below=min_docs, no_above=0.95)

# Show the tokens and their ids
print(id2word.token2id)

{'call': 0, 'car': 1, 'dai': 2, 'engin': 3, 'info': 4, 'know': 5, 'look': 6, 'mail': 7, 'small': 8, 'thank': 9, 'wonder': 10, 'year': 11, 'answer': 12, 'base': 13, 'card': 14, 'edu': 15, 'experi': 16, 'final': 17, 'gui': 18, 'messag': 19, 'number': 20, 'report': 21, 'send': 22, 'actual': 23, 'advanc': 24, 'anybodi': 25, 'better': 26, 'bit': 27, 'email': 28, 'expect': 29, 'feel': 30, 'good': 31, 'got': 32, 'great': 33, 'heard': 34, 'help': 35, 'life': 36, 'like': 37, 'line': 38, 'machin': 39, 'mayb': 40, 'new': 41, 'opinion': 42, 'peopl': 43, 'plai': 44, 'post': 45, 'price': 46, 'probabl': 47, 'question': 48, 'read': 49, 'real': 50, 'recent': 51, 'start': 52, 'take': 53, 'time': 54, 'us': 55, 'wai': 56, 'address': 57, 'articl': 58, 'chip': 59, 'com': 60, 'far': 61, 'inform': 62, 'person': 63, 'phone': 64, 'point': 65, 'pretti': 66, 'requir': 67, 'stuff': 68, 'system': 69, 'thing': 70, 'write': 71, 'wrote': 72, 'check': 73, 'mean': 74, 'possibl': 75, 'right': 76, 'set': 77, 'softwar': 78

In [None]:
# Number of tokens kept in the dictionary
len(id2word)

240

In [None]:
# Print all features (the tokens kept in the gensim dictionary)
print(id2word.token2id.keys())

dict_keys(['call', 'car', 'dai', 'engin', 'info', 'know', 'look', 'mail', 'small', 'thank', 'wonder', 'year', 'answer', 'base', 'card', 'edu', 'experi', 'final', 'gui', 'messag', 'number', 'report', 'send', 'actual', 'advanc', 'anybodi', 'better', 'bit', 'email', 'expect', 'feel', 'good', 'got', 'great', 'heard', 'help', 'life', 'like', 'line', 'machin', 'mayb', 'new', 'opinion', 'peopl', 'plai', 'post', 'price', 'probabl', 'question', 'read', 'real', 'recent', 'start', 'take', 'time', 'us', 'wai', 'address', 'articl', 'chip', 'com', 'far', 'inform', 'person', 'phone', 'point', 'pretti', 'requir', 'stuff', 'system', 'thing', 'write', 'wrote', 'check', 'mean', 'possibl', 'right', 'set', 'softwar', 'tell', 'understand', 'world', 'ye', 'agre', 'allow', 'apr', 'believ', 'come', 'consid', 'control', 'cost', 'cours', 'exist', 'follow', 'given', 'govern', 'hand', 'hard', 'hope', 'idea', 'john', 'kill', 'make', 'need', 'non', 'power', 'reason', 'result', 'sai', 'second', 'state', 'support', 't

In [None]:
# Show the document frequency of each token id (number of documents that contain the token)
print(id2word.dfs) # car (id 1) appears in 697 documents

{1: 697, 10: 702, 2: 1361, 6: 2208, 0: 1116, 8: 602, 5: 3524, 3: 670, 11: 2043, 4: 651, 7: 1388, 9: 2036, 17: 576, 21: 601, 20: 1174, 16: 627, 22: 740, 19: 819, 14: 703, 13: 895, 12: 767, 18: 595, 15: 5726, 48: 1775, 52: 1229, 36: 725, 56: 2218, 41: 2924, 39: 609, 27: 1002, 40: 806, 25: 567, 29: 581, 34: 726, 46: 613, 38: 990, 37: 3848, 51: 629, 47: 1116, 32: 1223, 30: 705, 26: 1215, 33: 965, 31: 2334, 42: 1059, 43: 2549, 55: 2514, 53: 638, 50: 928, 44: 769, 23: 1095, 35: 1621, 24: 650, 28: 775, 45: 1800, 49: 1488, 54: 2822, 72: 808, 71: 6081, 58: 4988, 59: 582, 61: 958, 68: 571, 66: 651, 67: 729, 65: 1570, 57: 632, 64: 716, 62: 1112, 60: 3759, 69: 599, 70: 2070, 63: 1244, 81: 1023, 82: 772, 74: 1484, 80: 729, 78: 824, 73: 650, 76: 1852, 77: 990, 75: 1206, 79: 1210, 109: 865, 99: 953, 85: 2841, 100: 743, 105: 963, 102: 1108, 90: 587, 103: 2175, 89: 926, 95: 846, 107: 686, 104: 748, 92: 829, 110: 1354, 87: 1668, 86: 1434, 97: 847, 111: 964, 83: 662, 96: 787, 98: 771, 91: 1025, 108: 1287

In [None]:
# Absolute frequencies: one list of (token id, count) pairs per document
corpus_gen_abs = list(map(id2word.doc2bow, corpus_gen))

In [None]:
# (token id, count) pairs of the first document
corpus_gen_abs[0]

[(0, 1),
 (1, 5),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 2),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1)]

In [None]:
# Relative frequencies: each count divided by the total count of the document's kept tokens.
# The total is computed once per document rather than once per token.
corpus_gen_rel = []
for doc in corpus_gen_abs:
    doc_total = sum(count for _, count in doc)
    corpus_gen_rel.append([(token_id, count / doc_total) for token_id, count in doc])

# Binary frequencies
corpus_gen_bin = [[(token_id, 1) for token_id, _ in doc] for doc in corpus_gen_abs]

# Tfidf frequencies
tfidf = TfidfModel(dictionary=id2word, normalize=True)

# Optional: to reproduce the sklearn idf (natural log plus 1), pass a custom global weight:

#from gensim.models.tfidfmodel import df2idf
#from math import e
#def d2f(docfreq, totaldocs):
#    return df2idf(docfreq, totaldocs, log_base=e, add=1.0)
#tfidf=TfidfModel(dictionary=id2word, normalize=True, wglobal=d2f)

# Reuse the bag-of-words vectors instead of running doc2bow on every document again
corpus_gen_tf = [tfidf[bow] for bow in corpus_gen_abs]

In [None]:
# idf term for car (id=1)
# sklearn (smooth_idf=False) uses ln(N / df) + 1; kept here for comparison only
idf_car_sklearn = np.log(11314/697)+1
# gensim's default idf is log2(N / df); display it last so the cell shows the gensim value
idf_car_gensim = math.log2(11314/697)
idf_car_gensim

4.020806609775351

In [None]:
# idf stored by the gensim model for "car"; the id is looked up instead of hard-coding 1
car_id = id2word.token2id['car']
tfidf.idfs[car_id]

4.020806609775351

In [None]:
# Number of (token id, weight) pairs in the tf-idf vector of the first document
len(corpus_gen_tf[0])

12

In [None]:
# The squared tf-idf weights of the first document sum to 1 (normalize=True)
sum([weight ** 2 for _, weight in corpus_gen_tf[0]])

1.0000000000000004

In [None]:
# normalisation: L2 norm of the raw (count * idf) weights of the first document.
# The idf is looked up by the token id stored in each (id, count) pair; using the position in
# the list as the id is only correct when the document's ids happen to be 0, 1, 2, ...
nor2 = math.sqrt(sum([(count * tfidf.idfs[token_id]) ** 2 for token_id, count in corpus_gen_abs[0]]))

In [None]:
# Display the normalisation constant
nor2

23.210110013268068

In [None]:
# Manual tf-idf weight of "car" in the first document: count * idf / L2 norm.
# The count is looked up by token id rather than by its position in the list.
car_id = id2word.token2id['car']
car_count = dict(corpus_gen_abs[0]).get(car_id, 0)
(car_count * tfidf.idfs[car_id]) / nor2

0.8661756897052311

In [None]:
# Print the first document in each representation, separated by a blank-looking line
first_doc_views = [
    ('binary', corpus_gen_bin),
    ('absolute', corpus_gen_abs),
    ('relative', corpus_gen_rel),
    ('tf-idf', corpus_gen_tf),
]
for position, (label, corpus) in enumerate(first_doc_views):
    if position:
        print(' ')
    print(label)
    print(corpus[0])

binary
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]
 
absolute
[(0, 1), (1, 5), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]
 
relative
[(0, 0.058823529411764705), (1, 0.29411764705882354), (2, 0.058823529411764705), (3, 0.058823529411764705), (4, 0.058823529411764705), (5, 0.058823529411764705), (6, 0.11764705882352941), (7, 0.058823529411764705), (8, 0.058823529411764705), (9, 0.058823529411764705), (10, 0.058823529411764705), (11, 0.058823529411764705)]
 
tf-idf
[(0, 0.14397605792925167), (1, 0.8661756897052311), (2, 0.1316396218024463), (3, 0.17569085919746014), (4, 0.17747902617106662), (5, 0.07250388928623666), (6, 0.20312674067620154), (7, 0.13041858058126876), (8, 0.18234302967391078), (9, 0.1066043895592759), (10, 0.17279083266070747), (11, 0.10639104965460928)]


In [None]:
# Number of distinct dictionary tokens in the first document
len(corpus_gen_abs[0])

12

In [None]:
# Collect the values of the first document, one list per representation.
# Tokens are resolved from the token id stored in each pair; taking the i-th key of token2id
# only works when the document's ids are exactly 0..n-1 and rebuilds the key list every time.
first_abs = corpus_gen_abs[0]
k = [id2word[token_id] for token_id, _ in first_abs]
ab = [count for _, count in first_abs]
bi = [flag for _, flag in corpus_gen_bin[0]]
rel = [share for _, share in corpus_gen_rel[0]]
# The tf-idf vector is matched by id; a token missing from it gets weight 0.0
tf_by_id = dict(corpus_gen_tf[0])
tf = [tf_by_id.get(token_id, 0.0) for token_id, _ in first_abs]


In [None]:
# Put the frequencies in a data frame for the first text in the corpus
first_doc_columns = {'keys': k, 'bin': bi, 'abs': ab, 'rel': rel, 'tf': tf}
df_12 = pd.DataFrame(first_doc_columns)
print(df_12)
df_12.shape

      keys  bin  abs       rel        tf
0     call    1    1  0.058824  0.143976
1      car    1    5  0.294118  0.866176
2      dai    1    1  0.058824  0.131640
3    engin    1    1  0.058824  0.175691
4     info    1    1  0.058824  0.177479
5     know    1    1  0.058824  0.072504
6     look    1    2  0.117647  0.203127
7     mail    1    1  0.058824  0.130419
8    small    1    1  0.058824  0.182343
9    thank    1    1  0.058824  0.106604
10  wonder    1    1  0.058824  0.172791
11    year    1    1  0.058824  0.106391


(12, 5)

In [None]:
# Join the gensim values (suffix _x) and the sklearn values (suffix _y) on the token
df_merge = df_12.merge(df_1, on='keys')
df_merge

Unnamed: 0,keys,bin_x,abs_x,rel_x,tf_x,bin_y,abs_y,rel_y,tf_y
0,call,1,1,0.058824,0.143976,1,1,0.058824,0.149013
1,car,1,5,0.294118,0.866176,1,5,0.294118,0.85082
2,dai,1,1,0.058824,0.13164,1,1,0.058824,0.140095
3,engin,1,1,0.058824,0.175691,1,1,0.058824,0.171939
4,info,1,1,0.058824,0.177479,1,1,0.058824,0.173232
5,know,1,1,0.058824,0.072504,1,1,0.058824,0.097346
6,look,1,2,0.117647,0.203127,1,2,0.117647,0.236706
7,mail,1,1,0.058824,0.130419,1,1,0.058824,0.139212
8,small,1,1,0.058824,0.182343,1,1,0.058824,0.176748
9,thank,1,1,0.058824,0.106604,1,1,0.058824,0.121997


# 3. Ngrams

In [None]:
# Two-gram absolute-frequency transformer: ngram_range=(2, 2) sets both the minimum and the
# maximum n-gram length to 2 words; max_df/min_df are the same bounds as for the one-grams
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_df=0.95, min_df=0.05)

In [None]:
# Transform data
corpus_sk_bi = bigram_vectorizer.fit_transform(data_stem)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() returns a
# NumPy array, so convert it to a plain list for printing
print(bigram_vectorizer.get_feature_names_out().tolist())

['articl apr', 'edu write', 'write articl']


You can see that there are far fewer features than for one-grams. The reason is that a specific combination of words occurs in fewer documents than a single word does, so fewer two-grams pass the min_df threshold. However, if you remove the document-frequency filtering, you will get many more features than for one-grams.

In [None]:
# Two-gram counts as a data frame with one column per two-gram.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
corpus_sk_bi_df = pd.DataFrame(
    corpus_sk_bi.toarray(),
    columns=bigram_vectorizer.get_feature_names_out(),
)
corpus_sk_bi_df.head()

Unnamed: 0,articl apr,edu write,write articl
0,0,0,0
1,0,0,0
2,0,0,0
3,0,1,1
4,0,0,0


In [None]:
# Highest count of each two-gram within a single document
corpus_sk_bi_df.max()

articl apr      4
edu write       2
write articl    4
dtype: int64

The maximum frequency is 4, which is much lower than for one-grams.