# Introduction to the Vector Space Model

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Example texts

In [2]:
import pymongo
from IPython.display import display

In [None]:
# Open a client with PyMongo's default connection settings, then select the
# 'nyt' database and its 'nyt_key' collection.
db = pymongo.MongoClient().get_database('nyt')
data = db.get_collection('nyt_key')

### TOKENIZE

In [None]:
# Sample text fed to the tokenization steps below: a leading newline, a
# headline line, and the start of a sentence.
text = (
    "\n"
    "TO REVISE THE CHARTER; Governor Soon to Announce His Choice of Commissioners.\n"
    "The Commissioners declared that\n"
)

In [None]:
import nltk
from nltk.stem.porter import *
from collections import Counter

In [None]:
# Split the sample text into tokens with NLTK's word tokenizer.
tokens = nltk.tokenize.word_tokenize(text)
print(tokens)

In [None]:
# Normalize every token: lowercase it first, then reduce it to its Porter stem.
stemmer = PorterStemmer()
norm = list(map(stemmer.stem, map(str.lower, tokens)))
print(norm)

In [None]:
# Bag of words: stem -> number of occurrences, inserted from most to least frequent.
bag = {term: count for term, count in Counter(norm).most_common()}
print(bag)

In [None]:
# Turn the bag into a one-column frame and show it transposed,
# i.e. as a single row with one column per term.
i = pd.Series(bag).to_frame()
display(i.transpose())

## SpaCy

In [3]:
import spacy

# Load the spaCy pipeline by name; `nlp` is the callable that `tokenize` applies to each text.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

In [5]:
def tokenize(x):
    """Return the lemmas of `x`, skipping whitespace, punctuation and determiners.

    The text is lowercased and then run through the spaCy pipeline `nlp`;
    tokens whose coarse part-of-speech tag is SPACE, PUNCT or DET are dropped.

    Parameters
    ----------
    x : str
        Raw text of one document.

    Returns
    -------
    list of str
        One lemma per retained token, in document order.
    """
    # NOTE(review): lowercasing happens *before* tagging, which may influence
    # spaCy's POS/lemma predictions — confirm this is intended.
    skipped_pos = ('SPACE', 'PUNCT', 'DET')
    return [t.lemma_ for t in nlp(x.lower()) if t.pos_ not in skipped_pos]

## 20 Newsgroups

In [11]:
from collections import defaultdict

In [6]:
from sklearn.datasets import fetch_20newsgroups

In [7]:
# Fetch the training split of 20 Newsgroups (cached under `data_home`) with
# headers, footers and quoted text removed from each post.
# `remove` is given as a tuple, which is the type scikit-learn documents for
# this parameter; newer releases validate parameter types, so a list may be
# rejected there.
data_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    data_home='/tmp/',
)

In [8]:
# Keep only the first N_DOCS posts (presumably to keep the per-document
# spaCy pass below manageable).
N_DOCS = 1000
corpus = data_train.data[:N_DOCS]

In [12]:
# Raw term counts per document: position in `corpus` -> {lemma: count}.
# A document only gets an entry once it yields at least one token.
I = defaultdict(lambda: defaultdict(int))
# Pass `total` so tqdm can size the bar without materializing the enumeration,
# and use a dedicated loop name so the earlier variable `i` is not overwritten.
for doc_id, doc in tqdm(enumerate(corpus), total=len(corpus)):
    for lemma in tokenize(doc):
        I[doc_id][lemma] += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [16]:
# Document-term matrix: after transposing, one row per document and one
# column per lemma. Lemmas absent from a document are NaN at this point.
m = pd.DataFrame(I).transpose()

In [19]:
# A lemma missing from a document is a count of zero. Rebind the name instead
# of mutating the frame in place, so the cell is a plain input -> output step.
m = m.fillna(0)
m.head()

Unnamed: 0,i,be,wonder,if,anyone,out,there,could,enlighten,-PRON-,...,timer,macine,tantrumy,stair,akron,afoul,ye,colossians,3:12,angrily
0,3.0,8.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.0,12.0,1.0,4.0,0.0,1.0,1.0,3.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Max-normalized term frequency: divide every row (document) by the count of
# its most frequent term, so each document's top term gets 1.0.
Tf = m.div(m.max(axis=1), axis=0)

In [33]:
# Ten highest term-frequency weights of the document with index label 10.
Tf.loc[10].sort_values(ascending=False).head(10)

-PRON-    1.000000
and       0.833333
i         0.666667
be        0.666667
out       0.500000
/         0.333333
with      0.333333
to        0.333333
of        0.333333
bike      0.333333
Name: 10, dtype: float64

In [34]:
# Look up the newsgroup name for the post at position 10 of the training set.
label_id = data_train.target[10]
data_train.target_names[label_id]

'rec.motorcycles'

In [41]:
# Inverse document frequency: log(N / df_t), where df_t is the number of rows
# of `m` holding a non-zero count for term t. N is taken from the corpus
# itself rather than a literal, so it cannot drift from the slice size used
# when `corpus` was built.
# NOTE(review): documents that yield no tokens never get a row in `m`, so
# len(m) may be smaller than N — confirm len(corpus) is the intended base.
Idf = np.log(len(corpus) / np.count_nonzero(m, axis=0))

In [42]:
# Scale each term column of Tf by that term's IDF weight
# (Idf is matched to the columns of Tf by position).
TfIdf = Tf.mul(Idf, axis='columns')

In [43]:
# Show the weighted document-term matrix (the rendered table is truncated).
TfIdf

Unnamed: 0,i,be,wonder,if,anyone,out,there,could,enlighten,-PRON-,...,timer,macine,tantrumy,stair,akron,afoul,ye,colossians,3:12,angrily
0,0.159243,0.166055,0.399273,0.234123,0.523893,0.182090,0.155599,0.229856,0.690183,0.081848,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.141549,0.110703,0.000000,0.312164,0.000000,0.000000,0.000000,0.000000,0.000000,0.054565,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.283099,0.166055,0.266182,0.312164,0.000000,0.121393,0.103733,0.459713,0.000000,0.122772,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.163696,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.166055,0.000000,0.234123,0.000000,0.000000,0.000000,0.000000,0.000000,0.163696,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.424648,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
996,0.070775,0.055352,0.000000,0.156082,0.000000,0.242786,0.000000,0.000000,0.000000,0.163696,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
997,0.424648,0.166055,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.163696,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
998,0.000000,0.166055,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.040924,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [45]:
# Ten terms with the largest TF-IDF weight in the document with index label 0.
TfIdf.loc[0].sort_values(ascending=False).head(10)

car           1.585043
60s/          0.863469
2-door        0.863469
tellme        0.863469
bricklin      0.776826
bumper        0.726143
funky         0.726143
enlighten     0.690183
production    0.620231
spec          0.603539
Name: 0, dtype: float64