# Model documents in the Vector Space

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import pymongo

In [3]:
from collections import Counter, defaultdict

## Get data

In [4]:
db = pymongo.MongoClient()['wikibio']['rawdata']
data = list(db.find({
    'subdata': 'train', 'box.occupation': {'$exists': True}
}))

In [5]:
len(data)

151709

In [6]:
" ".join(data[100]['sentences'])

'louis-guillaume perreaux -lrb- 19 february 1816 -- 5 april 1889 -rrb- was a french inventor and engineer who submitted one of the first patents for a working motorcycle in 1869 .'

In [7]:
data = data[:10000]

### Indexing occupations and selection of documents

In [8]:
occupations = defaultdict(set)
for i, record in enumerate(data):
    occupation = record['box']['occupation']
    o = " ".join(occupation)
    for p in o.split(' , '):
        occupations[p].add(record['id'])

In [9]:
occupation_stats = pd.Series(dict([(k, len(v)) for k, v in occupations.items()]))
selected_occupations = occupation_stats.sort_values(ascending=False).head(50)

In [10]:
selected_documents = set()
for k in selected_occupations.keys():
    selected_documents = selected_documents.union(occupations[k])

In [11]:
docs = [" ".join(x['sentences']) for x in data if x['id'] in selected_documents]
docids = [x['id'] for x in data if x['id'] in selected_documents]

In [12]:
docs[0]

'linda hayden -lrb- born 19 january 1953 -rrb- is an english film and television actress and the sister of actress jane hayden . she is best known for her roles in 1970s british horror films and sex comedies .'

## Term Frequency

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
tf_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=20)
X = tf_vectorizer.fit_transform(docs)

In [15]:
words = tf_vectorizer.get_feature_names_out()

In [16]:
TF_count = pd.DataFrame(X.toarray(), columns=words)

In [17]:
TF_count.head()

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,yet,york,you,young,younger,youngest,your,youth,youtube,zealand
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
TF_count.shape

(6316, 2208)

In [25]:
vocabulary = TF_count.columns

In [26]:
vocabulary

Index(['000', '10', '100', '11', '12', '13', '14', '15', '16', '17',
       ...
       'yet', 'york', 'you', 'young', 'younger', 'youngest', 'your', 'youth',
       'youtube', 'zealand'],
      dtype='object', length=2208)

In [27]:
TF_count['young'].sort_values(ascending=False)

5916    5
2837    4
2722    4
5727    4
3299    3
       ..
2141    0
2140    0
2139    0
2138    0
6315    0
Name: young, Length: 6316, dtype: int64

In [28]:
docs[5916]

"faron young -lrb- february 25 , 1932 -- december 10 , 1996 -rrb- was an american country music singer and songwriter from the early 1950s into the mid-1980s and one of its most successful and colorful stars . hits including `` if you ai n't lovin ' -lrb- you ai n't livin ' -rrb- '' and `` live fast , love hard , die young '' marked him as a honky-tonk singer in sound and personal style ; and his chart-topping singles `` hello walls '' and `` it 's four in the morning '' showed his versatility as a vocalist . known as the hillbilly heartthrob , and following a movie role , the young sheriff , young 's singles reliably charted for more than 30 years . he committed suicide in 1996 . young is a member of the country music hall of fame ."

In [29]:
docs[2837]

"paris smith is an american actress and singer commonly known for playing maddie van pelt in the nickelodeon show `` every witch way '' . other appearances include `` r.a.d.i.c.a.l.s '' , `` totally '' , `` the bedtime story '' , and others . she has also appeared on many episodes of the fine brothers ' youtube show `` kids react '' -lrb- e.g. , `` kids react to going to the store ! '' -rrb- . smith won the best performance in a short film - young actress 11-12 award at young artist award 2013 , and was nominated in 2014 for the best performance in a tv series - guest starring young actress 11-13 award for `` modern family '' , and was nominated in 2015 for the best performance in a tv series - young actress 11-15 award for `` every witch way '' ."

In [30]:
d1 = TF_count.loc[5916]
d2 = TF_count.loc[2837]

In [38]:
np.sqrt(np.power(d1 - d2, 2).sum())

16.0

### Search engine

In [39]:
np.sqrt(np.power(d1 - d2, 2).sum())

16.0

In [40]:
q = np.zeros(len(vocabulary))
query = 'american country music singer and country songwriter'
for word in query.split():
    q[list(vocabulary).index(word)] += 1

In [41]:
q

array([0., 0., 0., ..., 0., 0., 0.])

In [43]:
def euclidean(x, y):
    return np.sqrt(np.power(x - y, 2).sum())

In [44]:
distances = {}
for i, row in TF_count.iterrows():
    e = euclidean(q, row)
    distances[i] = e

In [45]:
E = pd.Series(distances)

In [46]:
E.sort_values(ascending=True)

1496     2.236068
2479     2.645751
5319     3.000000
2375     3.162278
5487     3.162278
          ...    
5109    81.221918
1951    81.264999
917     85.017645
4839    90.553851
646     96.020831
Length: 6316, dtype: float64

In [52]:
E.loc[5916]

14.594519519326424

In [51]:
print(query, '\n')
print(docs[5487])

american country music singer and country songwriter 

candice parise is a french actress and singer .


### Most relevant words

In [53]:
TF_count.sum(axis=0).sort_values(ascending=False).head(10)

the    26213
and    18704
in     17526
of     14770
lrb     9372
rrb     9369
he      8150
is      7896
was     7368
for     6017
dtype: int64

### Biggest documents

In [54]:
doc_size = TF_count.sum(axis=1).sort_values(ascending=False)
doc_size

4839    719
646     609
917     599
4246    567
5109    560
       ... 
187       3
1972      3
2623      3
4969      3
796       2
Length: 6316, dtype: int64

In [55]:
docs[719]

"benjamin `` ben '' greaves-neal is an english child actor . his career dates back to 2010 when he made a guest appearance in the award winning bbc sitcom my family . since then ben has appeared in 2011 's horror the awakening and donald rice 's period drama cheerful weather for the wedding . he is also known for his recurring role as cousin max in grandpa in my pocket . in 2013 , ben greaves-neal made a guest appearance in the bbc hospital soap casualty as a child suffering from a vitamin overdose . the episode was titled `` hidden '' . ben is best known for portraying oliver in the bbc black comedy being human -lrb- uk series 5 -rrb- . the role has earned him praise from critics alike in the uk and usa ."

In [56]:
print(doc_size.loc[5916], doc_size.loc[2837], doc_size.loc[2722])

98 104 236


### Most relevant words in the longest doc

In [57]:
TF_count.loc[719].sort_values(ascending=False).head(10)

in       8
the      8
ben      4
for      3
is       3
bbc      3
and      2
child    2
role     2
known    2
Name: 719, dtype: int64

In [58]:
TF_count.loc[19].sort_values(ascending=False).head(10)

the       19
of        19
in        11
he        11
and        9
member     6
party      4
became     4
was        4
to         3
Name: 19, dtype: int64

### Normalize TF

In [59]:
TF_norm = (TF_count.T / TF_count.max(axis=1)).T

In [60]:
TF_norm.sum(axis=0).sort_values(ascending=False)

the        3819.030734
and        3561.419775
in         2790.438618
lrb        2558.040118
rrb        2556.848659
              ...     
broke         1.763347
join          1.718982
dvd           1.676859
article       1.620634
invited       1.554078
Length: 2208, dtype: float64

In [61]:
TF_norm.loc[719].sort_values(ascending=False).head(10)

in       1.000
the      1.000
ben      0.500
for      0.375
is       0.375
bbc      0.375
and      0.250
child    0.250
role     0.250
known    0.250
Name: 719, dtype: float64

In [62]:
TF_norm.loc[19].sort_values(ascending=False).head(10)

the       1.000000
of        1.000000
in        0.578947
he        0.578947
and       0.473684
member    0.315789
party     0.210526
became    0.210526
was       0.210526
to        0.157895
Name: 19, dtype: float64

$$
IDF = \log(\frac{N}{df})
$$

## Idf

In [63]:
IDF = np.log(len(docs) / TF_count[TF_count > 0].count(axis=0))

In [64]:
IDF.sort_values(ascending=False).head(10)

car          5.755109
violence     5.755109
already      5.755109
occasions    5.755109
ralph        5.755109
creation     5.755109
cousin       5.755109
versions     5.755109
nigerian     5.755109
something    5.755109
dtype: float64

## TfIdf

In [65]:
TFIDF = (TF_count * IDF)

In [66]:
TFIDF.loc[719].sort_values(ascending=False).head(10)

ben           20.349119
bbc           12.097028
appearance     8.640049
uk             8.156025
guest          8.118987
child          7.926699
my             7.440807
cousin         5.755109
made           5.729475
praise         5.572788
Name: 719, dtype: float64

In [68]:
TF_count.loc[5916].sort_values(ascending=False)

and          8
the          6
young        5
as           3
of           3
            ..
ed           0
economics    0
economic     0
eastern      0
zealand      0
Name: 5916, Length: 2208, dtype: int64

In [69]:
TFIDF.loc[5916].sort_values(ascending=False)

young        16.994916
singles       7.782058
you           7.562056
1996          7.253755
country       7.039466
               ...    
ed            0.000000
economics     0.000000
economic      0.000000
eastern       0.000000
zealand       0.000000
Name: 5916, Length: 2208, dtype: float64

In [70]:
distances_tfidf = {}
for i, row in TFIDF.iterrows():
    e = euclidean(q, row)
    distances_tfidf[i] = e

In [71]:
Etfidf = pd.Series(distances_tfidf)

In [72]:
Etfidf.sort_values(ascending=True)

2154      3.437160
2767      3.437160
1396      3.443755
3804      3.822365
2623      4.000653
           ...    
550     124.195935
1951    124.717325
5109    131.324340
646     137.637174
4839    142.854295
Length: 6316, dtype: float64

In [77]:
print(query, '\n')
print(docs[1396])

american country music singer and country songwriter 

christine evangelista in an american actress .


In [78]:
Etfidf.loc[5916]

33.93826816469266