# Vector space model

### We tokenize and remove punctuation form the extracted text for further processing

In [11]:
import os
import spacy 
from spacy.lang.am import Amharic
from collections import Counter
import pandas as pd

nlp = Amharic()
nlp.max_length = 20000000

data_path = r'C:\Users\user\Documents\Data_science\IR real\Demo_data\Updated_Demo'

books = os.listdir(data_path)


token_file = {}

for book in books:
    with open(os.path.join(data_path, book), 'r', encoding='utf-8') as file:
        for line in file:
            tokens = []
            line = line.strip()
            doc = nlp(line)
            tokens.append([token.text for token in doc if not token.is_punct])
            for token in tokens:
                for word in token:
                    if not word.isnumeric() and not word.isspace():
                        if word not in token_file:
                            token_file[word] = []
                        token_file[word].append(book)

In [20]:
import os
data_path = r'C:\Users\user\Documents\Data_science\IR real\Demo_data\Updated_Demo'

books = os.listdir(data_path)

In [21]:
words = token_file.keys()
documents = token_file.values()

Ifile = pd.DataFrame({"Term" : words, "Doc" : documents})
Ifile[170:175]

Unnamed: 0,Term,Doc
170,ጠቢብ,"[library004c.txt, library004c.txt, library004c..."
171,እነዚህን,"[library004c.txt, library0142.txt, library0142..."
172,ከመስማት,"[library004c.txt, library004c.txt, library0576..."
173,ጥበብን,"[library004c.txt, library004c.txt, library004c..."
174,ይጨምራል,"[library004c.txt, library004c.txt, library004c..."


In [22]:
Ifile['CF'] = Ifile['Doc'].apply(len)
Ifile['DF'] = Ifile['Doc'].apply(lambda x: len(set(x)))
Ifile[170:175]

Unnamed: 0,Term,Doc,CF,DF
170,ጠቢብ,"[library004c.txt, library004c.txt, library004c...",45,12
171,እነዚህን,"[library004c.txt, library0142.txt, library0142...",602,82
172,ከመስማት,"[library004c.txt, library004c.txt, library0576...",27,19
173,ጥበብን,"[library004c.txt, library004c.txt, library004c...",72,34
174,ይጨምራል,"[library004c.txt, library004c.txt, library004c...",64,25


### The above code is clearly explained in the `inverted index.ipynb` file

### The below code is used to calcualte the term frequency in each document.

In [23]:
def Tf_counter(docs):
    my_list = {}
    for doc in docs:
        if doc not in my_list:
            my_list[doc] = 1
        else:
            my_list[doc] += 1
    return my_list

In [24]:
Ifile['Doc_freq'] = Ifile['Doc'].apply(lambda x: Tf_counter(x))
Ifile[170:175]

Unnamed: 0,Term,Doc,CF,DF,Doc_freq
170,ጠቢብ,"[library004c.txt, library004c.txt, library004c...",45,12,"{'library004c.txt': 30, 'library064c.txt': 1, ..."
171,እነዚህን,"[library004c.txt, library0142.txt, library0142...",602,82,"{'library004c.txt': 1, 'library0142.txt': 6, '..."
172,ከመስማት,"[library004c.txt, library004c.txt, library0576...",27,19,"{'library004c.txt': 2, 'library0576.txt': 2, '..."
173,ጥበብን,"[library004c.txt, library004c.txt, library004c...",72,34,"{'library004c.txt': 17, 'library0576.txt': 1, ..."
174,ይጨምራል,"[library004c.txt, library004c.txt, library004c...",64,25,"{'library004c.txt': 3, 'library229f.txt': 1, '..."


### After calculating the term frequency we will have to change each documents to columns for easier manipulation.

In [25]:
Ifile = pd.concat([Ifile.drop('Doc_freq', axis =1), Ifile['Doc_freq'].apply(pd.Series)], axis = 1)
Ifile[170:175]

Unnamed: 0,Term,Doc,CF,DF,library004c.txt,library0576.txt,library357b.txt,library65b7.txt,libraryc1ca.txt,libraryce63.txt,...,libraryd575.txt,librarye70e.txt,library9303.txt,libraryce72.txt,librarye6a8.txt,library8f71.txt,librarya9d2.txt,libraryc17f.txt,library4d03.txt,library3cdb.txt
170,ጠቢብ,"[library004c.txt, library004c.txt, library004c...",45,12,30.0,,,1.0,,,...,,,,,,,,,,
171,እነዚህን,"[library004c.txt, library0142.txt, library0142...",602,82,1.0,1.0,,8.0,,,...,19.0,,,,,,3.0,7.0,,
172,ከመስማት,"[library004c.txt, library004c.txt, library0576...",27,19,2.0,2.0,,3.0,,,...,,,,,,,,,,
173,ጥበብን,"[library004c.txt, library004c.txt, library004c...",72,34,17.0,1.0,,,,2.0,...,,,,,,,,1.0,,
174,ይጨምራል,"[library004c.txt, library004c.txt, library004c...",64,25,3.0,,,,,,...,,2.0,,,,,,,,


In [26]:
Ifile.fillna(0, inplace= True)

### The below code calculates the `IDF`

In [27]:
import numpy as np
def IDF_calc(DF):
    return np.round(np.log10(len(books)/DF), 3)

In [28]:
Ifile['IDF'] = Ifile['DF'].apply(lambda x: IDF_calc(x))
Ifile[170:175]

Unnamed: 0,Term,Doc,CF,DF,library004c.txt,library0576.txt,library357b.txt,library65b7.txt,libraryc1ca.txt,libraryce63.txt,...,librarye70e.txt,library9303.txt,libraryce72.txt,librarye6a8.txt,library8f71.txt,librarya9d2.txt,libraryc17f.txt,library4d03.txt,library3cdb.txt,IDF
170,ጠቢብ,"[library004c.txt, library004c.txt, library004c...",45,12,30.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.021
171,እነዚህን,"[library004c.txt, library0142.txt, library0142...",602,82,1.0,1.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,7.0,0.0,0.0,0.187
172,ከመስማት,"[library004c.txt, library004c.txt, library0576...",27,19,2.0,2.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.822
173,ጥበብን,"[library004c.txt, library004c.txt, library004c...",72,34,17.0,1.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.569
174,ይጨምራል,"[library004c.txt, library004c.txt, library004c...",64,25,3.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.702


### Then we calculate the  `tf * idf` so that we know the weight

In [29]:
for book in books:
    Ifile[book] = Ifile[book] * Ifile['IDF']
Ifile[170:175]

Unnamed: 0,Term,Doc,CF,DF,library004c.txt,library0576.txt,library357b.txt,library65b7.txt,libraryc1ca.txt,libraryce63.txt,...,librarye70e.txt,library9303.txt,libraryce72.txt,librarye6a8.txt,library8f71.txt,librarya9d2.txt,libraryc17f.txt,library4d03.txt,library3cdb.txt,IDF
170,ጠቢብ,"[library004c.txt, library004c.txt, library004c...",45,12,30.63,0.0,0.0,1.021,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.021
171,እነዚህን,"[library004c.txt, library0142.txt, library0142...",602,82,0.187,0.187,0.0,1.496,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.561,1.309,0.0,0.0,0.187
172,ከመስማት,"[library004c.txt, library004c.txt, library0576...",27,19,1.644,1.644,0.0,2.466,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.822
173,ጥበብን,"[library004c.txt, library004c.txt, library004c...",72,34,9.673,0.569,0.0,0.0,0.0,1.138,...,0.0,0.0,0.0,0.0,0.0,0.0,0.569,0.0,0.0,0.569
174,ይጨምራል,"[library004c.txt, library004c.txt, library004c...",64,25,2.106,0.0,0.0,0.0,0.0,0.0,...,1.404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.702


In [30]:
import pandas as pd
Ifile = pd.read_csv('IDF_ized.csv')
Ifile[170:172]

Unnamed: 0.1,Unnamed: 0,Term,Doc,CF,DF,library004c.txt,library0576.txt,library357b.txt,library65b7.txt,libraryc1ca.txt,...,librarye70e.txt,library9303.txt,libraryce72.txt,librarye6a8.txt,library8f71.txt,librarya9d2.txt,libraryc17f.txt,library4d03.txt,library3cdb.txt,IDF
170,170,ጠቢብ,"['library004c.txt', 'library004c.txt', 'librar...",45,12,30.63,0.0,0.0,1.021,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.021
171,171,እነዚህን,"['library004c.txt', 'library0142.txt', 'librar...",602,82,0.187,0.187,0.0,1.496,0.0,...,0.0,0.0,0.0,0.0,0.0,0.561,1.309,0.0,0.0,0.187


In [31]:
Ifile.drop('Unnamed: 0', axis = 1, inplace= True)

## The query processing part

#### This part will calculate the vector length for each book

In [32]:
import numpy as np
book_norm = {}
for book in books:
    book_value = Ifile[book].values
    vector_length = np.round(np.linalg.norm(book_value), 3)
    book_norm[book] = vector_length

#### The below function will prepare the query the same way as inverted index is prepared

In [33]:
import os
from spacy.lang.am import Amharic
from collections import Counter
nlp = Amharic()
nlp.max_length = 20000000
tokens = []

def query(text):
    
    text = text.strip()
    doc = nlp(text)
    tokens.extend([token.text for token in doc if not token.is_punct])

    
    Doc_freq = []
    book_list = []
    trial = {}
    IDF =[]
    query_df = None
    
    for token in tokens:
        word_freq = {}
        for book in books:
            document = os.path.join(data_path, book)
            with open(document, 'r', encoding='utf-8') as file:
                content = file.read()
                word_count= Counter(content.split())
                word_freq[book] = word_count[token]
                
        idf = float(Ifile['IDF'].loc[Ifile['Term'] == token])
        IDF.append(idf)
        
        Doc_freq.append(word_freq)

    query_df = pd.DataFrame({'Term' : tokens, "Doc_freq" : Doc_freq, "IDF" : IDF})
    
    return query_df

#### Let's demonstrate how it works

In [34]:
text = "ባሕር ተሻግሮ ሕይወትን መሸጥ"
query_df = (query(text))
query_df

  idf = float(Ifile['IDF'].loc[Ifile['Term'] == token])


Unnamed: 0,Term,Doc_freq,IDF
0,ባሕር,"{'library004c.txt': 0, 'library0142.txt': 0, '...",0.337
1,ተሻግሮ,"{'library004c.txt': 0, 'library0142.txt': 3, '...",0.799
2,ሕይወትን,"{'library004c.txt': 1, 'library0142.txt': 0, '...",0.393
3,መሸጥ,"{'library004c.txt': 0, 'library0142.txt': 0, '...",0.87


#### Right here `Q` is the sum of frequency for each word

In [35]:
query_df['Q'] = query_df['Doc_freq'].apply(lambda x: np.sum(list(x.values())))

#### Right here `Q` is the sum of frequency for each word multiplied by the `IDF` therefore it represents the weight

In [37]:
query_df['Q'] = query_df['Q'] * query_df['IDF']
query_df.drop(['Doc_freq', 'IDF'],axis =1, inplace=True)
query_df

Unnamed: 0,Term,Q
0,ባሕር,158.053
1,ተሻግሮ,39.151
2,ሕይወትን,66.024
3,መሸጥ,18.27


##### This will list only the words that are present in the query and inverted index file. It will save it to `Ifile`

In [43]:
Ifile = Ifile.loc[Ifile['Term'].isin(tokens)]

In [44]:
Ifile = Ifile[['Term'] + books]

##### We will merge the query dataframe and the words from the inverted index file to `QI_df`

In [45]:
QI_df = query_df.merge(Ifile, on='Term')

##### Now by multiplying `QI_df[book]` with `QI_df['Q']` we prepare the columns for the dot product. Therefore when we add the columns we will have the dot product.

In [47]:
for book in books:
    QI_df[book] = QI_df[book] * QI_df['Q']

In [48]:
QI_df

Unnamed: 0,Term,Q,library004c.txt,library0142.txt,library03a0.txt,library0576.txt,library064c.txt,library0a1c.txt,library0c15.txt,library0f35.txt,...,librarye6a8.txt,librarye70e.txt,librarye77f.txt,librarye783.txt,librarye94f.txt,libraryeb1a.txt,libraryf48b.txt,libraryf49f.txt,libraryf633.txt,libraryfc8f.txt
0,ባሕር,158.053,0.0,0.0,1864.235135,106.527722,53.263861,0.0,0.0,106.527722,...,0.0,53.263861,53.263861,0.0,0.0,53.263861,0.0,1065.27722,0.0,745.694054
1,ተሻግሮ,39.151,0.0,93.844947,62.563298,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ሕይወትን,66.024,25.947432,0.0,77.842296,0.0,51.894864,25.947432,0.0,0.0,...,25.947432,0.0,0.0,0.0,0.0,0.0,25.947432,493.001208,51.894864,0.0
3,መሸጥ,18.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,15.8949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Vector length of  `Q`

In [52]:
query_value = QI_df[book].values
vector_length = np.round(np.linalg.norm(query_value), 3)


##### We compute similarity score and sort them in descending value

In [53]:
vsm_result = {}
for book in books:
    result = QI_df[book].sum()
    result = result / (vector_length * book_norm[book])
    vsm_result[book] = result

vsm_result = pd.Series(vsm_result, name = 'result')
vsm_result = pd.DataFrame(vsm_result)
vsm_result.sort_values(by='result', ascending= False, inplace=True)

In [54]:
vsm_result

Unnamed: 0,result
librarydbb1.txt,0.009428
library5e11.txt,0.008213
library03a0.txt,0.005592
library889a.txt,0.003175
libraryf49f.txt,0.002937
...,...
library56d0.txt,0.000000
libraryc5bb.txt,0.000000
libraryc630.txt,0.000000
library4d03.txt,0.000000
