In [1]:
import os
import numpy as np
import pandas as pd
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re 
from collections import Counter

In [2]:
def read_env(env_path):
    """Parse a ``key=value`` settings file into a dictionary.

    Each non-blank line is split on its first ``=``; whitespace surrounding
    the key and the value is discarded, while characters inside the value
    (including spaces and further ``=`` signs) are kept as written.

    Parameters
    ----------
    env_path : str
        Path of the UTF-8 encoded settings file.

    Returns
    -------
    dict
        Mapping of setting name to its (string) value.

    Raises
    ------
    ValueError
        If a non-blank line contains no ``=`` separator.
    """
    env_settings = {}
    with open(env_path, 'r', encoding="utf-8") as env:
        for line_number, raw_line in enumerate(env, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no setting.
                continue
            key, separator, value = line.partition('=')
            if not separator:
                raise ValueError(
                    "line %d of %s is not of the form key=value: %r"
                    % (line_number, env_path, line)
                )
            env_settings[key.strip()] = value.strip()
    return env_settings

In [3]:
def settings(env_settings):
    """Unpack the settings mapping into the four values used by the notebook.

    Parameters
    ----------
    env_settings : dict
        Must contain the keys 'language', 'path', 'type_files' and
        'number_files'.

    Returns
    -------
    tuple
        ``(language, path, type_files, number_files)`` where the first three
        are returned as stored and ``number_files`` is converted with ``int``.
    """
    text_keys = ('language', 'path', 'type_files')
    language, path, type_files = (env_settings[key] for key in text_keys)
    return language, path, type_files, int(env_settings['number_files'])

In [4]:
# Read env.txt once and bind its four settings to notebook-level names.
language, path, type_files, number_files = settings(read_env('env.txt'))

In [5]:
def fileNames(path,type_files):
    """List the directory entries in ``path`` whose name ends with ``type_files``.

    The names are sorted so that the position of a document (used as its id
    by the callers) does not depend on the arbitrary order in which
    ``os.listdir`` reports entries.

    Parameters
    ----------
    path : str
        Directory to scan.
    type_files : str
        Required name suffix, e.g. ``'.txt'``.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(file_dirs, file_names)``: the joined paths and the bare names, in
        the same (sorted) order.
    """
    file_names = sorted(
        name for name in os.listdir(path) if name.endswith(type_files)
    )
    file_dirs = [os.path.join(path, name) for name in file_names]
    return file_dirs, file_names

In [6]:
def Stemming(tokins):
    """Return a list with the Porter stem of every token in ``tokins``.

    A new ``PorterStemmer`` is created on each call; the order of the tokens
    is preserved.
    """
    porter = PorterStemmer()
    return list(map(porter.stem, tokins))

In [7]:
def tokenize(query, lang=None):
    """Split text into lowercase tokens with stop words removed.

    Tokens are lowercased *before* they are compared with the stop-word
    list, so capitalised stop words (e.g. "The") are filtered out as well.

    Parameters
    ----------
    query : str
        Text to tokenize.
    lang : str, optional
        Language name passed to the NLTK stop-word list and tokenizer.
        Defaults to the notebook-level ``language`` setting.

    Returns
    -------
    list of str
        Lowercased tokens that are not stop words, in their original order.
    """
    if lang is None:
        lang = language
    # A set gives constant-time membership tests for long documents.
    stop_set = set(stopwords.words(lang))
    lowered = (word.lower() for word in word_tokenize(query, language=lang))
    return [word for word in lowered if word not in stop_set]

In [8]:
def get_ls_steming_tokins():
    """Prompt for a query and return it stemmed and tokenized.

    Returns
    -------
    tuple
        ``(query, query_to)``: the stemmed tokens produced by ``Stemming``
        and the tokens produced by ``tokenize`` they were derived from.
    """
    raw_query = input("Please enter a query:\n")
    query_tokens = tokenize(raw_query)
    return Stemming(query_tokens), query_tokens

In [9]:
 # Paths and bare names of the corpus documents; list position is the document id.
 file_dirs, file_names = fileNames(path, type_files)

In [10]:
def tf_dict():
    """Build the term-frequency table of the corpus and save it as 'tf.csv'.

    Every path in the notebook-level ``file_dirs`` is read, tokenized with
    ``tokenize`` and stemmed with ``Stemming``; the occurrences of each stem
    are then counted per document.

    Returns
    -------
    pandas.DataFrame
        One row per document (indexed by its position in ``file_dirs``) and
        one column per stem; a stem missing from a document has count 0.
    """
    doc_vocab = {}
    for doc_id, doc_path in enumerate(file_dirs):
        with open(doc_path, 'r', encoding="utf-8") as doc:
            stems = Stemming(tokenize(doc.read()))
        # {doc_id: {stem: count, ...}}
        doc_vocab[doc_id] = dict(Counter(stems))

    term_pd = (
        pd.DataFrame.from_dict(doc_vocab, orient='index')
        # Guarantee one row per document even when a document yields no
        # tokens, so row labels stay aligned with file_dirs / file_names.
        .reindex(range(len(file_dirs)))
        .fillna(0)
        .sort_index()
    )
    term_pd.to_csv('tf.csv')
    return term_pd

In [11]:
# Term-frequency table: rows are documents, columns are stems (also written to tf.csv).
term_tf = tf_dict()

In [12]:
# Document frequency: in how many documents each term occurs at least once.
df_ls = term_tf[term_tf > 0].count()
# Inverse document frequency in base 10, the same base the document weights
# use, so that query and document idf values are on one scale.
idf = np.log10(number_files / df_ls.values)
idf_dict = pd.Series(data=idf, index=df_ls.index).to_dict()
# Float copy of the counts: the tf-idf weights stored in it are fractional.
tf_idf = term_tf.astype(float)

In [13]:
# Document weights: (1 + ln(tf)) * log10(N / df) where tf > 0, otherwise 0.
# Computed on whole frames instead of assigning through chained indexing
# (tf_idf[term][doc] = ...), which is not guaranteed to write back into the
# DataFrame.
idf_per_term = np.log10(number_files / df_ls)
positive_tf = term_tf.where(term_tf > 0)  # zero counts become NaN
tf_idf = ((1 + np.log(positive_tf)) * idf_per_term).fillna(0.0)

In [14]:
# Persist the document tf-idf weights for inspection.
tf_idf.to_csv(path_or_buf='tf_idf2.csv')


In [15]:
# Euclidean length of each document's tf-idf row, in the order of tf_idf.index.
# The frame is squared once rather than once per document.
lenth_docs = np.sqrt((tf_idf ** 2).sum(axis=1)).tolist()

In [18]:
# Scale every document row to unit length; a row of length 0 stays all zeros.
# lenth_docs is in the order of tf_idf.index, so it is attached positionally.
doc_lengths = pd.Series(lenth_docs, index=tf_idf.index, dtype=float)
# Zero lengths become NaN so the division yields NaN, which is then reset to 0.
normalize_data = tf_idf.div(doc_lengths.where(doc_lengths != 0), axis=0).fillna(0.0)

In [19]:
# Ask for the query, then count how often each stemmed query term occurs.
query, query_to = get_ls_steming_tokins()
tf_query = Counter(query)


In [23]:
# Query weights: (1 + ln(tf)) * idf for terms present in idf_dict, else 0.
tf_idf_query = {
    term: (1 + np.log(count)) * idf_dict[term] if term in idf_dict else 0
    for term, count in tf_query.items()
}


In [24]:
# Euclidean length of the query weight vector.
lenth_query = math.sqrt(sum(weight ** 2 for weight in tf_idf_query.values()))

    

In [25]:
# Scale the query weights to unit length; a zero-length query maps every term to 0.
if lenth_query != 0:
    normalize_query = {
        term: weight / lenth_query for term, weight in tf_idf_query.items()
    }
else:
    normalize_query = dict.fromkeys(tf_idf_query, 0)

In [28]:
# Keep only the columns of the normalised document weights whose term is in the query.
query_columns = [term for term in normalize_data if term in normalize_query]
sim = normalize_data[query_columns].copy()


In [30]:
# Per-term products of normalised document and query weights; their row sums
# are the cosine similarities.  Only terms that are columns of normalize_data
# take part: a query term absent from the corpus contributes nothing and
# must not be looked up as a column (that raises KeyError).
shared_terms = [term for term in normalize_data.columns if term in normalize_query]
query_weights = pd.Series(normalize_query, dtype=float).reindex(shared_terms)
sim = normalize_data[shared_terms].mul(query_weights, axis=1)

In [42]:
# Report each document's similarity to the query: the sum of its row in sim.
for doc_id in range(len(sim.index)):
    doc_name = file_names[doc_id]
    score = sum(sim.loc[doc_id])
    print('sim of', doc_name, '->', "%.2f" % score)

sim of doc1.txt -> 0.45
sim of doc10.txt -> 0.29
sim of doc2.txt -> 0.03
sim of doc3.txt -> 0.00
sim of doc4.txt -> 0.00
sim of doc5.txt -> 0.00
sim of doc6.txt -> 0.29
sim of doc7.txt -> 0.03
sim of doc8.txt -> 0.29
sim of doc9.txt -> 0.03
