# **Imports**

In [236]:
import os 
import numpy as np
import pandas as pd
import string

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

import nltk

from nltk.corpus import webtext
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 

# nltk.download('stopwords')
# nltk.download('webtext')
# nltk.download('punkt')

# **Reading Files**

In [237]:
# Read document 1 on its own to inspect the raw text before loading the whole folder.
# NOTE(review): the absolute path is specific to one Windows machine; adjust it before re-running elsewhere.
Text1 = webtext.raw(r"C:\Users\ahmed\Desktop\DocumentCollection\Documents\1.txt")
Text1

'antony brutus caeser cleopatra mercy worser'

In [238]:
# Folder that holds the corpus: plain-text documents named 1.txt, 2.txt, ...
Path_To_Files = r"C:\Users\ahmed\Desktop\DocumentCollection\Documents"

# Keep only the numbered .txt documents and order them numerically, so that a
# stray file in the folder (or listdir returning '10.txt' before '2.txt')
# cannot shift or break the doc numbering.
Files = sorted(
    (
        name for name in os.listdir(Path_To_Files)
        if name.lower().endswith('.txt') and os.path.splitext(name)[0].isdigit()
    ),
    key=lambda name: int(os.path.splitext(name)[0]),
)

# Map 'doc<N>' -> raw text of <N>.txt, in ascending document order.
Text_Files = {
    'doc' + str(int(os.path.splitext(name)[0])): webtext.raw(os.path.join(Path_To_Files, name))
    for name in Files
}

Text_Files

{'doc1': 'antony brutus caeser cleopatra mercy worser',
 'doc2': 'antony brutus caeser calpurnia ',
 'doc3': 'mercy worser',
 'doc4': 'brutus caeser mercy worser',
 'doc5': 'caeser mercy worser',
 'doc6': 'antony caeser mercy ',
 'doc7': 'angels fools fear in rush to tread where',
 'doc8': 'angels fools fear in rush to tread where',
 'doc9': 'angels fools in rush to tread where',
 'doc10': 'fools fear in rush to tread where'}

In [239]:
# Same documents as Text_Files, as a plain list ordered doc1, doc2, ...
# Reusing the texts already loaded above avoids reading every file from disk a
# second time and keeps the corpus path defined in one place only.
All_Text = list(Text_Files.values())

All_Text

['antony brutus caeser cleopatra mercy worser',
 'antony brutus caeser calpurnia ',
 'mercy worser',
 'brutus caeser mercy worser',
 'caeser mercy worser',
 'antony caeser mercy ',
 'angels fools fear in rush to tread where',
 'angels fools fear in rush to tread where',
 'angels fools in rush to tread where',
 'fools fear in rush to tread where']

# **Word Tokenization**

In [240]:
print(word_tokenize(Text1))

['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser']


In [241]:
# Tokenize every document; the result holds one token list per document,
# in the same order as Text_Files.
All_Tokens = [word_tokenize(document_text) for document_text in Text_Files.values()]

All_Tokens

[['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser'],
 ['antony', 'brutus', 'caeser', 'calpurnia'],
 ['mercy', 'worser'],
 ['brutus', 'caeser', 'mercy', 'worser'],
 ['caeser', 'mercy', 'worser'],
 ['antony', 'caeser', 'mercy'],
 ['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['angels', 'fools', 'fear', 'in', 'rush', 'to', 'tread', 'where'],
 ['angels', 'fools', 'in', 'rush', 'to', 'tread', 'where'],
 ['fools', 'fear', 'in', 'rush', 'to', 'tread', 'where']]

In [242]:
# Flatten the per-document token lists into one 1-D array of tokens.
# The tokens are recomputed from Text_Files rather than from All_Tokens itself:
# re-running the original form would call np.concatenate on the already
# flattened array and fail, so this version is safe to execute repeatedly.
All_Tokens = np.concatenate(
    [word_tokenize(document_text) for document_text in Text_Files.values()]
)
All_Tokens

array(['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser',
       'antony', 'brutus', 'caeser', 'calpurnia', 'mercy', 'worser',
       'brutus', 'caeser', 'mercy', 'worser', 'caeser', 'mercy', 'worser',
       'antony', 'caeser', 'mercy', 'angels', 'fools', 'fear', 'in',
       'rush', 'to', 'tread', 'where', 'angels', 'fools', 'fear', 'in',
       'rush', 'to', 'tread', 'where', 'angels', 'fools', 'in', 'rush',
       'to', 'tread', 'where', 'fools', 'fear', 'in', 'rush', 'to',
       'tread', 'where'], dtype='<U9')

# **Removing Stop Words**

In [243]:
Stop_Words = set( stopwords.words('english') )

len(Stop_Words)

179

**Excluding `['in','to','where']`**

In [244]:
# Stop-word set with 'in', 'to' and 'where' taken out, so those three words
# are kept as regular tokens.
Stop_Words_Excluded = Stop_Words - {'in', 'to', 'where'}

len(Stop_Words_Excluded)

176

In [245]:
len(All_Tokens)

52

In [246]:
# Keep only the tokens that are not in the reduced stop-word set.
Clean_Tokens = list(
    filter(lambda token: token not in Stop_Words_Excluded, All_Tokens)
)

Clean_Tokens

['antony',
 'brutus',
 'caeser',
 'cleopatra',
 'mercy',
 'worser',
 'antony',
 'brutus',
 'caeser',
 'calpurnia',
 'mercy',
 'worser',
 'brutus',
 'caeser',
 'mercy',
 'worser',
 'caeser',
 'mercy',
 'worser',
 'antony',
 'caeser',
 'mercy',
 'angels',
 'fools',
 'fear',
 'in',
 'rush',
 'to',
 'tread',
 'where',
 'angels',
 'fools',
 'fear',
 'in',
 'rush',
 'to',
 'tread',
 'where',
 'angels',
 'fools',
 'in',
 'rush',
 'to',
 'tread',
 'where',
 'fools',
 'fear',
 'in',
 'rush',
 'to',
 'tread',
 'where']

In [247]:
len(Clean_Tokens)

52

In [248]:
len(set(Clean_Tokens))

15

In [249]:
Final_Tokens = set(Clean_Tokens)
Final_Tokens

{'angels',
 'antony',
 'brutus',
 'caeser',
 'calpurnia',
 'cleopatra',
 'fear',
 'fools',
 'in',
 'mercy',
 'rush',
 'to',
 'tread',
 'where',
 'worser'}

In [250]:
def RemovePunctuation(text):
    """
    Strip every character listed in `string.punctuation` from the given text.

    text   : a single string, or a list / tuple of strings.

    return : the cleaned string, or a list of cleaned strings when a
             list / tuple was passed.
    """

    # Build the deletion table once instead of once per sentence.
    punctuation_table = str.maketrans('', '', string.punctuation)

    # isinstance (rather than `type(text) == list`) also accepts list
    # subclasses and tuples, which previously fell through to the string
    # branch and raised AttributeError.
    if isinstance(text, (list, tuple)):
        return [sentence.translate(punctuation_table) for sentence in text]

    return text.translate(punctuation_table)

In [251]:
def Preprocess_Text(text):
    """
    return passed text : Tokenized, Lowercased, Stopwords and Punctuation removed.

    text   : raw string to clean.

    return : list of lower-cased tokens that are not in Stop_Words_Excluded.
    """

    # Remove Punctuation
    cleanText = RemovePunctuation(text)

    # tokenize
    textTokens = word_tokenize(cleanText)

    # Lowercase BEFORE the stop-word test: filtering on the original casing
    # let capitalised stop words (e.g. "The") slip through, because they do
    # not equal their lower-case entry in the stop-word set.
    loweredTokens = (word.lower() for word in textTokens)

    cleanTextTokens = [word for word in loweredTokens if word not in Stop_Words_Excluded]

    return cleanTextTokens

In [252]:
Preprocess_Text(All_Text[0])

['antony', 'brutus', 'caeser', 'cleopatra', 'mercy', 'worser']

# ---------------------------------------------------------------------------------

# **Making Positional Indexes**

In [253]:
Text_Files

{'doc1': 'antony brutus caeser cleopatra mercy worser',
 'doc2': 'antony brutus caeser calpurnia ',
 'doc3': 'mercy worser',
 'doc4': 'brutus caeser mercy worser',
 'doc5': 'caeser mercy worser',
 'doc6': 'antony caeser mercy ',
 'doc7': 'angels fools fear in rush to tread where',
 'doc8': 'angels fools fear in rush to tread where',
 'doc9': 'angels fools in rush to tread where',
 'doc10': 'fools fear in rush to tread where'}

In [254]:
All_Text

['antony brutus caeser cleopatra mercy worser',
 'antony brutus caeser calpurnia ',
 'mercy worser',
 'brutus caeser mercy worser',
 'caeser mercy worser',
 'antony caeser mercy ',
 'angels fools fear in rush to tread where',
 'angels fools fear in rush to tread where',
 'angels fools in rush to tread where',
 'fools fear in rush to tread where']

**Generating Postings**

In [255]:
# Build the positional index in plain dicts first and create the DataFrames
# once at the end. The previous version grew both frames column-by-column and
# bumped counts with `frequency[token][0] += 1`, a chained assignment that
# pandas does not guarantee to write back (it is silently lost under
# Copy-on-Write).
postings_by_token = {}   # token -> [[docNum, {positions}], ...] in ascending docNum order
doc_frequency = {}       # token -> number of documents containing the token

for docNum, text in enumerate(All_Text, start=1):

    for pos, token in enumerate(Preprocess_Text(text)):

        token_postings = postings_by_token.setdefault(token, [])

        # Documents are processed in order, so if this document already has an
        # entry for the token it is necessarily the last one in the list.
        if token_postings and token_postings[-1][0] == docNum:

            token_postings[-1][1].add(pos)  # word occurs more than once in the document

        else:

            token_postings.append([docNum, {pos}])  # first posting of the token in this document

            doc_frequency[token] = doc_frequency.get(token, 0) + 1

# Same layout as before: one row labelled 0, one column per token, with the
# most recently discovered token in the first column.
newest_first = list(postings_by_token)[::-1]

Postings_DF = pd.DataFrame([postings_by_token], columns=newest_first)

frequency = pd.DataFrame([doc_frequency], columns=newest_first)

In [256]:
# Show the index with a readable row label WITHOUT rebinding Postings_DF.
# Overwriting it with the renamed frame removes the row label 0, and every
# later `Postings_DF[token][0]` lookup then depends on pandas' deprecated
# positional fallback for integer keys.
Postings_DF.rename({0:'Postings'}).T

Unnamed: 0,Postings
where,"[[7, {7}], [8, {7}], [9, {6}], [10, {6}]]"
tread,"[[7, {6}], [8, {6}], [9, {5}], [10, {5}]]"
to,"[[7, {5}], [8, {5}], [9, {4}], [10, {4}]]"
rush,"[[7, {4}], [8, {4}], [9, {3}], [10, {3}]]"
in,"[[7, {3}], [8, {3}], [9, {2}], [10, {2}]]"
fear,"[[7, {2}], [8, {2}], [10, {1}]]"
fools,"[[7, {1}], [8, {1}], [9, {1}], [10, {0}]]"
angels,"[[7, {0}], [8, {0}], [9, {0}]]"
calpurnia,"[[2, {3}]]"
worser,"[[1, {5}], [3, {1}], [4, {3}], [5, {2}]]"


In [257]:
# Relabel the single row so the transposed table shows a "Frequency" column.
frequency = frequency.rename(index={0: 'Frequency'})
frequency.T

Unnamed: 0,Frequency
where,4
tread,4
to,4
rush,4
in,4
fear,3
fools,4
angels,3
calpurnia,1
worser,4


# **Positional Index Queries**

In [258]:
def Get_Matched_DocNums_WordsPostions(posting_values, doc):
    """
    Look up the positions recorded for one document inside a postings list.

    posting_values : iterable of [docNumber, positions] entries.
    doc            : document number to look for.

    return : the positions of the first entry whose docNumber equals `doc`,
             or an empty list when no entry matches.
    """

    matching_positions = (entry[1] for entry in posting_values if entry[0] == doc)

    return next(matching_positions, [])

In [259]:
def Intial_Matchings_Postings(word):
    """
    Expand the postings of the first query token into (docNumber, position) pairs.

    word   : token to look up; must be a column of Postings_DF (KeyError otherwise).

    return : list of (docNumber, position) tuples, one per occurrence of the token.
    """

    # .iloc[0] reads the single postings row by position, so the lookup works
    # whether that row is still labelled 0 or has been renamed for display.
    word_postings = Postings_DF[word].iloc[0]

    return [
        (word_posting[0], position)
        for word_posting in word_postings
        for position in word_posting[1]
    ]

In [260]:
def Match_Positional_Index(First_Postings, RemainQuery):
    """
    Find the documents in which the remaining query tokens directly follow the first token.

    First_Postings : list of (docNumber, position) pairs for the first query token.
    RemainQuery    : the query tokens after the first one, in phrase order.

    A document matches when, for some occurrence of the first token at position p,
    the k-th remaining token occurs at position p + k in the same document.

    return : set of matched document numbers. An empty RemainQuery matches every
             document present in First_Postings; a remaining token that is not in
             the index yields an empty set.
    """

    # Resolve the postings of every remaining token once, up front, instead of
    # once per occurrence of the first token.
    remaining_postings = []

    for remainToken in RemainQuery:

        if remainToken not in Postings_DF:  # token never indexed -> phrase cannot occur anywhere

            return set()

        # .iloc[0] reads the postings row by position, independent of its label.
        remaining_postings.append(Postings_DF[remainToken].iloc[0])

    Documents_Matched = set()

    for postings in First_Postings:  # (docNumber, position)

        First_Postings_docNum = postings[0]

        First_Postings_Postion = postings[1]

        for offset, remainToken_Postings in enumerate(remaining_postings, start=1):

            # Returns [] when the document does not contain the token at all,
            # which makes the membership test below fail as required.
            token_positions = Get_Matched_DocNums_WordsPostions(remainToken_Postings, First_Postings_docNum)

            if First_Postings_Postion + offset not in token_positions:

                break  # phrase broken at this token; try the next occurrence

        else:

            # Loop finished without a break: every remaining token lined up.
            Documents_Matched.add(First_Postings_docNum)

    return Documents_Matched

In [261]:
def get_Postional_Matches(query):
    """
    Preprocess query , Return query tokens matched document files.

    query  : free-text phrase query.

    return : list of 'doc<N>' labels, in ascending document number, of the documents
             that contain the query tokens as a consecutive phrase. The list is empty
             when the query has no usable token or contains a token that is not in
             the positional index.
    """

    query_tokens = Preprocess_Text(query)

    print( "Query Tokens :" , query_tokens)

    if not query_tokens or any(token not in Postings_DF for token in query_tokens):

        # Nothing to search for, or a token that was never indexed: no document can match.
        matched_numbers = []

    elif len(query_tokens) == 1:

        # Query Only One Word : every document in the token's postings matches.
        # .iloc[0] reads the postings row by position, independent of its label.
        matched_numbers = [posting[0] for posting in Postings_DF[query_tokens[0]].iloc[0]]

    else:

        # Get First Positions, then check the rest of the phrase against them.
        First_Postings = Intial_Matchings_Postings(query_tokens[0])

        matched_numbers = Match_Positional_Index(First_Postings, query_tokens[1:])

    # Handling Output : sort numerically so the result order does not depend on set iteration order.
    MatchedDocs = ['doc'+str(x) for x in sorted(matched_numbers)]

    print("Matched Document Files :", MatchedDocs); print('\n')

    return MatchedDocs

In [262]:
def Print_ResultFiles(results):
    """
    Print one line per result label: the label, ' : ', then the text stored
    for that label in Text_Files.
    """

    for doc_label in results:
        line = doc_label + " : " + Text_Files[doc_label]
        print(line)

In [263]:
query = 'antony brutus'

Postional_results = get_Postional_Matches(query)

Query Tokens : ['antony', 'brutus']
Matched Document Files : ['doc1', 'doc2']




In [264]:
print('Postional Index Matched Documents: ')

Print_ResultFiles(Postional_results)

Postional Index Matched Documents: 
doc1 : antony brutus caeser cleopatra mercy worser
doc2 : antony brutus caeser calpurnia 


# ---------------------------------------------------------------------------------

# **Documents Vectorizing**

## **TF and w tf(1+ log tf)**

### **TF**

In [265]:
All_Text

['antony brutus caeser cleopatra mercy worser',
 'antony brutus caeser calpurnia ',
 'mercy worser',
 'brutus caeser mercy worser',
 'caeser mercy worser',
 'antony caeser mercy ',
 'angels fools fear in rush to tread where',
 'angels fools fear in rush to tread where',
 'angels fools in rush to tread where',
 'fools fear in rush to tread where']

In [266]:
Tokens = ['antony','brutus','caeser','calpurnia','cleopatra','mercy','worser',
          'angels','fools','fear','in','rush','to','tread','where']

In [267]:
Temp_TF = {token : 0 for token in Tokens}
Temp_TF

{'antony': 0,
 'brutus': 0,
 'caeser': 0,
 'calpurnia': 0,
 'cleopatra': 0,
 'mercy': 0,
 'worser': 0,
 'angels': 0,
 'fools': 0,
 'fear': 0,
 'in': 0,
 'rush': 0,
 'to': 0,
 'tread': 0,
 'where': 0}

In [268]:
def Calculate_TF(All_Text, tokens=None):
    """
    Count raw term frequencies per document.

    All_Text : iterable of document strings; each one is split on whitespace.
    tokens   : vocabulary to count; defaults to the notebook-level `Tokens` list.

    return   : DataFrame with one row per vocabulary token and one column per
               document ('doc1', 'doc2', ...), holding the occurrence counts.

    raises   : KeyError if a document contains a word that is not in the vocabulary.
    """

    vocabulary = Tokens if tokens is None else list(tokens)

    TF = {}

    for i, doc in enumerate(All_Text):

        # Fresh zeroed counter per document, in vocabulary order.
        doc_counts = dict.fromkeys(vocabulary, 0)

        for word in doc.split():

            # A plain increment covers both the "first seen" and "seen again"
            # cases that were previously handled by two separate branches.
            doc_counts[word] += 1

        TF['doc' + str(i+1)] = doc_counts

    return pd.DataFrame(TF)

In [269]:
TF = Calculate_TF(All_Text)

TF.style.background_gradient(cmap='PuBu',axis=1)

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
antony,1,1,0,0,0,1,0,0,0,0
brutus,1,1,0,1,0,0,0,0,0,0
caeser,1,1,0,1,1,1,0,0,0,0
calpurnia,0,1,0,0,0,0,0,0,0,0
cleopatra,1,0,0,0,0,0,0,0,0,0
mercy,1,0,1,1,1,1,0,0,0,0
worser,1,0,1,1,1,0,0,0,0,0
angels,0,0,0,0,0,0,1,1,1,0
fools,0,0,0,0,0,0,1,1,1,1
fear,0,0,0,0,0,0,1,1,0,1


### **w tf(1+ log tf)**

In [270]:
def Calculate_TF_WeightedLog(TF):
    """
    Apply log weighting to a term-frequency table.

    TF     : DataFrame of raw term frequencies (rows = tokens, columns = documents).

    return : new DataFrame of the same shape where each non-zero count tf becomes
             1 + log(tf) and zero counts stay 0. The input frame is not modified.
    """

    # NOTE(review): np.log is the natural logarithm while Calculate_IDF uses
    # log10 - confirm which base is intended for tf > 1.
    def _log_weight(count):
        if count != 0:
            return 1 + np.log(count)
        return 0

    weighted = TF.copy()

    for doc_name in TF:
        weighted[doc_name] = TF[doc_name].apply(_log_weight)

    return weighted

In [271]:
TF_WightedLog = Calculate_TF_WeightedLog(TF)

TF_WightedLog.style.background_gradient(cmap='PuBu',axis=1)

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
antony,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
brutus,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
caeser,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
calpurnia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mercy,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
worser,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
angels,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
fools,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
fear,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


## **DF , IDF and TF-IDF**

### **DF and IDF**

In [272]:
Tokens

['antony',
 'brutus',
 'caeser',
 'calpurnia',
 'cleopatra',
 'mercy',
 'worser',
 'angels',
 'fools',
 'fear',
 'in',
 'rush',
 'to',
 'tread',
 'where']

In [273]:
def Calculate_DF(All_Text, tokens=None):
    """
    Count, for every vocabulary token, the number of documents that contain it.

    All_Text : iterable of document strings; each one is split on whitespace.
    tokens   : vocabulary to count; defaults to the notebook-level `Tokens` list.

    return   : single-column DataFrame ('df') indexed by token.

    raises   : KeyError if a document contains a word that is not in the vocabulary.
    """

    vocabulary = Tokens if tokens is None else list(tokens)

    DF = dict.fromkeys(vocabulary, 0)

    for doc in All_Text:

        # set(...) makes each word count once per document; iterating over the
        # raw word list counted every repetition and inflated the document
        # frequency of words that appear more than once in a document.
        for word in set(doc.split()):

            DF[word] += 1

    return pd.Series(DF, name='df').to_frame()

In [274]:
DF = Calculate_DF(All_Text)

DF.style.background_gradient(cmap='PuBu',axis=0)

Unnamed: 0,df
antony,3
brutus,3
caeser,5
calpurnia,1
cleopatra,1
mercy,5
worser,4
angels,3
fools,4
fear,3


In [275]:
def Calculate_IDF(DF, N=10):
    """
    Convert document frequencies into inverse document frequencies, log10(N / df).

    DF     : single-column DataFrame of document frequencies, indexed by token.
    N      : total number of documents in the collection (default 10); pass
             len(All_Text) when the corpus size differs.

    return : new single-column DataFrame ('idf') with the same index. Tokens with
             df == 0 get idf 0 so that they contribute nothing to TF-IDF. The
             input frame is not modified.
    """

    counts = DF.astype(float)

    # Mask df == 0 before dividing: N / 0 would otherwise produce inf (or a
    # ZeroDivisionError) and turn the later TF * IDF product into NaN.
    IDF = np.log10(N / counts.where(counts > 0)).fillna(0.0)

    IDF.columns = ['idf']

    return IDF

In [276]:
IDF = Calculate_IDF(DF)

IDF.style.background_gradient(cmap='PuBu',axis=0)

Unnamed: 0,idf
antony,0.522879
brutus,0.522879
caeser,0.30103
calpurnia,1.0
cleopatra,1.0
mercy,0.30103
worser,0.39794
angels,0.522879
fools,0.39794
fear,0.522879


### **TF-IDF**

In [277]:
TF_IDF = TF_WightedLog.multiply(IDF['idf'], axis=0)

TF_IDF.style.background_gradient(cmap='PuBu',axis=1)

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
antony,0.522879,0.522879,0.0,0.0,0.0,0.522879,0.0,0.0,0.0,0.0
brutus,0.522879,0.522879,0.0,0.522879,0.0,0.0,0.0,0.0,0.0,0.0
caeser,0.30103,0.30103,0.0,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
calpurnia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cleopatra,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mercy,0.30103,0.0,0.30103,0.30103,0.30103,0.30103,0.0,0.0,0.0,0.0
worser,0.39794,0.0,0.39794,0.39794,0.39794,0.0,0.0,0.0,0.0,0.0
angels,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.522879,0.0
fools,0.0,0.0,0.0,0.0,0.0,0.0,0.39794,0.39794,0.39794,0.39794
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.522879,0.522879,0.0,0.522879


## **Document Length and Normalized TF-IDF**

### **Document Length**

In [278]:
def Calculate_DocLenght(TF_IDF):
    """
    Compute the Euclidean length of every document vector.

    TF_IDF : DataFrame of TF-IDF weights (rows = tokens, columns = documents).

    return : single-column DataFrame ('Document Lenght') indexed by the column
             names of TF_IDF, holding sqrt(sum of squared weights) per document.
    """

    # Index the result by the real column names. Rebuilding 'doc1', 'doc2', ...
    # from the enumeration order attached the lengths to the wrong labels
    # whenever the columns were not exactly doc1..docN in that order.
    lengths = np.sqrt((TF_IDF ** 2).sum(axis=0))

    return lengths.to_frame('Document Lenght')

In [279]:
DocLenght = Calculate_DocLenght(TF_IDF)
DocLenght.style.background_gradient(cmap='PuBu',axis=0)

Unnamed: 0,Document Lenght
doc1,1.373462
doc2,1.279618
doc3,0.498974
doc4,0.782941
doc5,0.582747
doc6,0.67427
doc7,1.223496
doc8,1.223496
doc9,1.106137
doc10,1.106137


### **Normalized TF-IDF**

In [280]:
TF_IDF['doc1']  / DocLenght.loc['doc1'].values 

antony       0.380701
brutus       0.380701
caeser       0.219176
calpurnia    0.000000
cleopatra    0.728087
mercy        0.219176
worser       0.289735
angels       0.000000
fools        0.000000
fear         0.000000
in           0.000000
rush         0.000000
to           0.000000
tread        0.000000
where        0.000000
Name: doc1, dtype: float64

In [281]:
def Calculate_Norm_TFIDF(TF_IDF, DocLenght):
    """
    Scale every document vector to unit length.

    TF_IDF    : DataFrame of TF-IDF weights (rows = tokens, columns = documents).
    DocLenght : single-column DataFrame of document lengths, indexed by document name.

    return    : new DataFrame with each column of TF_IDF divided by that document's
                length. A zero-length document keeps its all-zero vector instead of
                becoming NaN. The input frame is not modified.

    raises    : KeyError if a column of TF_IDF has no row in DocLenght.
    """

    # Pick the lengths in column order; .loc raises for any missing document.
    lengths = DocLenght.loc[list(TF_IDF.columns)].iloc[:, 0]

    # Dividing 0 by 0 would yield NaN; a zero-length vector is all zeros, so
    # dividing it by 1 leaves it unchanged.
    safe_lengths = lengths.where(lengths != 0, 1.0)

    return TF_IDF.div(safe_lengths, axis=1)

In [282]:
Norm_TFIDF = Calculate_Norm_TFIDF(TF_IDF, DocLenght)

Norm_TFIDF.style.background_gradient(cmap='PuBu',axis=0)

Unnamed: 0,doc1,doc2,doc3,doc4,doc5,doc6,doc7,doc8,doc9,doc10
antony,0.380701,0.408621,0.0,0.0,0.0,0.775474,0.0,0.0,0.0,0.0
brutus,0.380701,0.408621,0.0,0.667839,0.0,0.0,0.0,0.0,0.0,0.0
caeser,0.219176,0.23525,0.0,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
calpurnia,0.0,0.781483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cleopatra,0.728087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mercy,0.219176,0.0,0.603298,0.384486,0.51657,0.446453,0.0,0.0,0.0,0.0
worser,0.289735,0.0,0.797516,0.508263,0.682869,0.0,0.0,0.0,0.0,0.0
angels,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.472707,0.0
fools,0.0,0.0,0.0,0.0,0.0,0.0,0.325248,0.325248,0.359756,0.359756
fear,0.0,0.0,0.0,0.0,0.0,0.0,0.427365,0.427365,0.0,0.472707


# **Cosine Similarity and Ranking**

## **Query Vectorizing Functions**

In [283]:
def Calculate_QueryTF(All_Text):
    """
    Count how often each query token occurs in the query.

    All_Text : list of query tokens (each entry is split on whitespace before counting).

    return   : single-column DataFrame ('tf') indexed by the distinct tokens.
    """

    term_counts = dict.fromkeys(All_Text, 0)

    for entry in All_Text:
        for term in entry.split():
            term_counts[term] = term_counts[term] + 1

    return pd.DataFrame(term_counts, index=['tf']).transpose()

In [284]:
def Calculate_Query_TFWeightedLog(TF):
    """
    Apply log weighting to the query term frequencies.

    TF     : DataFrame of query term frequencies with a 'tf' column.

    return : new DataFrame where each non-zero count tf becomes 1 + log(tf), zero
             counts stay 0, and the 'tf' column is renamed to 'w-log tf'.
             The input frame is not modified.
    """

    def _log_weight(count):
        if count != 0:
            return 1 + np.log(count)
        return 0

    weighted = TF.copy()

    for column_name in TF:
        weighted[column_name] = TF[column_name].apply(_log_weight)

    return weighted.rename(columns={'tf': 'w-log tf'})

In [285]:
def get_Query_IDF_TFIDF(Query_Tokens, TF_WightedLog_DF, idf_table=None):
    """
    Look up the IDF of every query term and compute the query TF-IDF weights.

    Query_Tokens     : list of query tokens; repeated tokens are collapsed to one row.
    TF_WightedLog_DF : DataFrame with a 'w-log tf' column indexed by query token.
    idf_table        : DataFrame with an 'idf' column indexed by token; defaults to
                       the notebook-level `IDF` table.

    return           : DataFrame indexed by the distinct query tokens with the
                       columns 'idf' and 'tf-idf' (= 'w-log tf' * 'idf').

    raises           : KeyError if a query token is missing from the IDF table.
    """

    if idf_table is None:
        idf_table = IDF

    # Distinct tokens in first-seen order: a repeated token would otherwise
    # create a duplicated index that no longer lines up with the tf table.
    terms = list(dict.fromkeys(Query_Tokens))

    # One label-based lookup replaces the per-token `frame.loc[token]['idf'] = ...`
    # chained assignment, which pandas does not guarantee to write back.
    idf_values = idf_table.loc[terms, 'idf'].astype(float)

    tf_idf_values = TF_WightedLog_DF['w-log tf'].reindex(terms).multiply(idf_values)

    return pd.DataFrame({'idf': idf_values, 'tf-idf': tf_idf_values}, index=terms)

In [286]:
def Calculate_QueryLenght(Query_Tokens , TF_IDF):
    """
    Compute the Euclidean length of the query vector.

    Query_Tokens : not used by the computation.
    TF_IDF       : Series of query TF-IDF weights.

    return       : one-row DataFrame (index 0) with a 'query lenght' column holding
                   sqrt(sum of squared weights).
    """

    squared_total = TF_IDF.apply(lambda weight : weight**2 ).sum()

    return pd.DataFrame({'query lenght': np.sqrt(squared_total)}, index=[0])

In [287]:
def Calculate_Query_Norm_TFIDF(TF_IDF, QueryLenght):
    """
    Scale the query TF-IDF weights by the query length.

    TF_IDF      : Series of query TF-IDF weights (named 'tf-idf').
    QueryLenght : DataFrame whose row 0 holds the query length.

    return      : DataFrame with the scaled weights; a column named 'tf-idf' is
                  renamed to 'normalized tf_idf'. The input Series is not modified.
    """

    query_norm = QueryLenght.loc[0].values

    scaled = pd.DataFrame(TF_IDF.copy() / query_norm)

    return scaled.rename(columns={'tf-idf': 'normalized tf_idf'})

## **Testing Ranked Queries**

In [288]:
def get_Similarity_Ranks(query, Norm_TFIDF, Most_Matched_Docs):
    """
    Rank candidate documents by cosine similarity to the query.

    query             : free-text query.
    Norm_TFIDF        : DataFrame of length-normalised document vectors
                        (rows = tokens, columns = documents).
    Most_Matched_Docs : list of document labels (columns of Norm_TFIDF) to rank.

    Displays the query vector table, the query length and the similarity table.

    return            : Index of document labels ordered from most to least similar;
                        an empty Index when no query term is in the vocabulary.
    """

    # Get Tokens
    # Terms missing from the document vocabulary cannot contribute to the dot
    # product and would make the label lookups below raise KeyError.
    Query_Tokens = [token for token in Preprocess_Text(query) if token in Norm_TFIDF.index]

    if not Query_Tokens:

        return pd.Index([], dtype=object)

    # Vectorize Query

    Tf = Calculate_QueryTF(Query_Tokens)

    Tf_wlog = Calculate_Query_TFWeightedLog(Tf)

    IDF_TFIDF = get_Query_IDF_TFIDF(Query_Tokens, Tf_wlog)

    queryLenght = Calculate_QueryLenght(Query_Tokens, IDF_TFIDF['tf-idf'])

    norm_tf_idf = Calculate_Query_Norm_TFIDF(IDF_TFIDF['tf-idf'], queryLenght)

    Query_DF = pd.concat([Tf, Tf_wlog, IDF_TFIDF ,norm_tf_idf],axis=1)


    display(Query_DF)

    display(queryLenght)


    # Get Similarity and Ranking

    # Select the document rows by the query vector's own index so both operands
    # of the dot product are guaranteed to be in the same term order, and do it
    # with a single .loc call instead of two chained selections done twice.
    Doc_Vectors = Norm_TFIDF.loc[norm_tf_idf.index, Most_Matched_Docs]

    Query_Cosine_Similarity = np.dot( norm_tf_idf.T, Doc_Vectors )

    Query_Cosine_Similarity =  pd.DataFrame(Query_Cosine_Similarity.T , columns=['Simialrity'] ,index=Doc_Vectors.columns)

    Query_Cosine_Similarity = Query_Cosine_Similarity.sort_values(ascending=False ,by='Simialrity')

    display(Query_Cosine_Similarity)

    results = Query_Cosine_Similarity.index

    return results

In [289]:
Postional_results

['doc1', 'doc2']

In [290]:
query = 'antony brutus'

results = get_Similarity_Ranks(query, Norm_TFIDF, Postional_results)

Unnamed: 0,tf,w-log tf,idf,tf-idf,normalized tf_idf
antony,1,1.0,0.522879,0.522879,0.707107
brutus,1,1.0,0.522879,0.522879,0.707107


Unnamed: 0,query lenght
0,0.739462


Unnamed: 0,Simialrity
doc2,0.577877
doc1,0.538393


In [291]:
print('Documents Ranked Based on Similarity: ')

Print_ResultFiles(results)

Documents Ranked Based on Similarity: 
doc2 : antony brutus caeser calpurnia 
doc1 : antony brutus caeser cleopatra mercy worser


# ---------------------------------------------------------------------------------

# **Combining Phases**

In [292]:
def RunQuery(query):
    """
    Run a phrase query end to end: positional-index matching first, then
    cosine-similarity ranking of the matched documents.

    Prints the matched documents and the ranked documents along the way.

    return : the ranked document labels produced by get_Similarity_Ranks.
    """

    # Phase 1 : candidate documents from the positional index
    candidate_docs = get_Postional_Matches(query)

    print('Postional Index Matched Documents: ')
    Print_ResultFiles(candidate_docs)
    print('\n')

    # Phase 2 : rank the candidates by cosine similarity
    ranked_docs = get_Similarity_Ranks(query, Norm_TFIDF, candidate_docs)

    print('Matched Documents Ranked Based on Similarity: ')
    Print_ResultFiles(ranked_docs)

    return ranked_docs

In [293]:
query = 'antony brutus'

result = RunQuery(query)

Query Tokens : ['antony', 'brutus']
Matched Document Files : ['doc1', 'doc2']


Postional Index Matched Documents: 
doc1 : antony brutus caeser cleopatra mercy worser
doc2 : antony brutus caeser calpurnia 




Unnamed: 0,tf,w-log tf,idf,tf-idf,normalized tf_idf
antony,1,1.0,0.522879,0.522879,0.707107
brutus,1,1.0,0.522879,0.522879,0.707107


Unnamed: 0,query lenght
0,0.739462


Unnamed: 0,Simialrity
doc2,0.577877
doc1,0.538393


Matched Documents Ranked Based on Similarity: 
doc2 : antony brutus caeser calpurnia 
doc1 : antony brutus caeser cleopatra mercy worser
