In [2]:
import numpy as np
import pandas as pd
import re
from math import log 

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
#loading dataset
DF= pd.read_csv('inaugural.csv',usecols=['yrprez','speech'])
DF.head(5)

Unnamed: 0,yrprez,speech
0,1789-Washington,Fellow - Citizens of the Senate and of the Hou...
1,1793-Washington,"Fellow citizens, I am again called upon by the..."
2,1797-Adams,"When it was first perceived, in early times, t..."
3,1801-Jefferson,Friends and Fellow Citizens: Called upon to un...
4,1805-Jefferson,"Proceeding, fellow citizens, to that qualifica..."


In [5]:
print(DF.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   yrprez  56 non-null     object
 1   speech  56 non-null     object
dtypes: object(2)
memory usage: 1.0+ KB
None


#Preprocess


In [6]:
def preprocess(data):
    '''
    takes data as input
    cleans data by 
        -removing all non alphanumeric terms
        -removes common english words 
        -uses lemmatization to find similiar meaning words
    
    returns clean data
    '''
    for i in range(len(data)):
    
        data[i]= re.sub(r'\W|[0-9+]',' ',data[i]) #matches any non-alphanumeric term 
        data[i]= re.sub(r'\s+',' ',data[i]) #matches extra whitespaces

        data[i]= data[i].lower().split() #tokenize the data

    sw= stopwords.words('english') #common english words
    for i in range(len(data)):
        data[i]=[word for word in data[i] if word not in sw] #Removing stop word
        lemma = WordNetLemmatizer()
        data[i]=[lemma.lemmatize(word) for word in data[i]] #Maps word to its base form
    
    return data


In [7]:
data = [i for i in DF['speech']]
data = preprocess(data) #preprocessing the data


#TF IDF TFIDF

In [8]:
def compute_doc_tf(doc):
    '''
    Computes term frequency for each word in a document 
    '''
    doctf={} #stores term frequency of each word
    
    #Counts frequency of each word in doc
    for word in doc:
        if word in doctf:
            doctf[word]+=1
        else:
            doctf[word]=1
    #calculates term frequecy of each word in doc
    for word in doctf:
        doctf[word]= log(1+doctf[word],10)
        
    return doctf 

In [9]:
tf=[] #stores list of dictionary containing term frequencies
for doc in data:
    tf.append(compute_doc_tf(doc))

In [10]:
doc_index={} #stores index of all document containing a word

#Computing index for each word in data
for i,doc in enumerate(data):
    for word in doc:
        try:
            doc_index[word].add(i)
        except:
            doc_index[word] = {i}

In [11]:
idf={} #stores inverse document frequency for each word

#computing freq of word in whole data
for doc in data:
        for word in doc:
            if word in idf:
                idf[word]+=1
            else:
                idf[word]=1

#calulating idf for each word
for word in idf:
    idf[word]=(log(len(data),10) - log(idf[word],10))

In [12]:
tfidf={} #stores the product of term frequency and inverse term frequency

#computing tfidf for each word
for i,doc in enumerate(tf):
    for word in doc:
        tfidf[i,word]= tf[i][word]*idf[word]

In [13]:
print(tf)



In [14]:
print(doc_index)



In [15]:
print(idf)



In [16]:
print(tfidf)



#Ranking

In [17]:
def rank_docs(query):
    '''
    Ranks each document w.r.t query passed as argument 
    Returns a dict containing relevance for each doc
    '''
    query=preprocess(query) #perprocessing query
    score={} #stores ranking score of each doc against query
    for key in tfidf:
        if key[1] in query[0]:
            try:
                score[key[0]]+= tfidf[key]
            except:
                score[key[0]] = tfidf[key]
                
    #sorting score dict in descending order
    score = sorted(score.items(),key= lambda x: x[1] ,reverse = True) 
    return score

In [18]:
query = input() #input query
query = [query] #string query to list query
scores = rank_docs(query) #scoring each doc against query

In the face of our common dangers, in this winter of our hardship, let us remember these timeless words. With hope and virtue, let us brave once more the icy currents, and endure what storms may come


In [19]:
result={} #stores index of all relevant docs with thier ranking score

i= 5 # no. of maximum relevant documents required

#calculating results
for doc in scores:
    if( ( doc[1] <= 0) or  (not i)) :
        break
    result[doc[0]] = doc[1]
    i-=1

In [20]:
#Printing relevance docs info
try:
    m1= min(result.values()) 
    m2= max(result.values())

    for key,value in result.items():
        if value <= 0:
            print("No match found")
            break
        print(DF.iloc[key]['yrprez'],end=' ')
        per= (value - m1)/(m1 + m2) #normalising score
        if per:
            print('%.2f'%(per*100),'%')
        else:
            print('%.2f'%value)
except:
    print('Invalid search sequence')


2009-Obama 1.12
