In [1]:
import re
import pickle
import os
import string
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import operator
import itertools
import webbrowser

In [2]:
# Load the precomputed index artifacts from ./Storage.
# NOTE: unpickling can execute arbitrary code, so these files must come from a trusted source.
vocab = pd.read_pickle(r'./Storage/words.pkl')
# Mapping indexed by token; find_relevant also reads its .values() in order.
inv_doc_freq = pd.read_pickle(r'./Storage/inv_doc_freq.pkl')
# Mapping of doc_id -> document vector (stored bz2-compressed).
doc_vector = pd.read_pickle('./Storage/doc_vec.pkl', compression='bz2')
# zone[doc_id] is printed next to each result (presumably the page title).
zone = pd.read_pickle(r'./Storage/zone.pkl')
# Mapping of doc_id -> zone vector.
zone_vec = pd.read_pickle(r'./Storage/zone_vec.pkl')

In [3]:
def _load_zeroed_template(path):
    """Load a bz2-compressed pickled DataFrame, drop all its rows and add one zero row at index 0.

    Only the column layout of the stored frame is kept; the single zero row is
    the slot that a query's per-term values are written into.
    """
    frame = pd.read_pickle(path, compression='bz2')
    frame = frame.drop(frame.index)  # discard every stored row, keep the columns
    frame.loc[0] = 0
    return frame


# One-row frames reused for every query: `buffer` holds the query term counts,
# `zone_buffer` holds the query's zone weights.
buffer = _load_zeroed_template('./Storage/df.pkl')
zone_buffer = _load_zeroed_template('./Storage/zone_df.pkl')

In [4]:
def _unit_vector(vec):
    """Return `vec` scaled to unit Euclidean length; all zeros if its norm is zero or not finite."""
    vec = np.asarray(vec, dtype=float)
    norm = np.linalg.norm(vec)
    if not norm > 0:  # also true when norm is NaN
        return np.zeros_like(vec)
    # nan_to_num cleans up inf/inf entries that can appear when the norm itself is infinite.
    return np.nan_to_num(vec / norm)


def find_relevant(query, open_web, top_k=10, threshold=0.1):
    """Rank the indexed documents against a free-text query and print the best matches.

    The query is lower-cased, stripped of punctuation, tokenized and de-duplicated.
    A tf-idf query vector is scored against every vector in `doc_vector`; a zone
    score against `zone_vec`, scaled by the best content score, is added on top.

    Parameters
    ----------
    query : str
        Free-text query.
    open_web : bool
        If true, each printed result is also opened in the default web browser.
    top_k : int, optional
        Number of results to print (default 10).
    threshold : float, optional
        A token only contributes to the zone vector when its inverse document
        frequency is above this value (default 0.1), which filters very frequent words.

    Returns
    -------
    None
        Results are printed as ``doc_id score zone[doc_id]``.

    Notes
    -----
    Overwrites row 0 of the module-level `buffer` and `zone_buffer` frames.
    Tokens that are not columns of those frames are ignored.
    """
    # Lower-case, then blank out ASCII punctuation and a few typographic quotes/dashes.
    query = query.lower()
    query = query.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    query = query.translate(str.maketrans("‘’’–——−", '       '))

    # Duplicates are dropped, so every distinct query term is counted once.
    query_words = set(word_tokenize(query))

    # Reset the shared one-row frames left over from the previous query.
    buffer.loc[0] = 0
    zone_buffer.loc[0] = 0

    for token in query_words:
        # Out-of-vocabulary tokens are skipped rather than raising KeyError.
        if token in buffer.columns:
            buffer[token] += 1
        if token in zone_buffer.columns:
            token_idf = inv_doc_freq.get(token, 0)
            if token_idf > threshold:
                zone_buffer[token] += token_idf

    # Query weights: (1 + log10(tf)) * idf for terms present in the query, 0 otherwise.
    # Taking the log only where tf > 0 avoids -inf * idf, which is NaN/+inf for
    # idf <= 0 and would otherwise wipe out the whole vector during normalisation.
    term_freq = np.asarray(buffer.loc[0], dtype=float)
    # Assumes inv_doc_freq's value order matches buffer's column order — TODO confirm.
    idf_weights = np.asarray(list(inv_doc_freq.values()), dtype=float)
    tf_weights = np.zeros_like(term_freq)
    present = term_freq > 0
    tf_weights[present] = 1 + np.log10(term_freq[present])
    query_vec = _unit_vector(tf_weights * idf_weights)

    if not query_vec.any():
        # Every content score would be 0 and the zone bonus is multiplied by the
        # best content score, so any ranking would be arbitrary.
        print("No documents match the query.")
        return

    zone_query_vec = _unit_vector(zone_buffer.loc[0])

    # Content score: dot product of the query vector with each document vector.
    scores = {}
    for doc_id, sub_vector in doc_vector.items():
        scores[doc_id] = np.sum(np.multiply(query_vec, sub_vector))

    # The zone match is scaled by the best content score before being added.
    maxval = max(scores.values(), default=0.0)
    for doc_id, sub_vector in zone_vec.items():
        zone_score = np.sum(np.multiply(zone_query_vec, sub_vector)) * maxval
        scores[doc_id] = scores.get(doc_id, 0.0) + zone_score

    # Highest score first; sorted() is stable, so ties keep insertion order.
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    for doc_id, score in ranked[:top_k]:
        print(doc_id, round(score, 3), zone[doc_id])
        # Opening the webpages in a browser for easy checking
        if open_web:
            webbrowser.open('https://en.wikipedia.org/wiki?curid=' + str(doc_id))

In [5]:
# Prompt for a free-text query and rank the indexed documents against it.
# open_web=True also opens each printed result in the default browser.
query = input("Type the query: ")
find_relevant(query=query, open_web=True)

615 0.181 American Football Conference
925 0.107 Asociación Alumni
1110 0.079 Demographics of American Samoa
1109 0.069 Geography of American Samoa
966 0.061 American shot
1111 0.056 Politics of American Samoa
1241 0.049 American (word)
951 0.048 Antigua and Barbuda
659 0.045 American National Standards Institute
600 0.043 Andorra
