<a href="https://colab.research.google.com/github/anjalinagel12/Google-colab-notebook/blob/master/Query_Search_using_inverse_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import all the required libraries

In [0]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import FreqDist
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

### Loading the data

In [0]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('nlp_hackathon_search.csv')

In [33]:
df.head()

Unnamed: 0,document,question,title,document_id
0,Twilight Princess was released to universal cr...,What kind of scores did Twilight Princess rece...,The_Legend_of_Zelda:_Twilight_Princess,4296602a1bfe11ea8f2b656571b1b549
1,"On 16 August 1960, Cyprus attained independenc...",What is the date that Cyrpus attained independ...,Cyprus,42840a9c1bfe11ea8f2b656571b1b549
2,Gladstone returned to power after the 1892 gen...,How long did Lord Salisbury remain as Prime Mi...,Queen_Victoria,4277df561bfe11ea8f2b656571b1b549
3,The emergence of resistance of bacteria to ant...,What is the purpose of antibiotic treatment?,Antibiotics,42902d7c1bfe11ea8f2b656571b1b549
4,Cetaceans were historically abundant around th...,What other species can be seen close to the sh...,Norfolk_Island,4272a11c1bfe11ea8f2b656571b1b549


In [39]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62399 entries, 0 to 62398
Data columns (total 4 columns):
document       62394 non-null object
question       62352 non-null object
title          62352 non-null object
document_id    62352 non-null object
dtypes: object(4)
memory usage: 1.9+ MB


### Check for null values

In [35]:
df.isnull().sum()


document        5
question       47
title          47
document_id    47
dtype: int64

In [40]:
print("Total Data:",len(df))


Total Data: 62399


In [0]:
# Null values are rare: only 47 of the 62,399 rows contain one, so those rows are simply dropped.

In [0]:
# Keep only rows with no null in any column; `df` itself is left unchanged.
df1=df.dropna()

In [45]:
len(df1)

62352

### Downloading NLTK resources and setting up lemmatization

In [49]:
import nltk
# Punkt models back nltk.tokenize.word_tokenize.
# NOTE(review): word_tokenize is imported but never called in the cells shown here,
# so this download may be unnecessary -- confirm before removing.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [48]:
import nltk
# Corpus read by stopwords.words('english') when the stop-word list is built.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [54]:
import nltk
# WordNet data used by WordNetLemmatizer.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
# Pull the three columns out of the cleaned frame as tuples. All come from df1 in
# row order, so position i of each tuple refers to the same row.
text = tuple(df1['document'])
i_d = tuple(df1['document_id'])
questions = tuple(df1['question'])

In [0]:
# Shared, notebook-level helpers read by the functions defined below.
lemmatizer = WordNetLemmatizer()
# English stop words, as returned by NLTK's stopwords corpus reader.
stopw = stopwords.words('english')

### Tokenizing the data, removing stop words and lemmatizing

In [0]:
def process_doc(doc_list, doc_id, stop_words=None, lemmatize=None):
    """Turn raw texts into lemmatised term lists grouped by id.

    Each text has every run of non-word characters and underscores replaced
    by a single space, is lower-cased and split on whitespace. Tokens found
    in the stop-word collection are dropped; the rest are lemmatised.

    Parameters
    ----------
    doc_list : iterable of str
        The texts to process.
    doc_id : iterable of hashable
        Ids paired positionally with ``doc_list`` (via ``zip``).
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to the notebook-level ``stopw``.
    lemmatize : callable, optional
        Maps a token to its normalised form. Defaults to the notebook-level
        ``lemmatizer.lemmatize``.

    Returns
    -------
    dict
        Maps each id to its list of terms in reading order. When the same id
        occurs more than once, the term lists are concatenated in input order.
    """
    # A set gives O(1) membership tests; the raw stop-word list would be
    # scanned once per token.
    stop_words = frozenset(stopw if stop_words is None else stop_words)
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize

    # Compiled once for the whole batch. Raw string so that \W is a regex
    # escape rather than an (invalid) string escape.
    separators = re.compile(r'[\W_]+')

    doc_to_terms = {}
    for doc, id1 in zip(doc_list, doc_id):
        # After this substitution no punctuation is left inside any token, so
        # no further per-token stripping of ',' or '.' is needed.
        tokens = separators.sub(' ', doc).lower().split()
        clean_tokens = [lemmatize(token) for token in tokens
                        if token not in stop_words]
        doc_to_terms.setdefault(id1, []).extend(clean_tokens)
    return doc_to_terms

In [0]:
# Term lists keyed by document_id, built from the document text. process_doc
# concatenates the terms of rows that share an id.
terms = process_doc(text, i_d)
# Question text is keyed by the id of the row's document, so a match on a
# question resolves directly to a document_id.
question_terms = process_doc(questions, i_d)


### Indexing a single document

In [0]:
def index_to_doc(termlist):
    """Build a positional index for one sequence of terms.

    Returns a dict mapping every distinct term to the list of zero-based
    positions at which it occurs in ``termlist``, in ascending order.
    """
    positions_by_term = {}
    for position, term in enumerate(termlist):
        # Create the position list on first sight of a term, then append.
        positions_by_term.setdefault(term, []).append(position)
    return positions_by_term

### Indexing all the documents

In [0]:
def make_indices(termlists):
    """Build one positional index per document.

    ``termlists`` maps a document key to its sequence of terms; the result
    maps the same keys to the dict produced by ``index_to_doc`` for that
    sequence.
    """
    return {
        doc_key: index_to_doc(term_seq)
        for doc_key, term_seq in termlists.items()
    }

In [0]:
# Per-document positional indexes: {document_id: {term: [positions]}}.
index = make_indices(terms)
# Same structure, built from the question text.
question_index = make_indices(question_terms)

### Inverting the index

In [0]:
def fullIndex(regdex):
    """Invert per-document indexes into a single term-first index.

    Parameters
    ----------
    regdex : dict
        ``{doc_id: {word: [positions]}}``.

    Returns
    -------
    dict
        ``{word: {doc_id: [positions]}}``. The position lists are the same
        list objects held by ``regdex`` (no copy is made).
    """
    total_index = {}
    for doc_id, positions_by_word in regdex.items():
        for word, positions in positions_by_word.items():
            # Each (doc_id, word) pair is visited exactly once because dict
            # keys are unique, so there is never an existing entry to merge
            # with: a plain assignment is all that is needed.
            total_index.setdefault(word, {})[doc_id] = positions
    return total_index

### Building the inverted index for both documents and questions

In [0]:
# Inverted indexes: {term: {document_id: [positions]}}.
f_index = fullIndex(index)            # built from document text
q_index = fullIndex(question_index)   # built from question text

### Querying the index for a single word

In [0]:
def one_word_query(word, invertedIndex, lemmatize=None):
    """Return the ids of all documents whose index entry matches ``word``.

    The word is normalised before lookup: runs of non-word characters and
    underscores become a space, surrounding whitespace is stripped, the
    result is lower-cased and then lemmatised.

    Parameters
    ----------
    word : str
        A single query word; may carry punctuation (e.g. ``"Island?"``).
    invertedIndex : dict
        ``{term: {doc_id: [positions]}}``.
    lemmatize : callable, optional
        Maps a token to its normalised form. Defaults to the notebook-level
        ``lemmatizer.lemmatize``.

    Returns
    -------
    list
        Document ids for the normalised term, or ``[]`` when the term is not
        indexed or nothing is left of ``word`` after cleaning.
    """
    if lemmatize is None:
        lemmatize = lemmatizer.lemmatize

    # Punctuation is replaced by a space, so the result must be stripped:
    # otherwise "island?" becomes "island " and can never match an index key.
    word = re.sub(r'[\W_]+', ' ', word).strip().lower()
    if not word:
        return []

    # Input with inner punctuation (e.g. "a-b") still yields a two-token
    # string here and will not match; callers are expected to pass one word.
    word = lemmatize(word)
    return list(invertedIndex.get(word, {}))

### Querying the index for multiple words

In [0]:
def free_text_query(string, index):
    """Return the ids of documents that contain every non-stop word of a query.

    The query is split on whitespace; each word is lower-cased, skipped if it
    is in the notebook-level ``stopw`` collection, and otherwise looked up
    with ``one_word_query``. The per-word result sets are intersected.

    Parameters
    ----------
    string : str
        The free-text query.
    index : dict
        Inverted index, ``{term: {doc_id: [positions]}}``.

    Returns
    -------
    set
        Matching document ids. Empty when no document contains all the
        searchable words, and also when the query has no searchable words at
        all (empty, or made up only of stop words).
    """
    # None means "no searchable word seen yet". Seeding the intersection with
    # every known document id would make a stop-word-only query match the
    # entire corpus, and would tie this function to a global DataFrame.
    result = None
    for word in string.split():
        word = word.lower()
        if word in stopw:
            continue
        matches = set(one_word_query(word, index))
        result = matches if result is None else result & matches
        if not result:
            # An empty intersection can never grow again.
            break
    return set() if result is None else result

### The search utility to search for a particular query

In [0]:
def search(string):
    """Search both the question index and the document index for a query.

    Runs of non-word characters and underscores in ``string`` are replaced by
    a space, then the cleaned query is run through ``free_text_query``
    against ``q_index`` and ``f_index``.

    Returns
    -------
    list
        The union of the document ids matched in either index, in no
        particular order.
    """
    # Raw string so that \W is a regex escape, not an invalid string escape.
    string = re.sub(r'[\W_]+', ' ', string)

    # Stop-word filtering happens inside free_text_query, so no stop-word
    # list needs to be loaded here.
    result = set()
    result.update(free_text_query(string, q_index))
    result.update(free_text_query(string, f_index))
    return list(result)

In [69]:
# Example: this question appears in row 4 of df.head() above, and the id returned
# below is that row's document_id.
search('What other species can be seen close to the shores of Norfolk Island?')


['4272a11c1bfe11ea8f2b656571b1b549']