In [2]:
# pandas to read the final dataframe of the books 
import pandas as pd 

# pickle package to import the various files (e.g. vocabulary, inverted_index, ...)
import pickle 


# nltk packages for cleaning the plots 
import nltk as nl
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import WordNetLemmatizer

We import the vocabulary and the inverted index:

In [3]:
with open( 'vocabulary.pkl', 'rb') as f:
        vocabulary= pickle.load(f)
        f.close()


In [4]:
with open( 'Inverted_Index.pkl', 'rb') as f:
        ID= pickle.load(f)
        f.close()

We import the clenaed dataframe of the books: 

In [5]:
df=pd.read_csv('dataset/Dataset_300Pages.tsv', sep='\t')

df.dropna(subset=['Plot'],inplace=True)

df.reset_index(drop=True,inplace=True)

In [6]:
df.head(3)

Unnamed: 0,document_ID,bookTitle,bookSeries,bookAuthors,ratingValue,ratingCount,reviewCount,Plot_Values,Plot,NumberofPages,Publishing_Date,Characters,Setting,Url
0,0,MARS,,Jasmine Rose,4.38,69,13,"{5: '47', 4: '7', 3: '11', 2: '2', 1: '2'}",❝�� my heart has become a planetand you are th...,,2014,,,https://www.goodreads.com/book/show/23279048-mars
1,1,Black Box,,Cassia Leo,4.02,6244,903,"{5: '2297', 4: '2320', 3: '1181', 2: '345', 1:...",♥️ Three fateful encounters....♥️ Two heart-br...,400.0,February 28th 2014,"['Mikki Gladstone', 'William ""Crush"" Slayer']","['Boston, Massachusetts']",https://www.goodreads.com/book/show/29539518-b...
2,2,Ruin and Rising,The Shadow and Bone Trilogy #3,Leigh Bardugo,4.09,158624,19396,"{5: '62107', 4: '59607', 3: '27962', 2: '6810'...",▶ \nAlternative Cover Edition #1\nThe capital ...,422.0,June 17th 2014,"['Alina Starkov', 'Malyen Oretsev', 'Darkling'...",['Ravka '],https://www.goodreads.com/book/show/14061957-r...


We will also need these functions to run the query:  

In [7]:
def clean_info (string):
    # this command will split the string given in input in substrings by using 
    # the words given to RegexpTokenizer as argument
    

    
    # filter the punctuation
    tmp = nl.RegexpTokenizer(r"['\w-]+").tokenize(string)  
    
    #first we lower all the words otherwise words such as AND,IS,MY are not consider stopwords 
    tmp = [word.lower() for word in tmp]
    
     # filter the stopwords (e.g. 'the', 'my', 'an', ...)
    tmp = [word for word in tmp if not word in stopwords.words("english")]
    
    #we lemmatize all the words (e.g. 'dirn')
    lemma = WordNetLemmatizer()
    tmp = [lemma.lemmatize(word, pos = "v") for word in tmp]    # v for verbs
    tmp = [lemma.lemmatize(word, pos = "n") for word in tmp]    # n for nouns
    
    
    final = ' '.join(tmp)
    
    return final

In [8]:
# we will need first these two functions to implement the search function 


# this function takes as input a list of lists and gives back the index of the list that has minimus first element
def find_min_list (L):
    min_elem = L[0][0]
    count = 0
    index = 0
    for l in L[1:]: 
        if min_elem > l[0]:
            count = count + 1 
            index = count 
            min_elem = l[0]
        else: 
            count = count + 1 

    return index 
        

# this function takes as input a list of lists and gives back the list created from the intersection of the lists 
def intersect_list (L):
    results = []
    while all(len(l) > 0 for l in L):

        if all([L[0][0] == l[0] for l in L[1:]]):     
            results.append(L[0][0])
            L = [l[1:] for l in L]

        else : 
            min_index = find_min_list(L)
            L[min_index] = L[min_index][1:]

    return results 

This is the function that finds the documents containing the query and prints their Title, Plot and URL:  

In [32]:
from tabulate import tabulate
print(tabulate(df, headers='keys', tablefmt='psql'))

In [36]:
def find_query(inverted_index, vocabulary, data_frame):
    
    string = input() # asks the user a string of words to look up 
    
    string_cleaned = clean_info(string)
    list_words = string_cleaned.split(' ')
    
    
    # now translate the list of words in term_id )
    list_termID = []
    for word in list_words: 
        list_termID.append(vocabulary.get(word))    
        
   

    # retrieve the documents in the inverted index and collect them in a list 
    list_documents = []
    for term_id in list_termID:
        if term_id in inverted_index.keys():   # checking if the word we are looking for is in the inverted_index
            list_documents.append(inverted_index.get(term_id))
    
     
    # now intersect these lists (here it's fundamental to suppose that the documents are collected as increasing sequences)
    results = intersect_list(list_documents)
    
    # say to pandas to print the full plot 
    pd.set_option('display.max_colwidth', None)
    
    # print the results 
    df = data_frame.iloc[results][['bookTitle', 'Plot', 'Url']]
    print(df.to_markdown)
    
    
    return 
    
    


Let's see some examples that implement the function: 

In [37]:
find_query(ID, vocabulary, df)

heart darkness
<bound method DataFrame.to_markdown of                                                    bookTitle  \
3                                            Shadow and Bone   
114                                            Tears of Tess   
192    Written In Blood (Book One Of The Unnatural Brethren)   
230                                                     Sire   
250                                         Twisted Together   
...                                                      ...   
25217                                             Odd Thomas   
25409  The Beast That Shouted Love at the Heart of the World   
25435                                       Promise Me Light   
25525                                      Souls Unfractured   
26075                               The Heights of Perdition   

                                                                                                                                                                                 