In [25]:
import pandas as pd
import numpy as np
import os
import string
import re
from collections import Counter

In [26]:
PATH_BOOKS_DESCRIPTIONS = "./data/descriptions"
PATH_BOOKS_IF_CSV = "./data/IF_CSV"
PATH_CORPUS = "./data/corpus"

In [27]:
files = os.listdir(PATH_BOOKS_IF_CSV)
corpus_IF = pd.read_csv(f'{PATH_CORPUS}/corpus_IF.csv', names=['word', 'count'])

In [28]:
class LanguageModel:
    def __init__(self, d_IF, query, corpus_IF):
        self._d_IF = d_IF
        self._query = query.split()
        self._corpus_IF = corpus_IF
        self._lambda = 0.5
    
    def get_term_probability(self, term, d):
        Ld = len(d)
        tf = d.loc[d['word'] == term, 'count']
        if len(tf) == 0:
            return 0
        else:
            tf = tf.iloc[0]
        
        return tf/Ld
    
    def get_doc_probability(self):
        Pdq = 1
        for t in self._query:
            PMd = self.get_term_probability(t, self._d_IF)
            PMc = self.get_term_probability(t, self._corpus_IF)
            
            Pdq *= (1-self._lambda)*PMc + self._lambda*PMd
        
        return Pdq

In [29]:
class RelevanceFeedback:
    def __init__(self, original_query, documents, corpus_IF):
        self._original_query = original_query.split()
        self._modified_query = ''
        self._documents = documents
        self._corpus_IF = corpus_IF
        self._N = 10000
    
    def get_idf(self, term):
        df = pd.read_csv(f'{PATH_CORPUS}/df.csv', names=['word', 'count'])
        try:
            idf = np.log(self._N / df.loc[df['word'] == term, 'count'].iloc[0])
        except:
            return 0
        
        return idf
    
    def get_doc_relevant_words(self, d, max_words=1):
        tf_idf = {}
        d_IF = pd.read_csv(f'{PATH_BOOKS_IF_CSV}/{d}.csv', names=['word', 'count'])
        
        for index, row in d_IF.iterrows():
            c_term = row['word']
            c_tf = int(row['count'])
            tf_idf[c_term] = c_tf * self.get_idf(c_term)
        tf_idf = {k: v for k, v in sorted(tf_idf.items(), key=lambda item: item[1], reverse=True)}
        rel_words = list(tf_idf)[:max_words]
        
        return rel_words
    
    def get_feedback_words(self):
        fb_words = []
        print(self._documents)
        for d in self._documents:
            fb_words.append(self.get_doc_relevant_words(d))
        
        return [item for sublist in fb_words for item in sublist]
    
    def create_modified_query(self):
        fb_words = self.get_feedback_words()
        self._modified_query = list(set(self._original_query).union(set(fb_words)))
        
        return self._modified_query

In [30]:
class BookSearch:
    def __init__(self, query, corpus_IF):
        self._query = query
        self._book_rankings = {}
        
    def execute_query(self, query, max_docs=10):
        ranking = {}
        for index, file in enumerate(files):
            d_IF = pd.read_csv(f'{PATH_BOOKS_IF_CSV}/{file}', names=['word', 'count'])
            lm = LanguageModel(d_IF, query, corpus_IF)
            clean_name = file.replace('.csv', '')
            ranking[clean_name] = lm.get_doc_probability()
            del lm
        sorted_rankings = {k: v for k, v in sorted(ranking.items(), key=lambda item: item[1], reverse=True)}
        culled_rankings = list(sorted_rankings)[:max_docs]
        
        return culled_rankings
    
    def execute_relevance_feedback(self, query, rankings, docs_used=3):
        rf = RelevanceFeedback(query, rankings[:docs_used], corpus_IF)
        mod_query = rf.create_modified_query()
        mod_query_string = ' '.join(mod_query)
                               
        return mod_query_string

In [31]:
query = 'fantasy adventure'
bs = BookSearch(query, corpus_IF)

In [32]:
no_rf = bs.execute_query(query)

In [33]:
no_rf

['45', '4468', '7195', '8817', '8617', '6284', '2285', '2399', '8373', '5923']

In [34]:
mod_query = bs.execute_relevance_feedback(query, no_rf)

['45', '4468', '7195']


In [35]:
mod_query

'Pi fantasy Haroun Charlie adventure'

In [None]:
bs = BookSearch(mod_query, corpus_IF)
with_rf = bs.execute_query(mod_query)

In [None]:
with_rf