# Relevance Feedback for the Epistemonikos Dataset using Rocchio and BM25
- For these experiments we applied Rocchio relevance feedback and BM25 to the same Epistemonikos dataset, so that the results can be compared with the active learning framework.
- Document ids, titles and abstracts were indexed in ElasticSearch to retrieve documents more efficiently when using either BM25 or Rocchio.

In [None]:
import math
import re
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
nltk.download('stopwords')
STOPWORDS = list(stopwords.words('english'))
from elasticsearch import Elasticsearch
import json 

# start elastic search session 
es = Elasticsearch()


# Utils

In [None]:
def text_processing(doc_init):
    """Normalize a raw document string into lower-case, space-separated text.

    Args:
        doc_init: The raw text (e.g. a title or an abstract).

    Returns:
        The cleaned text: non-word characters, digits, isolated single ASCII
        letters and a fixed set of scientific symbols are removed, runs of
        whitespace are collapsed, and the result is lower-cased and stripped.
    """
    # Replace every non-word character (punctuation, symbols) with a space.
    document = re.sub(r'\W', ' ', doc_init)
    # Drop single ASCII letters that stand alone between whitespace.
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Drop a single ASCII letter at the very start of the text. The anchor
    # must be unescaped: an escaped caret would look for a literal '^', which
    # can never be present after the \W substitution above.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # Remove digit runs.
    document = re.sub(r'\d+', ' ', document)
    # Remove symbols that count as word characters and so survive \W.
    document = re.sub(r'[\µ\β\ε\χ2\χ²\δ\å\⁸\α]', ' ', document)
    # Collapse the whitespace produced by the substitutions above.
    document = re.sub(r'\s+', ' ', document)
    # Strip a leading lower-case "b " token.
    document = re.sub(r'^b\s+', '', document)
    return document.lower().strip()

def average_precision(lista):
    """Return the average precision of a ranked list of relevance labels.

    Args:
        lista: Iterable of labels in rank order; an entry equal to 1 marks a
            relevant document, anything else is treated as non-relevant.

    Returns:
        The mean of precision@rank taken at the rank of every relevant entry,
        normalized by the number of relevant entries found in ``lista``.
        Returns 0.0 when the list holds no relevant entry (or is empty).
    """
    hits = 0
    precisions = []
    for rank, label in enumerate(lista, start=1):
        if label == 1:
            hits += 1
            # Precision at this rank: relevant entries seen so far / rank.
            precisions.append(hits / rank)
    # Divide by the number of hits (not hits + 1) and guard the empty case.
    return sum(precisions) / hits if hits else 0.0

def last_rel(y):
    """Return the fraction of the ranking read when the last relevant item is reached.

    Args:
        y: Relevance labels in rank order; an entry equal to 1 is relevant.

    Returns:
        The 1-based rank of the last relevant entry divided by ``len(y)``, so
        a ranking whose final entry is relevant yields 1.0.

    Raises:
        ValueError: If ``y`` contains no relevant entry (including empty ``y``).
    """
    labels = list(y)
    # Walk backwards so the first hit is the last relevant position.
    for rank in range(len(labels), 0, -1):
        if labels[rank - 1] == 1:
            return rank / len(labels)
    raise ValueError("no relevant entry (label 1) found")

# load dataset

In [None]:

# Root folder of the downloaded files. The CSV path below is built as
# '<DATASET_DIR>/datasets/Epistemonikos_dataset.csv', so leaving this empty
# resolves to '/datasets/...' at the filesystem root.
DATASET_DIR = '' # insert path where all files were downloaded

# The dataset file is semicolon-separated.
df_episte = pd.read_csv('{}/datasets/Epistemonikos_dataset.csv'.format(DATASET_DIR), sep=';')
# Notebook preview of the first rows.
df_episte.head()

In [None]:
# filter same matrices as active learning framework to make them comparable
max_docs = 2200
min_docs = 5

# Count the rows of every matrix once instead of re-scanning the whole
# DataFrame twice per matrix. value_counts() skips missing values, so a NaN
# matrix gets a size of 0 through .get() and is excluded, as before.
matrix_sizes = df_episte['matrix'].value_counts().to_dict()

# Keep matrices whose size lies strictly between min_docs and max_docs,
# in the order they first appear in the dataset.
matrices = [m for m in df_episte['matrix'].unique()
            if min_docs < matrix_sizes.get(m, 0) < max_docs]


In [None]:
# Keep only the rows whose matrix is in the size-filtered `matrices` list.
df_episte = df_episte[df_episte['matrix'].isin(matrices)]
# Notebook preview of the first rows after filtering.
df_episte.head()

## index data from queries 

In [None]:
# Load the matrix-id -> title mapping. JSON text is UTF-8, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('matrix_titles.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
idx2query = {}            # running query number -> matrix id
query2idx = {}            # matrix id -> running query number
dict_episte_queries = {}  # matrix id -> {'title': ..., 'relevant_docs': [...]}

count = 0

# clean data and create idx2query and query2idx dicts
for idx, title in data.items():
    # Entries without a title cannot be used as a query; skip them.
    if title is None:
        continue

    idx2query[count] = idx
    query2idx[idx] = count
    # Ground truth: every document listed for this matrix in the filtered dataset.
    relevant_documents = list(df_episte[df_episte['matrix'] == idx].document)
    dict_episte_queries[idx] = {'title': title, 'relevant_docs': relevant_documents}
    count += 1

## process data: 
Example ElasticSearch results 

In [None]:
# Lower-cased title of every query, in the insertion order of dict_episte_queries.
queries = [entry['title'].lower() for entry in dict_episte_queries.values()]

## Relevance Feedback

In [None]:
class Query_result(object):
    """One retrieved document together with its relevance judgement."""

    def __init__(self, pid, title_abstract, is_relevant = False):
        """Store the document id, its text and its relevance flag.

        Args:
            pid: Identifier of the retrieved document.
            title_abstract: Text of the document.
            is_relevant: Initial relevance judgement; defaults to False.
        """
        self.pid = pid
        self.title_abstract = title_abstract
        # Honour the constructor argument instead of always storing False.
        self.is_relevant = is_relevant
        

In [None]:
def search(query_terms, result_size):
    """Run a match query against the 'episte_index' index.

    Args:
        query_terms: Iterable of query terms; they are joined with single
            spaces into one query string.
        result_size: Value passed as ``size`` to the search request.

    Returns:
        A list with one ``Query_result`` per returned hit, in the order the
        hits are returned, built from the hit's ``_id`` and the
        ``title_abstract`` field of its ``_source``.
    """
    # NOTE(review): Elasticsearch caps from+size at index.max_result_window
    # (10,000 by default) -- confirm the index raises it for larger sizes.
    request_body = {'query': {'match': {'title_abstract': ' '.join(query_terms)}}}
    response = es.search(index='episte_index', body=request_body, size=result_size)

    return [
        Query_result(hit['_id'], hit['_source']['title_abstract'])
        for hit in response['hits']['hits']
    ]
    

In [None]:
def get_feedback(query_result, query_id):
    """Mark the hits that belong to the ground truth of a query as relevant.

    Args:
        query_result: Iterable of result objects exposing ``pid`` and a
            writable ``is_relevant`` attribute.
        query_id: Key of the query in ``dict_episte_queries``.

    Returns:
        None. Matching hits are updated in place; other hits are left as-is.
    """
    # Build the lookup once: a set gives constant-time membership tests
    # instead of scanning the relevant-document list for every hit.
    relevant_ids = set(dict_episte_queries[query_id]['relevant_docs'])

    for hit in query_result:
        # receive feedback from ground truth
        if hit.pid in relevant_ids:
            hit.is_relevant = True

In [None]:
def regularize(string):
    """Tokenize text into lower-case terms with stop words removed.

    Every character other than ASCII letters, digits, underscore and space is
    deleted; the remainder is lower-cased, stripped and split on whitespace,
    and tokens found in ``STOPWORDS`` are discarded.
    """
    cleaned = re.sub(r'[^a-zA-Z0-9_ ]', '', string)
    tokens = cleaned.lower().strip().split()
    return [token for token in tokens if token not in STOPWORDS]

In [None]:
def modify_query(query_result, query, ORIGIN_QUERY, alpha=0.25, beta=0.5):
    """Expand a query with up to two new terms chosen from judged results.

    Args:
        query_result: Judged results; each item exposes ``title_abstract``
            and ``is_relevant``.
        query: Current list of query terms (original terms plus earlier
            expansions). It is not modified.
        ORIGIN_QUERY: The original query as a single string.
        alpha: Weight applied to the averaged relevant-document vectors.
        beta: Weight applied to the averaged non-relevant-document vectors.

    Returns:
        A new list: the whitespace-split terms of ``ORIGIN_QUERY`` followed
        by all expansion terms (earlier and new), ordered by their weight in
        the current feedback vector, highest first.
    """
    N = len(query_result)
    origin_terms = ORIGIN_QUERY.rstrip('\n').split()
    # Work on a copy so the caller's list is left untouched.
    current_terms = list(query)

    re_vectors, irre_vectors, doc_freq = [], [], defaultdict(set)

    # Raw term counts per document, plus the set of documents containing each term.
    for i, v in enumerate(query_result):
        vector = defaultdict(int)
        for term in regularize(v.title_abstract):
            doc_freq[term].add(i)
            vector[term] += 1

        if v.is_relevant:
            re_vectors.append(vector)
        else:
            irre_vectors.append(vector)

    # Turn the raw counts into scaled tf-idf weights:
    # log10(1 + tf) * log10(N / df) * 10000.
    for vector in re_vectors + irre_vectors:
        for term in vector:
            vector[term] = math.log(1+vector[term], 10) * math.log(float(N)/len(doc_freq[term]), 10) * 10000

    # Rocchio Algorithm -- combine all relevant and irrelevant vectors
    DR, DNR = len(re_vectors), len(irre_vectors)
    new_vector = defaultdict(float)

    for vector in re_vectors:
        for term in vector:
            new_vector[term] += vector[term] * alpha / DR

    for vector in irre_vectors:
        for term in vector:
            # Weights are clipped at zero; a term never goes negative.
            new_vector[term] = max(0, new_vector[term] - vector[term] * beta / DNR)

    # Find (up to) 2 "new" terms: positive weight and not already in the query.
    # sorted() is stable, so on ties the term met first is preferred.
    known_terms = set(current_terms)
    candidates = [(term, weight) for term, weight in new_vector.items()
                  if weight > 0 and term not in known_terms]
    best = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:2]
    added = [term for term, _ in best]

    if added:
        current_terms.extend(added)
        print("Augmenting by {}".format(' '.join(added)))

    # Collect every expansion term. Membership is tested against the list of
    # original terms, not the raw query string: a substring test would drop
    # any expansion term that merely occurs inside an original word.
    origin_term_set = set(origin_terms)
    new_vector_words = [[t, new_vector.get(t, 0.0)]
                        for t in current_terms if t not in origin_term_set]

    # sort by score
    new_vector_words.sort(key = lambda x: x[1], reverse = True)

    # obtain words to concat to query
    concat_words = [x[0] for x in new_vector_words]

    return origin_terms + concat_words

# start relevance feedback (Rocchio) 

In [None]:
def main_rocchio():
    """Run the Rocchio relevance-feedback experiment for every query.

    For each query the ranking is evaluated, the query is expanded with
    ``modify_query`` and the search is repeated, giving ``ITER + 1``
    evaluations per query.

    Returns:
        A dict mapping each query id to a dict of metric lists with one entry
        per evaluation. Keys: 'recall@10', 'recall@20', 'recall@30',
        'precision@10', 'precision@20', 'precision@30', 'avg_prec',
        'lastrel%' and 'lastrel' (the last two hold the same values).
    """
    # NOTE(review): Elasticsearch caps from+size at index.max_result_window
    # (10,000 by default) -- confirm the index allows this size.
    RESULT_SIZE = 20000
    ITER = 20
    CUTOFFS = (10, 20, 30)

    total_results = {}

    for i, title in enumerate(queries):
        QUERY_ID = idx2query[i]
        ORIGIN_QUERY = title.lower()  # Record original query terms
        query = ORIGIN_QUERY.rstrip('\n').split()
        n_relevant = len(dict_episte_queries[QUERY_ID]['relevant_docs'])

        recalls = {k: [] for k in CUTOFFS}
        precisions = {k: [] for k in CUTOFFS}
        avg_precisions = []
        lastrels = []

        for iteration in range(ITER + 1):
            query_result = search(query, RESULT_SIZE)
            get_feedback(query_result, QUERY_ID)
            pred = [1 if q.is_relevant else 0 for q in query_result]

            avg_precision = average_precision(pred)

            if sum(pred) > 0:
                recall_at = {k: sum(pred[:k]) / n_relevant for k in CUTOFFS}
                precision_at = {k: sum(pred[:k]) / k for k in CUTOFFS}
                lastrel = last_rel(pred)

                for k in CUTOFFS:
                    print('recall@{}'.format(k), recall_at[k])
                for k in CUTOFFS:
                    print('precision@{}'.format(k), precision_at[k])
                print('avg_precision', avg_precision)
                print('lastrel', lastrel)
            else:
                # Nothing relevant was retrieved: all cut-off metrics are 0
                # and the last-relevant fraction is recorded as 1.
                recall_at = dict.fromkeys(CUTOFFS, 0)
                precision_at = dict.fromkeys(CUTOFFS, 0)
                lastrel = 1

            for k in CUTOFFS:
                recalls[k].append(recall_at[k])
                precisions[k].append(precision_at[k])
            avg_precisions.append(avg_precision)
            lastrels.append(lastrel)

            # The final evaluation is not followed by another expansion.
            if iteration < ITER:
                query = modify_query(query_result, query, ORIGIN_QUERY)

        per_query = {}
        for k in CUTOFFS:
            per_query['recall@{}'.format(k)] = recalls[k]
        for k in CUTOFFS:
            per_query['precision@{}'.format(k)] = precisions[k]
        per_query['avg_prec'] = avg_precisions
        # 'lastrel%' was declared but never filled while the values went to
        # 'lastrel'; fill both so either key can be read.
        per_query['lastrel%'] = list(lastrels)
        per_query['lastrel'] = lastrels

        total_results[QUERY_ID] = per_query

    return total_results



In [None]:
# Run the Rocchio experiment over every query (issues Elasticsearch searches).
dict_results = main_rocchio()


In [None]:
# Persist the per-query Rocchio metrics as JSON in the working directory.
with open('relevance_feedback_rocchio_epistemonikos.json', 'w') as f:
    json.dump(dict_results, f)

# relevance feedback BM25 

In [None]:
def main_bm25():
    """Evaluate the unexpanded query ranking for every query (baseline).

    The query is never modified, so the ranking is retrieved and scored once
    and each metric value is repeated ``ITER + 1`` times. This keeps the
    result lists the same length as those of ``main_rocchio``.

    Returns:
        A dict mapping each query id to a dict of metric lists. Keys:
        'recall@10', 'recall@20', 'recall@30', 'precision@10',
        'precision@20', 'precision@30', 'avg_prec', 'lastrel%' and
        'lastrel' (the last two hold the same values).
    """
    # NOTE(review): Elasticsearch caps from+size at index.max_result_window
    # (10,000 by default) -- confirm the index allows this size.
    RESULT_SIZE = 20000
    ITER = 20
    CUTOFFS = (10, 20, 30)
    rounds = ITER + 1

    total_results = {}

    for i, title in enumerate(queries):
        QUERY_ID = idx2query[i]
        ORIGIN_QUERY = title.lower()  # Record original query terms
        query = ORIGIN_QUERY.rstrip('\n').split()
        n_relevant = len(dict_episte_queries[QUERY_ID]['relevant_docs'])

        # The query is identical in every round, so one search is enough.
        query_result = search(query, RESULT_SIZE)
        get_feedback(query_result, QUERY_ID)
        pred = [1 if q.is_relevant else 0 for q in query_result]

        avg_precision = average_precision(pred)

        if sum(pred) > 0:
            recall_at = {k: sum(pred[:k]) / n_relevant for k in CUTOFFS}
            precision_at = {k: sum(pred[:k]) / k for k in CUTOFFS}
            lastrel = last_rel(pred)

            for k in CUTOFFS:
                print('recall@{}'.format(k), recall_at[k])
            for k in CUTOFFS:
                print('precision@{}'.format(k), precision_at[k])
            print('avg_precision', avg_precision)
            print('lastrel', lastrel)
        else:
            # Nothing relevant was retrieved: all cut-off metrics are 0 and
            # the last-relevant fraction is recorded as 1.
            recall_at = dict.fromkeys(CUTOFFS, 0)
            precision_at = dict.fromkeys(CUTOFFS, 0)
            lastrel = 1

        per_query = {}
        for k in CUTOFFS:
            per_query['recall@{}'.format(k)] = [recall_at[k]] * rounds
        for k in CUTOFFS:
            per_query['precision@{}'.format(k)] = [precision_at[k]] * rounds
        per_query['avg_prec'] = [avg_precision] * rounds
        # 'lastrel%' was declared but never filled while the values went to
        # 'lastrel'; fill both so either key can be read.
        per_query['lastrel%'] = [lastrel] * rounds
        per_query['lastrel'] = [lastrel] * rounds

        total_results[QUERY_ID] = per_query

    return total_results
    
    

In [None]:
# Run the BM25 baseline over every query (issues Elasticsearch searches).
dict_results_bm25 = main_bm25()

In [None]:
# Persist the per-query BM25 metrics as JSON in the working directory.
with open('relevance_feedback_bm25_epistemonikos.json', 'w') as f:
    json.dump(dict_results_bm25, f)