In [1]:
import pandas as pd
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
import string
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from scipy import sparse
from autocorrect import Speller
spell = Speller(lang='en')
import csv
import pickle

In [2]:
# Load the pre-processed corpus, the raw corpus and the pre-processed queries.
# Missing cells in both document frames are replaced with empty strings; the
# query frame is used exactly as read.
clean_docs = pd.read_csv('science_dataset/processed_documents.csv').fillna('')
original_docs = pd.read_csv('science_dataset/original_documents.csv').fillna('')
clean_queries = pd.read_csv("science_dataset/processed_queries.csv")

In [5]:
# Parse the relevance judgements: one JSON object per line, each carrying a
# 'qid' and the list of relevant document ids under 'answer_pids'.
json_qrles_list = []  # every parsed qrels record, in file order
links = {}            # str(qid) -> list of relevant document ids
qrels = []            # not populated in this cell; name kept in case other cells reference it
with open('science_dataset/qrels.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing inside json.loads.
            continue
        data = json.loads(line)
        json_qrles_list.append(data)
        # Keys are stored as strings, so lookups must stringify the query id.
        links[str(data['qid'])] = data['answer_pids']

In [2]:
def dataProcessing(dfText):
    """
    Normalise a pandas Series of raw texts into cleaned, lemmatized strings.

    Steps, in order: lower-casing, replacing punctuation with spaces, removing
    English stop words, lemmatizing each remaining word (as a verb when its
    part-of-speech tag starts with 'v', otherwise as a noun), and finally
    dropping every character that is not a letter, digit or whitespace.

    :param dfText: A pandas Series of texts (the ``.str`` accessor is used).
    :return: A list with one cleaned string per input element.
    """
    # Spell correction is intentionally not applied here.

    # To lower case
    texts = dfText.str.lower()

    # Replace every punctuation character with a space so that words joined by
    # punctuation are split apart rather than glued together.
    trans_table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    texts = [str(text).translate(trans_table) for text in texts]

    # Remove stop words; each text becomes a list of the remaining tokens.
    stop_words = set(stopwords.words('english'))
    token_lists = [
        [token for token in text.split() if token not in stop_words]
        for text in texts
    ]

    # Lemmatize token by token. Each word is tagged on its own (without
    # sentence context), matching how the data was processed originally.
    lemmatizer = WordNetLemmatizer()
    lemmatized_strings = []
    for tokens in token_lists:
        lemmatized_words = []
        for word in tokens:
            tag = pos_tag([word])[0][1]
            my_pos = wordnet.VERB if tag[0].lower() == 'v' else wordnet.NOUN
            lemmatized_words.append(lemmatizer.lemmatize(word, pos=my_pos))
        lemmatized_strings.append(' '.join(lemmatized_words))

    # Remove non-alphanumeric characters. The pattern is a raw string (so
    # ``\s`` is not an invalid escape) and is compiled once, not per text.
    non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
    return [non_alnum.sub('', text) for text in lemmatized_strings]

In [7]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

In [None]:
# Fit the vectorizer on the processed document text and build the document
# matrix (one row per row of clean_docs) that queries are compared against.
vectorized_docs = vectorizer.fit_transform(clean_docs['text'])

In [13]:
# Store the vectorizer object
# Store the fitted vectorizer object so later sessions can skip fitting
with open('science_dataset/objects/vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

# Store the vectorized_docs object (the document matrix built with that vectorizer)
with open('science_dataset/objects/vectorized_docs.pkl', 'wb') as file:
    pickle.dump(vectorized_docs, file)

In [10]:
# Load the vectorizer object
# Load the vectorizer object
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that were written by this notebook / a trusted source.
with open('science_dataset/objects/vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

# Load the vectorized_docs object (must come from the same vectorizer so the
# query vectors and document vectors share one vocabulary)
with open('science_dataset/objects/vectorized_docs.pkl', 'rb') as file:
    vectorized_docs = pickle.load(file)

In [11]:
def runQueryWithAllDocsVector(query, top_k=10):
    """
    Rank every document against a query and return the best document ids.

    The query is vectorized with the notebook-level ``vectorizer`` and compared
    with ``vectorized_docs`` by cosine similarity. Row positions of the
    similarity vector are mapped to ids through ``original_docs['doc_id']``,
    so ``original_docs`` is assumed to be row-aligned with the vectorized
    corpus (TODO confirm against how the processed CSV was produced).

    :param query: A sequence whose first element is the (already processed)
        query string; only ``query[0]`` is used.
    :param top_k: Maximum number of document ids to return (default 10).
    :return: A list of at most ``top_k`` document ids, highest score first.
        Documents with equal scores keep their original row order.
    """
    if top_k <= 0:
        return []

    vectorized_query = vectorizer.transform([query[0]])

    # One row of scores: similarity of the query to every document.
    similarity_scores = cosine_similarity(vectorized_query, vectorized_docs)[0]

    # A stable ascending argsort of the negated scores is a descending ranking
    # in which ties stay in row order -- the same order the previous
    # sorted(..., reverse=True) produced, without building a Python list of
    # (index, score) tuples for the whole corpus.
    top_rows = (-similarity_scores).argsort(kind='stable')[:top_k]

    # Positional lookup: the similarity rows are positions, not index labels.
    return original_docs['doc_id'].iloc[top_rows].tolist()
   

In [13]:
# Smoke test: the retrieval function expects a list whose first element is an
# already-processed query string, and returns the ids of the top-ranked docs.
query = ["many valence electron 5th shell hold"]
result = runQueryWithAllDocsVector(query)
print(result)

[18246, 21641, 3444, 13178, 195, 28250, 941, 4953, 980, 192]


In [14]:
def get_relevant_docs_from_qrels(query_id):
    """
    Look up the relevant document ids recorded for a query.

    ``links`` is keyed by the string form of the query id, so the id is
    passed through ``str`` before the lookup.

    :param query_id: The query id (any value whose ``str`` matches a key).
    :return: The value stored in ``links`` for that query.
    :raises KeyError: If the query id has no entry in ``links``.
    """
    return links[str(query_id)]
    

In [15]:
def calculate_map(relevant_documents_id, retrieved_documents_id, k):
    """
    Average precision at ``k`` for a single query.

    Precision is accumulated at every rank (1-based, within the top ``k``
    retrieved documents) where a relevant document first appears, and the sum
    is divided by ``min(number of distinct relevant documents, k)``.

    :param relevant_documents_id: Iterable of relevant document ids.
    :param retrieved_documents_id: Ranked list of retrieved document ids.
    :param k: Rank cutoff; values <= 0 yield 0.0.
    :return: The average precision as a float in [0.0, 1.0].
    """
    if k <= 0 or len(relevant_documents_id) == 0 or len(retrieved_documents_id) == 0:
        return 0.0

    # Set membership is O(1) and de-duplicates the relevance judgements so the
    # denominator counts each relevant document once.
    relevant = set(relevant_documents_id)
    already_credited = set()

    precision_sum = 0.0
    relevant_count = 0
    for rank, doc_id in enumerate(retrieved_documents_id[:k], 1):
        # Credit a relevant document only the first time it appears; otherwise
        # a repeated id in the ranking could push the score above 1.0.
        if doc_id in relevant and doc_id not in already_credited:
            already_credited.add(doc_id)
            relevant_count += 1
            precision_sum += relevant_count / rank

    return precision_sum / min(len(relevant), k)

In [16]:
def myRunQueriesForMap(queries, k=10):
    """
    Run every query and return the mean average precision (MAP).

    For each query the relevant ids come from ``get_relevant_docs_from_qrels``
    and the ranking from ``runQueryWithAllDocsVector``; per-query details are
    printed as the evaluation runs.

    :param queries: Sequence of rows where ``row[0]`` is the query id and
        ``row[1]`` is the query text (e.g. ``clean_queries.values``).
    :param k: Rank cutoff passed to ``calculate_map`` (default 10, the value
        that used to be hard-coded).
    :return: Mean of the per-query average precision, or 0.0 when there are
        no queries.
    """
    # len() rather than truthiness: ``queries`` is typically a numpy array.
    if len(queries) == 0:
        return 0.0

    ap_sum = 0.0
    for query in queries:
        print("query id: ",query[0])

        relevant = get_relevant_docs_from_qrels(query[0])
        retrieved = runQueryWithAllDocsVector([query[1]])

        print("relevant: ",relevant)
        print("retrieved: ",retrieved)
        map_score = calculate_map(relevant, retrieved, k)
        print("AP:", map_score)
        print('-------------------------------------------------------------------------')

        ap_sum += map_score
    return ap_sum / len(queries)
        

In [35]:
# Each row of clean_queries.values is consumed by the runner as
# (query id, query text, ...): row[0] is looked up in the qrels, row[1] is searched.
finalResult = myRunQueriesForMap(clean_queries.values)

query id:  0
relevant:  [17292]
retrieved:  [17292, 6843, 14676, 2489, 86821, 60450, 21039, 45160, 56417, 53181]
AP: 1.0
-------------------------------------------------------------------------
query id:  1
relevant:  [8854]
retrieved:  [11806, 28412, 29698, 38181, 13014, 21217, 38507, 14019, 7094, 38384]
AP: 0.0
-------------------------------------------------------------------------
query id:  2
relevant:  [15041]
retrieved:  [4265, 195, 12332, 8123, 5586, 12955, 4953, 1657, 20770, 18246]
AP: 0.0
-------------------------------------------------------------------------
query id:  3
relevant:  [24955]
retrieved:  [24955, 27103, 21074, 17775, 18785, 235880, 248924, 279563, 37692, 32536]
AP: 1.0
-------------------------------------------------------------------------
query id:  4
relevant:  [2239]
retrieved:  [21680, 3974, 3144, 14514, 152994, 38962, 42903, 172994, 122897, 109268]
AP: 0.0
-------------------------------------------------------------------------
query id:  5
relevant:

In [36]:
# Mean of the per-query AP values computed by the MAP run.
print("MAP: ", finalResult)

MAP:  0.20170624383582436


In [1]:
def calculate_recall_at_k(relevant_documents_id, retrieved_documents_id, k):
    """
    Recall at ``k`` for a single query.

    :param relevant_documents_id: Iterable of relevant document ids.
    :param retrieved_documents_id: Ranked list of retrieved document ids.
    :param k: Rank cutoff; values <= 0 yield 0.0.
    :return: Fraction of the distinct relevant documents that appear in the
        top ``k`` retrieved documents, as a float in [0.0, 1.0].
    """
    if k <= 0 or len(relevant_documents_id) == 0 or len(retrieved_documents_id) == 0:
        return 0.0

    # De-duplicate first so a repeated relevant id does not inflate the
    # denominator and make perfect recall unreachable.
    relevant = set(relevant_documents_id)

    # Restrict the retrieved documents to the top k before intersecting.
    hits = relevant.intersection(retrieved_documents_id[:k])

    return len(hits) / len(relevant)

In [38]:
def myRunQueriesForRecall(queries, k):
    """
    Run every query and return the mean recall at ``k``.

    For each query the relevant ids come from ``get_relevant_docs_from_qrels``
    and the ranking from ``runQueryWithAllDocsVector``; per-query details are
    printed as the evaluation runs. The ranking length is decided by
    ``runQueryWithAllDocsVector`` (its top 10 here), so a ``k`` above that
    cannot surface additional documents.

    :param queries: Sequence of rows where ``row[0]`` is the query id and
        ``row[1]`` is the query text (e.g. ``clean_queries.values``).
    :param k: Rank cutoff passed to ``calculate_recall_at_k``.
    :return: Mean of the per-query recall, or 0.0 when there are no queries.
    """
    # len() rather than truthiness: ``queries`` is typically a numpy array.
    if len(queries) == 0:
        return 0.0

    recall_sum = 0.0
    for query in queries:
        print("Query ID:", query[0])

        relevant = get_relevant_docs_from_qrels(query[0])
        retrieved = runQueryWithAllDocsVector([query[1]])

        print("Relevant:", relevant)
        print("Retrieved:", retrieved)

        recall = calculate_recall_at_k(relevant, retrieved, k)
        print("Recall at K:", recall)
        print('-------------------------------------------------------------------------')

        recall_sum += recall

    return recall_sum / len(queries)

In [40]:
# Recall@10 over all queries. Note that finalResult is rebound here, replacing
# the value stored by the previous metric run.
finalResult = myRunQueriesForRecall(clean_queries.values,10)

Query ID: 0
Relevant: [17292]
Retrieved: [17292, 6843, 14676, 2489, 86821, 60450, 21039, 45160, 56417, 53181]
Recall at K: 1.0
-------------------------------------------------------------------------
Query ID: 1
Relevant: [8854]
Retrieved: [11806, 28412, 29698, 38181, 13014, 21217, 38507, 14019, 7094, 38384]
Recall at K: 0.0
-------------------------------------------------------------------------
Query ID: 2
Relevant: [15041]
Retrieved: [4265, 195, 12332, 8123, 5586, 12955, 4953, 1657, 20770, 18246]
Recall at K: 0.0
-------------------------------------------------------------------------
Query ID: 3
Relevant: [24955]
Retrieved: [24955, 27103, 21074, 17775, 18785, 235880, 248924, 279563, 37692, 32536]
Recall at K: 1.0
-------------------------------------------------------------------------
Query ID: 4
Relevant: [2239]
Retrieved: [21680, 3974, 3144, 14514, 152994, 38962, 42903, 172994, 122897, 109268]
Recall at K: 0.0
------------------------------------------------------------------

In [41]:
# Mean of the per-query recall values computed by the recall run.
print("Recall: ", finalResult)

Recall:  0.34082556714855


In [29]:
def calculate_mrr(relevant_docs, retrieved_docs):
    """
    Calculate the reciprocal rank for a single query.

    :param relevant_docs: An iterable (list or set) of relevant document IDs.
    :param retrieved_docs: A ranked list of retrieved document IDs.
    :return: ``1 / rank`` of the first relevant document (ranks start at 1),
        or ``0.0`` when no retrieved document is relevant. Always a float.
    """
    # Callers pass lists from the qrels; a set makes each membership test O(1).
    relevant = set(relevant_docs)
    for rank, doc_id in enumerate(retrieved_docs, start=1):
        if doc_id in relevant:
            return 1 / rank
    # Float, to match the type returned on a hit.
    return 0.0

In [30]:
def myRunQueriesForMRR(queries):
    """
    Run every query and return the mean reciprocal rank (MRR).

    For each query the relevant ids come from ``get_relevant_docs_from_qrels``
    and the ranking from ``runQueryWithAllDocsVector``; per-query details are
    printed as the evaluation runs.

    :param queries: Sequence of rows where ``row[0]`` is the query id and
        ``row[1]`` is the query text (e.g. ``clean_queries.values``).
    :return: Mean of the per-query reciprocal ranks, or 0.0 when there are no
        queries.
    """
    # len() rather than truthiness: ``queries`` is typically a numpy array.
    if len(queries) == 0:
        return 0.0

    mrr_sum = 0.0
    for query in queries:
        print("Query ID:", query[0])

        relevant = get_relevant_docs_from_qrels(query[0])
        retrieved = runQueryWithAllDocsVector([query[1]])

        print("Relevant:", relevant)
        print("Retrieved:", retrieved)

        mrr = calculate_mrr(relevant, retrieved)
        print("MRR:", mrr)
        print('-------------------------------------------------------------------------')

        mrr_sum += mrr

    return mrr_sum / len(queries)

In [36]:
# MRR over all queries. Note that finalResult is rebound here, replacing the
# value stored by the previous metric run.
finalResult = myRunQueriesForMRR(clean_queries.values)

Query ID: 0
Relevant: [17292]
Retrieved: [17292, 6843, 14676, 2489, 86821, 60450, 21039, 45160, 56417, 53181]
MRR: 1.0
-------------------------------------------------------------------------
Query ID: 1
Relevant: [8854]
Retrieved: [11806, 28412, 29698, 38181, 13014, 21217, 38507, 14019, 7094, 38384]
MRR: 0
-------------------------------------------------------------------------
Query ID: 2
Relevant: [15041]
Retrieved: [4265, 195, 12332, 8123, 5586, 12955, 4953, 1657, 20770, 18246]
MRR: 0
-------------------------------------------------------------------------
Query ID: 3
Relevant: [24955]
Retrieved: [24955, 27103, 21074, 17775, 18785, 235880, 248924, 279563, 37692, 32536]
MRR: 1.0
-------------------------------------------------------------------------
Query ID: 4
Relevant: [2239]
Retrieved: [21680, 3974, 3144, 14514, 152994, 38962, 42903, 172994, 122897, 109268]
MRR: 0
-------------------------------------------------------------------------
Query ID: 5
Relevant: [5559, 5560, 556

In [37]:
# Mean of the per-query reciprocal ranks computed by the MRR run.
print("MRR final result: ", finalResult)

MRR final result:  0.3204431462795775
