In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from scipy import sparse
import csv
import pickle
import numpy as np
from gensim.models import Word2Vec
import nltk

In [2]:
clean_docs = pd.read_csv('science_dataset/processed_documents.csv')
clean_docs = clean_docs.fillna('')
original_docs = pd.read_csv('science_dataset/original_documents.csv')
original_docs = original_docs.fillna('')
clean_queries = pd.read_csv("science_dataset/processed_queries.csv")

In [3]:
json_qrles_list = []
links = {}
qrels = []
with open('science_dataset/qrels.jsonl', 'r', encoding='utf-8') as f:
    # Read the file line by line
    
    for line in f:
        # Load each line as a JSON object
        data = json.loads(line)
        json_qrles_list.append(data)
        # Process the data (e.g., print, store in a list)
    for data in json_qrles_list:
        
        qid = str(data['qid'])
        answer_pids = data['answer_pids']
        links[qid] = answer_pids

In [16]:
# Get the document IDs and contents as separate lists
doc_ids = clean_docs['doc_id'].tolist()
doc_contents = clean_docs['text'].tolist()

# Create a list of dictionaries for tokenized_docs
tokenized_docs = []

for doc_id, doc_content in zip(doc_ids, doc_contents):
    tokenized_doc = {
        'doc_id': doc_id,
        'doc_content': nltk.word_tokenize(doc_content)
    }
    tokenized_docs.append(tokenized_doc)
with open("science_dataset/word_embedding/tokenized_docs.pkl", 'wb') as file:
    pickle.dump(tokenized_docs, file)
file.close()

In [4]:
with open('science_dataset/word_embedding/tokenized_docs.pkl', 'rb') as file:
    tokenized_docs = pickle.load(file)


In [55]:
# Train Word2Vec model on the tokenized documents
tokenized_docs_content = [doc['doc_content'] for doc in tokenized_docs]
model = Word2Vec(tokenized_docs_content, min_count = 2, vector_size = 100, window = 30, sg = 1)
with open("science_dataset/word_embedding/model_min_count=3,vector_size=100,window=20,sg=1.pkl", 'wb') as file:
    pickle.dump(model, file)
file.close()

In [61]:
# with open('science_dataset/word_embedding/model1.pkl', 'rb') as file:
#     model1 = pickle.load(file)
with open('science_dataset/word_embedding/model_min_count=3,vector_size=100,window=20,sg=1.pkl', 'rb') as file:
    model = pickle.load(file)
    

In [None]:
query_ids = clean_queries['query_id'].tolist()
query_contents = clean_queries['text'].tolist()

tokenized_queries = []

for query_id, query_content in zip(query_ids, query_contents):
    tokenized_query = {
        'query_id': query_id,
        'query_content': nltk.word_tokenize(query_content)
    }
    tokenized_queries.append(tokenized_query)
    
with open("science_dataset/word_embedding/tokenized_queries.pkl", 'wb') as file:
    pickle.dump(tokenized_queries, file)
file.close()


In [19]:
with open('science_dataset/word_embedding/tokenized_queries.pkl', 'rb') as file:
    tokenized_queries = pickle.load(file)


In [8]:
# Load the vectorizer object
with open('science_dataset/objects/vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)
# Load the vectorized_docs object
with open('science_dataset/objects/vectorized_docs.pkl', 'rb') as file:
    vectorized_docs = pickle.load(file)

In [62]:
combined_embeddings = []
for i, doc in enumerate(tokenized_docs):
    doc_vector = np.zeros(model.vector_size)
    words_count = 0
    for word in doc['doc_content']:
        if word in model.wv and word in vectorizer.vocabulary_:
            tfidf_score = vectorized_docs[i, vectorizer.vocabulary_[word]]
            word_vector = model.wv.get_vector(word)
            doc_vector += tfidf_score * word_vector
            words_count += 1
    if words_count > 0:
        doc_vector /= words_count
    combined_embeddings.append(doc_vector)
np.save("science_dataset/word_embedding/combined_embeddings_model_min_count=3,vector_size=100,window=20,sg=1.npy", combined_embeddings)

In [63]:
def runQuery(tokenized_query):
    query_vector = np.zeros(model.vector_size)  # Initialize the query vector

    # Vectorize the query using the same TfidfVectorizer
    vectorized_query = vectorizer.transform([' '.join(tokenized_query)])

    for word in tokenized_query:
        if word in model.wv and word in vectorizer.vocabulary_:
            tfidf_score = vectorized_query[0, vectorizer.vocabulary_[word]]  # Retrieve TF-IDF score
            word_vector = model.wv.get_vector(word)  # Retrieve Word2Vec embedding
            query_vector += tfidf_score * word_vector  # Combine TF-IDF and Word2Vec embeddings

    doc_embeddings = np.load("science_dataset/word_embedding/combined_embeddings_model_min_count=3,vector_size=100,window=20,sg=1.npy")  # Load combined document embeddings

    similarity_scores = cosine_similarity([query_vector], doc_embeddings)  # Calculate similarity scores

    results = list(enumerate(similarity_scores[0]))
    sorted_results = sorted(results, key=lambda x: x[1], reverse=True)

    top_k_results = []
    for res in sorted_results[:10]:
        top_k_results.append(tokenized_docs[res[0]])
    return top_k_results

In [27]:
def get_relevant_docs_from_qrels(query_id):
    key_str = str(query_id)
    value = links[key_str]
    return value

In [28]:
def calculate_mrr(relevant_docs, retrieved_docs):
    """
    Calculate the reciprocal rank for a single query.
    
    :param relevant_docs: A set of relevant document IDs.
    :param retrieved_docs: A list of retrieved document IDs.
    :return: The reciprocal rank for the query.
    """
    for rank, doc_id in enumerate(retrieved_docs, start=1):
        if doc_id in relevant_docs:
            return 1 / rank
    return 0

In [29]:
def calculate_map(relevant_documents_id, retrieved_documents_id, k):
    precision_sum = 0.0
    relevant_count = 0

    if len(relevant_documents_id) == 0 or len(retrieved_documents_id) == 0:
        return 0.0

    retrieved_documents_id = retrieved_documents_id[:k]

    for i, doc_id in enumerate(retrieved_documents_id, 1):
        if doc_id in relevant_documents_id:
            relevant_count += 1
            precision_sum += relevant_count / i

    avg_precision = precision_sum / min(len(relevant_documents_id), k)
    return avg_precision

In [30]:
def myRunQueries(tokenized_queries):
    ap_at_ak = 0
    total_reciprocal_rank = 0
    
    for query in tokenized_queries:
        print("Query ID:", query['query_id'])
        
        relevant = get_relevant_docs_from_qrels(query['query_id'])
        retrieved = runQuery(query['query_content'])
        retrieved_ids = []
        for doc in retrieved:
            retrieved_ids.append(doc['doc_id'])
            
        print("relevant: ", relevant)
        print("retrieved: ", retrieved_ids)
        
        map_score = calculate_map(relevant, retrieved_ids, 10)
        print("MAP:", map_score)
        print('-------------------------------------------------------------------------')

        ap_at_ak += map_score
        
        reciprocal_rank = calculate_mrr(relevant, retrieved_ids)
        total_reciprocal_rank += reciprocal_rank
    
    map_result = ap_at_ak / len(tokenized_queries)
    mrr = total_reciprocal_rank / len(tokenized_queries)
    
    print("Mean Average Precision (MAP):", map_result)
    print("Mean Reciprocal Rank (MRR):", mrr)
    
    return map_result

In [31]:
# finalResult = myRunQueries(tokenized_queries)
# model_min_count=1,vector_size=50,window=10,sg=0.

Query ID: 0
relevant:  [17292]
retrieved:  [21258, 30165, 35436, 17292, 37379, 15653, 10413, 29474, 2983, 37371]
MAP: 0.25
-------------------------------------------------------------------------
Query ID: 1
relevant:  [8854]
retrieved:  [11806, 28412, 14019, 29698, 13014, 28700, 38384, 31155, 1679, 33687]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
relevant:  [15041]
retrieved:  [12332, 18246, 5586, 12955, 1160, 18247, 4918, 980, 3444, 26844]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 3
relevant:  [24955]
retrieved:  [18785, 24955, 17422, 15949, 27103, 21074, 18203, 2736, 23620, 21646]
MAP: 0.5
-------------------------------------------------------------------------
Query ID: 4
relevant:  [2239]
retrieved:  [21680, 19622, 31595, 21681, 738, 1955, 14744, 37113, 25286, 5974]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 5
relevant:  [5559,

In [34]:
# finalResult = myRunQueries(tokenized_queries)
# model_min_count=3,vector_size=100,window=5,sg=1 best result

Query ID: 0
relevant:  [17292]
retrieved:  [15653, 14676, 6843, 21039, 30165, 28092, 33330, 17870, 2983, 37371]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 1
relevant:  [8854]
retrieved:  [11806, 13014, 14019, 28412, 29698, 25577, 38384, 38397, 27446, 7094]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
relevant:  [15041]
retrieved:  [12332, 12955, 5586, 26487, 18246, 980, 4953, 195, 4918, 1708]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 3
relevant:  [24955]
retrieved:  [24955, 27103, 21074, 18785, 17775, 18203, 8074, 23620, 28844, 12299]
MAP: 1.0
-------------------------------------------------------------------------
Query ID: 4
relevant:  [2239]
retrieved:  [21680, 2239, 21681, 16228, 19622, 3144, 35440, 5974, 14744, 16292]
MAP: 0.5
-------------------------------------------------------------------------
Query ID: 5
relevant:  [5559, 55

In [38]:
# finalResult = myRunQueries(tokenized_queries)
# model_min_count=2,vector_size=100,window=10,sg=1

Query ID: 0
relevant:  [17292]
retrieved:  [15653, 6843, 14676, 21039, 37371, 2983, 28092, 30165, 2489, 17870]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 1
relevant:  [8854]
retrieved:  [11806, 13014, 14019, 29698, 25577, 28412, 38384, 7094, 27446, 38397]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
relevant:  [15041]
retrieved:  [5586, 12332, 12955, 980, 18246, 26487, 195, 26844, 1708, 4953]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 3
relevant:  [24955]
retrieved:  [24955, 27103, 21074, 17775, 18203, 22185, 13761, 22266, 18785, 27687]
MAP: 1.0
-------------------------------------------------------------------------
Query ID: 4
relevant:  [2239]
retrieved:  [21680, 2239, 21681, 16228, 21682, 3144, 28967, 5974, 19622, 22576]
MAP: 0.5
-------------------------------------------------------------------------
Query ID: 5
relevant:  [5559, 5

In [41]:
# finalResult = myRunQueries(tokenized_queries)
# model_min_count=3,vector_size=100,window=10,sg=1

Query ID: 0
relevant:  [17292]
retrieved:  [10146, 32568, 11444, 35436, 21102, 6843, 7768, 15653, 27455, 30165]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 1
relevant:  [8854]
retrieved:  [9099, 10779, 2757, 30671, 12485, 5910, 2274, 10865, 32599, 34421]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
relevant:  [15041]
retrieved:  [7317, 24494, 18988, 16965, 3148, 28980, 5223, 7894, 8082, 23883]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 3
relevant:  [24955]
retrieved:  [10146, 24955, 3982, 786, 18815, 6968, 33379, 22266, 23620, 7029]
MAP: 0.5
-------------------------------------------------------------------------
Query ID: 4
relevant:  [2239]
retrieved:  [26250, 9981, 31593, 35740, 9972, 10077, 25440, 21957, 25441, 4467]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 5
relevant:  [5559, 5560, 

In [54]:
# finalResult = myRunQueries(tokenized_queries)
# model_min_count=2,vector_size=100,window=20,sg=1 new best result

Query ID: 0
relevant:  [17292]
retrieved:  [15653, 6843, 37371, 28092, 17870, 2489, 2983, 17292, 21039, 14676]
MAP: 0.125
-------------------------------------------------------------------------
Query ID: 1
relevant:  [8854]
retrieved:  [13014, 11806, 25577, 14019, 29698, 38384, 27446, 7094, 31081, 28742]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
relevant:  [15041]
retrieved:  [5586, 12332, 12955, 18246, 980, 26487, 195, 1425, 192, 20770]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 3
relevant:  [24955]
retrieved:  [24955, 27103, 21074, 17775, 23076, 21138, 22185, 22266, 18785, 18203]
MAP: 1.0
-------------------------------------------------------------------------
Query ID: 4
relevant:  [2239]
retrieved:  [21680, 16228, 21681, 21682, 2239, 5974, 21653, 3144, 5975, 29906]
MAP: 0.2
-------------------------------------------------------------------------
Query ID: 5
relevant:  [5559, 5

In [60]:
# finalResult = myRunQueries(tokenized_queries)
# model_min_count=2,vector_size=100,window=30,sg=1

Query ID: 0
relevant:  [17292]
retrieved:  [15653, 28092, 17292, 2489, 37371, 6843, 17870, 2983, 1401, 21039]
MAP: 0.3333333333333333
-------------------------------------------------------------------------
Query ID: 1
relevant:  [8854]
retrieved:  [13014, 11806, 29698, 25577, 14019, 38384, 27446, 7094, 28412, 38397]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
relevant:  [15041]
retrieved:  [5586, 12332, 12955, 18246, 26487, 980, 20770, 195, 3444, 192]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 3
relevant:  [24955]
retrieved:  [24955, 27103, 21074, 17775, 23076, 18785, 21138, 23620, 8074, 22266]
MAP: 1.0
-------------------------------------------------------------------------
Query ID: 4
relevant:  [2239]
retrieved:  [21680, 2239, 3144, 28967, 38962, 21681, 29407, 16228, 10378, 19622]
MAP: 0.5
-------------------------------------------------------------------------
Query ID: 5
releva

In [64]:
finalResult = myRunQueries(tokenized_queries)
# model_min_count=3,vector_size=100,window=20,sg=1 new new best result

Query ID: 0
relevant:  [17292]
retrieved:  [15653, 6843, 17870, 2983, 37371, 28092, 17292, 21039, 14676, 30165]
MAP: 0.14285714285714285
-------------------------------------------------------------------------
Query ID: 1
relevant:  [8854]
retrieved:  [13014, 11806, 25577, 14019, 29698, 38384, 27446, 31081, 7094, 38397]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
relevant:  [15041]
retrieved:  [5586, 12332, 12955, 18246, 980, 26487, 195, 1425, 26844, 3444]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 3
relevant:  [24955]
retrieved:  [24955, 27103, 21074, 17775, 23076, 18785, 21138, 22266, 18203, 27687]
MAP: 1.0
-------------------------------------------------------------------------
Query ID: 4
relevant:  [2239]
retrieved:  [21680, 21681, 2239, 16228, 3144, 21682, 5974, 29407, 28967, 10349]
MAP: 0.3333333333333333
-------------------------------------------------------------------------