In [20]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from scipy import sparse
import csv
import pickle
import numpy as np
from gensim.models import Word2Vec
import nltk
from sklearn.preprocessing import normalize


In [2]:
# Read the processed and the original document tables, replacing missing
# values with empty strings right after loading.
clean_docs = pd.read_csv('science_dataset/processed_documents.csv').fillna('')
original_docs = pd.read_csv('science_dataset/original_documents.csv').fillna('')
# The processed queries are loaded as-is (no missing-value replacement).
clean_queries = pd.read_csv("science_dataset/processed_queries.csv")

In [3]:
# Parse the qrels file (one JSON object per line) and build a lookup from the
# query id, as a string, to that query's list of relevant passage ids.
json_qrles_list = []  # every parsed qrels record, in file order
links = {}            # str(qid) -> answer_pids
qrels = []            # defined for compatibility; nothing is appended to it here
with open('science_dataset/qrels.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # json.loads would raise on them.
        if not line.strip():
            continue
        data = json.loads(line)
        json_qrles_list.append(data)

        # Keys are stringified so lookups do not depend on the JSON type of qid.
        qid = str(data['qid'])
        answer_pids = data['answer_pids']
        links[qid] = answer_pids

In [4]:
# Load the pickled `tokenized_docs` object. Later cells index it by position
# and read each entry's 'doc_id' key.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# were produced by this project.
with open('science_dataset/word_embedding/tokenized_docs.pkl', 'rb') as file:
    tokenized_docs = pickle.load(file)


In [5]:
# Load the pickled embedding model. The file name records the settings it was
# saved under (min_count=3, vector_size=100, window=20, sg=1).
# Presumably a gensim Word2Vec model (it is used through `.wv` and
# `.vector_size` below) — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
# Alternative model file, currently disabled:
# with open('science_dataset/word_embedding/model1.pkl', 'rb') as file:
#     model1 = pickle.load(file)
with open('science_dataset/word_embedding/model_min_count=3,vector_size=100,window=20,sg=1.pkl', 'rb') as file:
    model = pickle.load(file)
    

In [6]:
# Load the vectorizer object
with open('science_dataset/objects/vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)
# Load the vectorized_docs object
with open('science_dataset/objects/vectorized_docs.pkl', 'rb') as file:
    vectorized_docs = pickle.load(file)

In [7]:
# Load the pickled `tokenized_queries` object. The evaluation loop iterates
# over it and reads each entry's 'query_id' and 'query_content' keys.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('science_dataset/word_embedding/tokenized_queries.pkl', 'rb') as file:
    tokenized_queries = pickle.load(file)


In [8]:
# Load the precomputed embedding array saved for this model configuration.
# Presumably one row per entry of tokenized_docs, in the same order (rows are
# paired with tokenized_docs[i] when the upload payload is built) — TODO confirm.
combined_embeddings = np.load("science_dataset/word_embedding/combined_embeddings_model_min_count=3,vector_size=100,window=20,sg=1.npy")

In [64]:
# Prepare data for the Pinecone upsert: keep only the documents whose
# embedding has at least one non-zero component.
# `combined_embeddings` is the array loaded by the np.load cell above.
# Assumes it is 2-D (one flat vector per row) and aligned by position with
# tokenized_docs — TODO confirm.
# The non-zero test is evaluated once, vectorized, and reused for both lists so
# ids and embeddings cannot drift out of step.
nonzero_mask = np.any(combined_embeddings, axis=1)
ids = [str(tokenized_docs[i]['doc_id']) for i in np.flatnonzero(nonzero_mask).tolist()]
embeddings = combined_embeddings[nonzero_mask].tolist()



KeyboardInterrupt: 

In [16]:
import os

import pinecone

from pinecone import Pinecone

# Read the API key from the environment instead of embedding it in the notebook.
# NOTE(review): a key was previously hard-coded in this cell in plain text;
# treat it as leaked and rotate it.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before running this cell."
    )

pc = Pinecone(api_key=pinecone_api_key)
# Handle to the index that the upsert and query cells use.
index = pc.Index("science-word-embedding")

In [69]:
# Number of vectors sent per upsert request (adjust as needed)
batch_size = 500
# Ceiling division: how many requests the whole upload takes.
total_batches = (len(ids) + batch_size - 1) // batch_size

# Walk ids and embeddings in matching windows and upload one window at a time.
for i in range(0, len(ids), batch_size):
    batch_ids = ids[i:i + batch_size]
    batch_embeddings = embeddings[i:i + batch_size]
    vectors = [
        {"id": id_, "values": vec}
        for id_, vec in zip(batch_ids, batch_embeddings)
    ]
    index.upsert(vectors=vectors)
    print(f"Upserted batch {i//batch_size + 1}/{total_batches}")

print("Data upserted successfully.")

Upserted batch 1/688
Upserted batch 2/688
Upserted batch 3/688
Upserted batch 4/688
Upserted batch 5/688
Upserted batch 6/688
Upserted batch 7/688
Upserted batch 8/688
Upserted batch 9/688
Upserted batch 10/688
Upserted batch 11/688
Upserted batch 12/688
Upserted batch 13/688
Upserted batch 14/688
Upserted batch 15/688
Upserted batch 16/688
Upserted batch 17/688
Upserted batch 18/688
Upserted batch 19/688
Upserted batch 20/688
Upserted batch 21/688
Upserted batch 22/688
Upserted batch 23/688
Upserted batch 24/688
Upserted batch 25/688
Upserted batch 26/688
Upserted batch 27/688
Upserted batch 28/688
Upserted batch 29/688
Upserted batch 30/688
Upserted batch 31/688
Upserted batch 32/688
Upserted batch 33/688
Upserted batch 34/688
Upserted batch 35/688
Upserted batch 36/688
Upserted batch 37/688
Upserted batch 38/688
Upserted batch 39/688
Upserted batch 40/688
Upserted batch 41/688
Upserted batch 42/688
Upserted batch 43/688
Upserted batch 44/688
Upserted batch 45/688
Upserted batch 46/6

In [50]:
def runQuery(tokenized_query, top_k=10):
    """
    Embed a tokenized query and return the ids of its nearest documents.

    The query vector is the sum, over the query tokens, of each token's word
    vector scaled by that token's weight in the vectorizer's transform of the
    query. Tokens missing from either the embedding model or the vectorizer
    vocabulary are ignored. The sum is normalized before the index is queried.

    :param tokenized_query: Sequence of token strings.
    :param top_k: Number of matches to request from the index (default 10).
    :return: List of matched document ids converted to int, in the order the
             index returned them; an empty list when no token could be embedded.
    """
    # Accumulate the weighted word vectors.
    query_vector = np.zeros(model.vector_size)
    vectorized_query = vectorizer.transform([' '.join(tokenized_query)])
    for word in tokenized_query:
        if word in model.wv and word in vectorizer.vocabulary_:
            tfidf_score = vectorized_query[0, vectorizer.vocabulary_[word]]
            query_vector += tfidf_score * model.wv[word]

    # Nothing contributed (empty query or only unknown tokens): an all-zero
    # vector has no direction, so skip the remote call and report no matches.
    if not np.any(query_vector):
        return []

    # Normalize, then convert to a plain list of Python floats for the client.
    query_vector = normalize([query_vector])[0].tolist()

    # Only the match ids are used, so the stored vector values are not requested.
    results = index.query(vector=query_vector, top_k=top_k, include_values=False)
    return [int(doc['id']) for doc in results['matches']]

In [46]:
# Example usage
tokenized_query = ["example", "query", "terms"]
retrieved_docs = runQuery(tokenized_query)
print("Retrieved documents:", retrieved_docs)


Retrieved documents: [293690, 293746, 294658, 302710, 294659, 144689, 194411, 318946, 170174, 306245]


In [30]:
def get_relevant_docs_from_qrels(query_id):
    """
    Fetch the qrels entry recorded for one query.

    :param query_id: Query identifier; it is converted with str() before the lookup.
    :return: The value stored in ``links`` under that key.
    :raises KeyError: If ``links`` has no entry for the query.
    """
    return links[str(query_id)]

In [31]:
def calculate_mrr(relevant_docs, retrieved_docs):
    """
    Reciprocal rank of the first relevant hit for one query.

    :param relevant_docs: Container of relevant document IDs (membership-tested).
    :param retrieved_docs: Ranked list of retrieved document IDs, best first.
    :return: 1 / rank of the first retrieved ID found in relevant_docs
             (ranks start at 1), or 0 when none of them is relevant.
    """
    # Lazily yields a score per relevant position; only the first one is consumed.
    hit_scores = (
        1 / position
        for position, candidate in enumerate(retrieved_docs, start=1)
        if candidate in relevant_docs
    )
    return next(hit_scores, 0)

In [32]:
def calculate_map(relevant_documents_id, retrieved_documents_id, k):
    """
    Average precision at k for a single query.

    Precision is sampled at every rank (within the first k results) where a
    relevant document appears for the first time; the sum is divided by
    min(number of distinct relevant documents, k).

    :param relevant_documents_id: Collection of relevant document IDs.
    :param retrieved_documents_id: Ranked list of retrieved document IDs, best first.
    :param k: Cut-off rank; only the first k retrieved IDs are considered.
    :return: Average precision as a float in [0, 1]; 0.0 when either input is
             empty or k is not positive.
    """
    # A non-positive cut-off has no ranks to score (and would otherwise divide
    # by zero or by a negative number).
    if k <= 0 or len(relevant_documents_id) == 0 or len(retrieved_documents_id) == 0:
        return 0.0

    relevant_set = set(relevant_documents_id)
    already_hit = set()  # relevant IDs credited so far
    precision_sum = 0.0

    for rank, doc_id in enumerate(retrieved_documents_id[:k], 1):
        # Credit each relevant document once: a repeated ID in the ranking
        # must not count as a new hit, or the score could exceed 1.
        if doc_id in relevant_set and doc_id not in already_hit:
            already_hit.add(doc_id)
            precision_sum += len(already_hit) / rank

    return precision_sum / min(len(relevant_set), k)

In [52]:
def myRunQueries(tokenized_queries):
    """
    Run every query against the index and report MAP@10 and MRR.

    For each query the relevant ids (from qrels) and the retrieved ids are
    printed together with the query's average precision; the averages over
    all queries are printed at the end.

    :param tokenized_queries: Sized iterable of mappings with the keys
                              'query_id' and 'query_content'.
    :return: Mean average precision over all queries (0.0 when there are none).
    """
    ap_at_ak = 0
    total_reciprocal_rank = 0

    for query in tokenized_queries:
        print("Query ID:", query['query_id'])

        relevant = get_relevant_docs_from_qrels(query['query_id'])
        retrieved_ids = list(runQuery(query['query_content']))

        print("relevant: ", relevant)
        print("retrieved: ", retrieved_ids)

        map_score = calculate_map(relevant, retrieved_ids, 10)
        print("MAP:", map_score)
        print('-------------------------------------------------------------------------')

        ap_at_ak += map_score
        total_reciprocal_rank += calculate_mrr(relevant, retrieved_ids)

    # Guard the averages so an empty query collection reports zeros instead of
    # raising ZeroDivisionError.
    query_count = len(tokenized_queries)
    map_result = ap_at_ak / query_count if query_count else 0.0
    mrr = total_reciprocal_rank / query_count if query_count else 0.0

    print("Mean Average Precision (MAP):", map_result)
    print("Mean Reciprocal Rank (MRR):", mrr)

    return map_result

In [53]:
# Evaluate every loaded query. The value displayed under this cell is the
# mean average precision that myRunQueries returns.
myRunQueries(tokenized_queries)

Query ID: 0
relevant:  [17292]
retrieved:  [15653, 6843, 17870, 2983, 37371, 28092, 21039, 17292, 14676, 30165]
MAP: 0.125
-------------------------------------------------------------------------
Query ID: 1
relevant:  [8854]
retrieved:  [13014, 11806, 25577, 14019, 29698, 38384, 27446, 31081, 7094, 28412]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 2
relevant:  [15041]
retrieved:  [5586, 12332, 12955, 18246, 980, 26487, 195, 1425, 26844, 3444]
MAP: 0.0
-------------------------------------------------------------------------
Query ID: 3
relevant:  [24955]
retrieved:  [24955, 27103, 21074, 17775, 23076, 18785, 21138, 22266, 18203, 27687]
MAP: 1.0
-------------------------------------------------------------------------
Query ID: 4
relevant:  [2239]
retrieved:  [21680, 21681, 2239, 16228, 3144, 21682, 5974, 29407, 28967, 10349]
MAP: 0.3333333333333333
-------------------------------------------------------------------------
Query ID: 5
r

0.1500387324487829