In [1]:
import numpy as np
import pandas as pd
import sys
import json
import pickle5 as pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Open both pickled inputs in binary read mode; the handles are consumed
# by the loading cell that follows.
topics_file = open("./topics.pickle", mode="rb")
all_documents_file = open("./all_documents.pickle", mode="rb")

In [3]:
# Deserialize the topics and the document collection, then close both file
# handles (the `with` block closes them even if unpickling fails), so the
# notebook does not leak open file descriptors.
# NOTE: unpickling executes arbitrary code embedded in the file -- only load
# pickles that come from a trusted source.
with topics_file, all_documents_file:
    topics_dict = pickle.load(topics_file)
    all_docs_dict = pickle.load(all_documents_file)

In [4]:
# Wrap the unpickled objects in DataFrames for column-based access below.
topics_df = pd.DataFrame(data=topics_dict)
all_docs_df = pd.DataFrame(data=all_docs_dict)

In [5]:
# keep=False discards *every* row whose document_id occurs more than once,
# i.e. no representative of a duplicated id survives.
# NOTE(review): confirm that dropping all copies (rather than keeping one)
# is the intended behaviour.
all_docs_df = all_docs_df.drop_duplicates(subset=["document_id"], keep=False)

In [6]:
# Build the text that gets indexed: abstract followed by title.
# Missing values are replaced by "" first; otherwise a single missing
# abstract or title turns the whole concatenation into NaN, which
# TfidfVectorizer rejects as an invalid document.
all_docs_df["text"] = (
    all_docs_df["abstract"].fillna("") + " " + all_docs_df["title"].fillna("")
)

In [7]:
# Word-level TF-IDF model with English stop words removed; fitting and
# transforming in one step yields one sparse row per document.
tf_vectorizer = TfidfVectorizer(
    stop_words="english",
    analyzer="word",
)
all_docs_vec = tf_vectorizer.fit_transform(all_docs_df["text"])

In [8]:
# Vocabulary learned by the vectorizer, as a list of tokens.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when available and fall back to the old
# method on older installations. list() keeps the result type unchanged.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tfidf_tokens = list(tf_vectorizer.get_feature_names_out())
else:
    tfidf_tokens = tf_vectorizer.get_feature_names()

In [9]:
# Score every document against every topic and collect the rows in a
# six-column layout: topic_id, "Q0", doc_id, rank, cosine_sim, "STANDARD".
#
# Per-topic frames are gathered in lists and concatenated once at the end;
# concatenating onto a growing frame inside the loop copies all previous
# rows on every iteration (quadratic in the number of topics).

# The document ids (second column of all_docs_df) are identical for every
# topic, so extract them once instead of rebuilding the list per topic.
doc_ids = all_docs_df.iloc[:, 1].to_numpy()

all_frames = []
train_frames = []
test_frames = []
for topic_index in range(len(topics_df.index)):
    topic = topics_df.iloc[topic_index]
    # The query text is the concatenation of the three topic fields.
    query = topic["query"] + " " + topic["question"] + " " + topic["narrative"]
    query_vec = tf_vectorizer.transform([query])
    # One similarity score per document, flattened to 1-D.
    query_results = cosine_similarity(all_docs_vec, query_vec).reshape((-1,))
    # Scalars are broadcast to the length of the array-valued columns.
    query_results_df = pd.DataFrame(
        {
            "topic_id": topic_index + 1,  # topic ids are 1-based positions
            "temp1": "Q0",
            "doc_id": doc_ids,
            "rank": "0",
            "cosine_sim": query_results,
            "standard": "STANDARD",
        }
    )
    all_frames.append(query_results_df)
    # Topics at even positions go to the train split, odd positions to test.
    if topic_index % 2 == 0:
        train_frames.append(query_results_df)
    else:
        test_frames.append(query_results_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
results = pd.concat(all_frames, axis=0) if all_frames else pd.DataFrame()
train_results = pd.concat(train_frames, axis=0) if train_frames else pd.DataFrame()
test_results = pd.concat(test_frames, axis=0) if test_frames else pd.DataFrame()

In [11]:
# Write every row as whitespace-separated text, one line per row.
np.savetxt(fname="./results.txt", X=results.values, fmt="%s")

In [12]:
# Persist the two splits in the same text layout as the full result file
# (test first, then train).
for out_path, split_frame in (
    ("./test_results.txt", test_results),
    ("./train_results.txt", train_results),
):
    np.savetxt(out_path, split_frame.values, fmt="%s")