In [74]:
import numpy as np
import pandas as pd
import sys
import json
import pickle5 as pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Fetch the NLTK data packages into the local nltk_data directory (no-op when
# already present). 'stopwords' backs the stopwords.words('english') call used
# when tokenising documents and queries.
nltk.download('stopwords')
# NOTE(review): nothing in the visible cells uses 'punkt' -- tokenising is done
# with RegexpTokenizer. Confirm whether this download is still needed.
nltk.download("punkt")

from rank_bm25 import BM25Okapi

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ahmetsenturk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ahmetsenturk/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [75]:
# Binary read handles for the two pickled inputs, resolved relative to the
# notebook's working directory. They are consumed by the loading cell below.
topics_file = open("topics.pickle", mode="rb")
all_documents_file = open("all_documents.pickle", mode="rb")

In [76]:
# Deserialise the topics and the document collection from the handles opened in
# the previous cell, then release those handles -- they were previously left
# open for the lifetime of the kernel.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
try:
    topics_dict = pickle.load(topics_file)
    all_docs_dict = pickle.load(all_documents_file)
finally:
    # Close both files even if one of the loads fails.
    topics_file.close()
    all_documents_file.close()

In [77]:
# Wrap both unpickled objects in DataFrames so later cells can address them by column.
topics_df, all_docs_df = (
    pd.DataFrame(data=topics_dict),
    pd.DataFrame(data=all_docs_dict),
)

In [78]:
# Remove documents whose 'document_id' is not unique, modifying all_docs_df in place.
# keep=False discards *every* row of a duplicated id, not just the extra copies.
# NOTE(review): if one copy of each duplicated document should survive, this
# would need keep="first" -- confirm the intent.
# The index is not reset here; later cells address rows by position (iloc).
all_docs_df.drop_duplicates(subset=['document_id'], keep=False, inplace=True)

In [79]:
# Searchable text of a document: its abstract followed by its title, joined by
# a single space.
abstract_col = all_docs_df["abstract"]
title_col = all_docs_df["title"]
all_docs_df["text"] = abstract_col + " " + title_col

In [80]:
# Turn every document text into a list of lowercase word tokens with English
# stop words removed. The result is the corpus handed to BM25.
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

# Lowercase *before* the stop-word test: the NLTK English list is lowercase, so
# testing the raw token let capitalised stop words ("The", "Of", ...) slip
# through and end up in the index as "the", "of", ...
#
# Exactly one token list is produced per row -- an empty list if nothing
# survives -- because the scoring cell maps BM25 score positions back to
# all_docs_df rows by position. Skipping a row here would shift every later
# document id.
all_docs_splitted = []
for text in all_docs_df["text"]:
    lowered_tokens = (token.lower() for token in tokenizer.tokenize(text))
    all_docs_splitted.append(
        [token for token in lowered_tokens if token not in stop_words]
    )

In [82]:
# Build the BM25 (Okapi variant) scorer over the tokenised documents, using the
# library's default parameters. The scoring cell pairs the i-th score with the
# i-th row of all_docs_df, so the order of all_docs_splitted must match the
# row order of all_docs_df.
bm25 = BM25Okapi(all_docs_splitted)

In [87]:
# Score every document against every topic with BM25 and collect the rows in a
# TREC-run-like layout:
#   topic_id, "Q0", doc_id, rank, bm25 score, run tag
# Three frames are produced:
#   results        -- all topics
#   test_results   -- topics at even positions (0, 2, 4, ...)
#   train_results  -- topics at odd positions  (1, 3, 5, ...)
#
# Fixes relative to the previous version:
#   * test/train were built from each other's accumulator, so both ended up
#     containing (almost) every topic; they are now kept strictly apart.
#   * query tokens are lowercased before the stop-word test (the stop-word
#     list is lowercase, so capitalised stop words used to get through).
#   * frames are concatenated once at the end instead of re-copying the
#     growing result on every iteration.

# BM25 scores are positional: score i belongs to the i-th row of all_docs_df.
# The first column of all_docs_df is used as the document id, as before.
# to_numpy() drops the pandas index so the ids are not re-aligned by label.
doc_ids = all_docs_df.iloc[:, 0].to_numpy()

per_topic_frames = []
for topic_index in range(len(topics_df.index)):
    topic = topics_df.iloc[topic_index]
    query = topic["query"] + " " + topic["question"] + " " + topic["narrative"]

    lowered_tokens = (token.lower() for token in tokenizer.tokenize(query))
    filtered_query = [token for token in lowered_tokens if token not in stop_words]

    query_results = bm25.get_scores(filtered_query)

    # Scalars are broadcast to the length of the score array.
    # NOTE(review): topic ids are assumed to be 1..N in row order of topics_df.
    # NOTE(review): "rank" is a constant "0" placeholder, not an actual ranking.
    query_results_df = pd.DataFrame({
        "topic_id": topic_index + 1,
        "temp1": "Q0",
        "doc_id": doc_ids[:len(query_results)],
        "rank": "0",
        "bm25 results": query_results,
        "standard": "STANDARD",
    })
    per_topic_frames.append(query_results_df)

# Even-positioned topics form the test split, odd-positioned ones the train split.
test_frames = per_topic_frames[0::2]
train_frames = per_topic_frames[1::2]

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
results = pd.concat(per_topic_frames, axis=0) if per_topic_frames else pd.DataFrame()
test_results = pd.concat(test_frames, axis=0) if test_frames else pd.DataFrame()
train_results = pd.concat(train_frames, axis=0) if train_frames else pd.DataFrame()
    

In [89]:
# Write the scores for all topics: one row per line, each value rendered with
# "%s" and separated by savetxt's default delimiter.
all_rows = results.values
np.savetxt("./results.txt", all_rows, fmt="%s")

In [90]:
# Write the test and train splits in the same plain-text layout as results.txt.
for out_path, split_frame in (
    ("./test_results.txt", test_results),
    ("./train_results.txt", train_results),
):
    np.savetxt(out_path, split_frame.values, fmt="%s")