## Install and Import

In [None]:
!pip install -r requirements.txt

In [None]:
import nltk 
import numpy as np
import pandas as pd
import pickle
import nltk
import string
import collections
import re
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize 
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
import operator
nltk.download('stopwords')
nltk.download('punkt')
import operator
from sklearn.metrics.pairwise import cosine_similarity
import heapq
from nltk.corpus import wordnet as wn
from gensim.models import Word2Vec
from gensim.models import KeyedVectors 
from biobert_embedding.embedding import BiobertEmbedding
from rank_bm25 import BM25Okapi
from keybert import KeyBERT
import xml.etree.ElementTree as ET

def download_nltk_packages():
    """Request every NLTK resource this notebook relies on via ``nltk.download``."""
    for package in ('wordnet', 'punkt', 'stopwords', 'averaged_perceptron_tagger'):
        nltk.download(package)


download_nltk_packages()

## Preprocessing

In [None]:
# English stop words from NLTK, kept in a set for fast membership tests
STOP_WORDS = set(stopwords.words('english'))
# Placeholder; rebound to a DataFrame once topics.xml has been parsed
TOPICS = {}
# Spellings that replace_syn collapses into the single token "coronavirus"
CORONA_SYN = {
    "corona",
    "2019-ncov",
    "sarscov-2",
    "covid-19",
    "sars-cov-2",
    "sars-cov2",
    "sars-cov",
    "covid",
}
# Also accept "covid<p>19" for every ASCII punctuation character <p> (covid_19, covid/19, ...)
CORONA_SYN.update("covid" + mark + "19" for mark in string.punctuation)

In [None]:
# stop word removal
def remove_stopwords(word_tokens):
    """Return the tokens of ``word_tokens`` that are not in ``STOP_WORDS``.

    The relative order of the surviving tokens is preserved.
    """
    return [token for token in word_tokens if token not in STOP_WORDS]

# punctuation removal
def remove_punct(text):
    """Return ``text`` with punctuation blanked out.

    Every ASCII punctuation character, plus the typographic apostrophe (’),
    is replaced by a single space. Replacement is one-for-one, so the result
    has the same length as the input.
    """
    targets = string.punctuation + "’"
    # Two-argument maketrans maps each character of `targets` to a space.
    return text.translate(str.maketrans(targets, " " * len(targets)))
    
# synonym replace from CORONA_SYN along with other significant phrases
def replace_syn(text):
    """Normalise coronavirus synonyms in ``text`` to the token "coronavirus".

    Two passes are applied:

    1. Every occurrence of a known multi-word phrase ("wuhan virus",
       "chinese flu", "covid 19", "corona virus") is replaced.
    2. The text is tokenised with ``word_tokenize`` and every token found in
       ``CORONA_SYN`` is replaced.

    Args:
        text: Input string. Matching is case-sensitive and all phrases and
            synonyms are lower-case, so the caller is expected to lower-case
            the text first (``preprocess`` does).

    Returns:
        The tokens of the rewritten text joined by single spaces.
    """
    # str.replace rewrites *all* occurrences and is a no-op when the phrase is
    # absent. Applying the replacements cumulatively to the same string keeps
    # earlier substitutions and never drops text after a repeated phrase.
    for phrase in ('wuhan virus', 'chinese flu', 'covid 19', 'corona virus'):
        text = text.replace(phrase, 'coronavirus')

    return " ".join(
        "coronavirus" if w in CORONA_SYN else w for w in word_tokenize(text)
    )

In [None]:
# returns preprocess string formed by word tokens
def preprocess(text):
    """Clean ``text`` and return it as a space-joined string of tokens.

    Steps, in order: lower-case, replace coronavirus synonyms, blank out
    punctuation, delete digit runs, tokenise, drop stop words.

    A value whose type is not exactly ``str`` is printed and ``None`` is
    returned.
    """
    if type(text) is not str:
        print(text)
        return None

    cleaned = replace_syn(text.lower())
    cleaned = remove_punct(cleaned)
    cleaned = re.sub(r'\d+', '', cleaned)
    return " ".join(remove_stopwords(word_tokenize(cleaned)))

In [None]:
# Load the metadata, keep rows published in 2019 or later (rows without a
# publish_time are kept as well), preprocess title and abstract, and cache
# the result as a pickle.

metadata = pd.read_csv("metadata.csv")
metadata = metadata[
    metadata["publish_time"].apply(
        # publish_time starts with the year, e.g. "2020-03-11" -> 2020
        lambda stamp: True if pd.isnull(stamp) else int(stamp.split("-")[0]) >= 2019
    )
]
metadata = (
    metadata[['cord_uid', 'title', 'abstract']]
    .copy()
    # drop a row only when BOTH abstract and title are missing
    .dropna(subset=["abstract", "title"], how='all')
    # a remaining missing field becomes an empty string so preprocess gets a str
    .fillna('')
)
metadata["abstract"] = metadata["abstract"].apply(preprocess)
metadata["title"] = metadata["title"].apply(preprocess)
METADATA = metadata
METADATA.to_pickle('metadata.p')

In [None]:
# topics.xml read and store
# Read the topics (query, question and narrative) and store them in a dataframe
root = ET.parse('topics.xml').getroot()
topics = {}
for elem in root:
    number = elem.attrib.get("number")
    if not number:
        # elements without a "number" attribute are not stored
        continue
    # Element.getchildren() was removed in Python 3.9; list(elem) returns the
    # same direct children in document order.
    children = list(elem)
    # The fields are taken by position: 1st child = query, 2nd = question,
    # 3rd = narrative. A missing child yields None instead of an IndexError.
    topics[number] = {
        field: children[position].text if position < len(children) else None
        for position, field in enumerate(("QUERY", "QUESTION", "NARRATIVE"))
    }

TOPICS = pd.DataFrame.from_dict(topics, orient="index")

# Preprocess the topics
TOPICS = TOPICS.applymap(preprocess)

TOPICS.to_pickle("topics.pickle")


## Load Preprocessed Pickles

In [None]:
# topic number -> list of BM25 score arrays; filled in by the BM25 cells
topic_scores = {}

# Restore the preprocessed topics and metadata from their pickle files
with open('topics.pickle', 'rb') as topics_file:
    TOPICS = pickle.load(topics_file)

with open('metadata.p', 'rb') as metadata_file:
    METADATA = pickle.load(metadata_file)

# Load the saved word vectors with mmap='r'
wv = KeyedVectors.load("word2vec-all-c.wordvectors", mmap='r')

## Query Expansion & BM25


In [None]:
# word2vec is used to find words of similar meaning, which are then used to
# extend queries
def get_similars(querylist):
    """Return one expansion term for each query word known to the word vectors.

    For every word, the first neighbour returned by ``most_similar`` whose
    case-folded form differs from the word is taken. Words missing from the
    vocabulary are reported on stdout and skipped; a word with no differing
    neighbour contributes nothing (no empty strings are returned).

    Args:
        querylist: Iterable of query tokens.

    Returns:
        List of case-folded expansion terms, in query order.
    """
    # The global `wv` may be a full Word2Vec model (vectors live under `.wv`)
    # or a bare KeyedVectors object (no `.wv` in gensim 4); support both.
    vectors = getattr(wv, "wv", wv)

    similars = []
    for word in querylist:
        try:
            neighbours = vectors.most_similar(positive=[word])
        except KeyError:
            # Only an out-of-vocabulary lookup is expected here; any other
            # error is a real problem and is allowed to propagate.
            print(word + " not in vocab")
            continue

        candidate = next(
            (name.casefold() for name, _ in neighbours if name.casefold() != word),
            None,
        )
        if candidate is not None:
            similars.append(candidate)

    return similars

In [None]:
# topic number -> [query tokens, question tokens, narrative tokens]
extended = {}

# Topics are numbered 1, 2, ... by their row position in TOPICS
for topic_num, (topic_index, topic_row) in enumerate(TOPICS.iterrows(), start=1):
    print(topic_num)
    extended[topic_num] = []

    for field in ('QUERY', 'QUESTION', 'NARRATIVE'):
        tokenized_topic = topic_row[field].split(" ")
        # Uncomment below if you want to apply query expansion
        # tokenized_topic.extend(get_similars(tokenized_topic))
        extended[topic_num].append(tokenized_topic)

In [None]:
# get bm-25 scores between title - query, question, narrative

# The title index does not depend on the topic, so it is built once here
# instead of once per topic inside the loop.
corpus = METADATA['title']
tokenized_corpus = corpus.apply(lambda x: x.split())
bm25 = BM25Okapi(tokenized_corpus)

for topic_num in extended:
    print(topic_num)
    # Starts this topic's score list: [query, question, narrative] vs. titles
    topic_scores[topic_num] = [
        bm25.get_scores(tokens) for tokens in extended[topic_num][:3]
    ]

In [None]:
# get bm-25 scores between abstract - query, question, narrative

# The abstract index does not depend on the topic, so it is built once here
# instead of once per topic inside the loop.
corpus = METADATA['abstract']
tokenized_corpus = corpus.apply(lambda x: x.split())
bm25 = BM25Okapi(tokenized_corpus)

for topic_num in extended:
    print(topic_num)
    # Positions 0-2 hold the title scores written by the title cell (which
    # must run first). Assigning to the slice [3:] appends the abstract scores
    # on the first run and replaces them on a re-run, so executing this cell
    # twice no longer leaves duplicate score arrays behind.
    topic_scores[topic_num][3:] = [
        bm25.get_scores(tokens) for tokens in extended[topic_num][:3]
    ]


In [None]:
# sum up the scores (weighted sum)
weighted_bm25_sums = []
# weights of the scores, in the order the score arrays were stored:
# title vs. query/question/narrative, then abstract vs. query/question/narrative
weights = [0.12, 0.11, 0.11, 0.22, 0.22, 0.22]

for key in topic_scores:
    # One accumulator slot per document. The size comes from METADATA rather
    # than a hard-coded count, so the cell keeps working when the corpus
    # changes; a float array also makes the element-wise addition explicit.
    temp = np.zeros(len(METADATA))

    # zip pairs each score array with its weight; arrays beyond the six
    # weighted positions are ignored.
    for weight, scores in zip(weights, topic_scores[key]):
        temp += weight * scores
    weighted_bm25_sums.append(temp)

## Top 50 Documents

In [None]:
# Rank the documents of each topic by weighted BM25 score, then split the
# ranking into the top 50 (re-ranked later) and the remainder.
topic_dict = {}
topic_50 = {}
topic_rest = {}

for topicnum in range(1, 51):
    # cord_uid -> score; a repeated cord_uid keeps the score of its last row
    score_doc_dict = dict(zip(METADATA['cord_uid'], weighted_bm25_sums[topicnum - 1]))

    ranking = sorted(score_doc_dict.items(), key=operator.itemgetter(1), reverse=True)
    topic_dict[topicnum] = ranking
    topic_50[topicnum], topic_rest[topicnum] = ranking[:50], ranking[50:]

## Keyword Extraction & BioBert Embedding & Cosine Similarity

In [None]:
#initialize keyword extractor and biobert embedding models
# KeyBERT keyword extractor backed by the 'distilbert-base-nli-mean-tokens' model
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
# BioBERT embedder created with its default arguments; the re-ranking cell
# calls its sentence_vector() method
biobert = BiobertEmbedding()

In [None]:
# For the top X documents of every even-numbered topic:
#   1. summarise the abstract with up to N KeyBERT keywords (N = 100 here),
#   2. join the keywords into one pseudo-sentence,
#   3. embed that sentence with BioBERT,
#   4. rank the documents by cosine similarity to the topic's BioBERT
#      embedding and pickle the ranking as top_doc_<topic>.pickle.

for topic_num in topic_50:  # iterate each topic
    if int(topic_num) % 2 != 0:  # only even topics are used for testing
        continue

    topic_top_embedding = []
    # join query - question and narrative of the topic
    topic_query = " ".join(
        extended[topic_num][0] + extended[topic_num][1] + extended[topic_num][2]
    )
    # treat it as a sentence and get embedding
    topic_embedding = biobert.sentence_vector(topic_query).numpy()

    for doc_score in topic_50[topic_num]:
        # keyword extraction on the abstract of this cord_uid
        keyword_tuples = kw_extractor.extract_keywords(
            METADATA.loc[METADATA["cord_uid"] == doc_score[0], "abstract"].item(),
            keyphrase_ngram_range=(1, 1),
            stop_words=None,
            top_n=100,
        )
        keywords = [keytuple[0] for keytuple in keyword_tuples]
        # join keywords to form a sentence
        keyword_sentence = " ".join(keywords)
        # get embedding
        sentence_embedding = biobert.sentence_vector(keyword_sentence)
        topic_top_embedding.append(sentence_embedding.numpy())

    try:
        # calculate cosine similarities for the newly calculated embeddings of the documents
        cosine_similarities = cosine_similarity([topic_embedding], topic_top_embedding).flatten()
        cosine_similarities = enumerate(cosine_similarities)
        cosine_similarities = sorted(cosine_similarities, key=operator.itemgetter(1), reverse=True)
        # rank: map each position back to its document id
        sorted_doc_ids = [(topic_50[topic_num][index][0], score) for index, score in cosine_similarities]
        with open('top_doc_' + str(topic_num) + '.pickle', 'wb') as topicpickle:
            pickle.dump(sorted_doc_ids, topicpickle, pickle.HIGHEST_PROTOCOL)
    except Exception as exc:
        # Best effort per topic: report which topic failed *and why*, then
        # move on. `except Exception` (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop the run.
        print(topic_num, exc)
## Output to test file

In [None]:
# read similarity scores of the re-ranked top documents (even topics 2..50)
# top_50_sim must exist before it is filled: topic number -> [(doc id, cosine score), ...]
top_50_sim = {}
for i in range(2, 51, 2):
    with open('top_doc_' + str(i) + '.pickle', 'rb') as allpickle:
        top_50_sim[i] = pickle.load(allpickle)

# for all even topics, output the re-ranked documents' scores first, then the
# rest of the documents
with open('final.test', 'w') as outfile:
    for topic in topic_dict:
        if int(topic) % 2 != 0:
            continue

        reranked = top_50_sim[topic]
        for rank, tup in enumerate(reranked, start=1):
            # 25 is added to sync cosine similarity results with BM scores.
            # NOTE(review): this assumes no BM25 score of the remaining
            # documents exceeds 25 - confirm.
            s = str(topic) + " Q0 " + str(tup[0]) + " " + str(rank) + " " + str(tup[1] + 25) + " GAM-run1" + "\n"
            outfile.write(s)

        # Ranks continue directly after the re-ranked documents, so the rank
        # column has no gap regardless of how many documents were re-ranked.
        for rank, tup in enumerate(topic_rest[topic], start=len(reranked) + 1):
            s = str(topic) + " Q0 " + str(tup[0]) + " " + str(rank) + " " + str(tup[1]) + " GAM-run1" + "\n"
            outfile.write(s)

## Word2Vec Model Generator


In [None]:
# Collect every text used for training, in this order:
# abstracts, titles, topic queries, topic narratives, topic questions
texts = [
    *METADATA['abstract'].tolist(),
    *METADATA['title'].tolist(),
    *TOPICS["QUERY"].tolist(),
    *TOPICS["NARRATIVE"].tolist(),
    *TOPICS["QUESTION"].tolist(),
]

In [None]:
# preprocess and split into tokens to be able to feed it into the model training
# `processed` collects one list of tokenised sentences per input text; it has
# to be created here because the merge cell below reads it.
processed = []
for ii, st in enumerate(texts):
    if not isinstance(st, str):
        # Non-string entries cannot be tokenised; report their index and
        # skip them.
        print(ii)
        continue

    # sent_tokenize is reached through the already-imported nltk package (the
    # bare name `tokenize` was never imported in this notebook).
    sent = nltk.tokenize.sent_tokenize(st.lower())
    if sent:
        # strip the listed punctuation characters, then split on single spaces
        sentences = [re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]',
                            repl='',
                            string=x
                            ).strip().split(' ') for x in sent]
        # drop sentences that became empty after stripping
        sentences = [x for x in sentences if x != ['']]
        processed.append(sentences)


In [None]:
# Flatten the per-text sentence lists into one list of tokenised sentences
all_sentences = [sentence for text_sentences in processed for sentence in text_sentences]

In [None]:
#train word2vec model
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` - confirm the gensim version
# pinned in requirements.txt before upgrading.
model = Word2Vec(all_sentences, 
                 min_count=2,   # Ignore words that appear less than this
                 size=200,      # Dimensionality of word embeddings
                 workers=2,     # Number of processors (parallelisation)
                 window=6,      # Context window for words during training
                 iter=30)  # Number of training passes over the sentences

In [None]:
# Take the word vectors from the trained model; they are what gets saved below
word_vectors = model.wv

In [None]:
#save word2vec model
# Same file name that the "Load Preprocessed Pickles" cell passes to KeyedVectors.load
word_vectors.save("word2vec-all-c.wordvectors")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c30274b2-cddd-4d19-919a-3ecdb6dd5b3f' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>