## Assignment 2 
### Jiaxuan  a1876484

### 1. Reading dataset and pre-processing

In [66]:
import json
import os
import csv
import numpy as np
import pandas as pd
import random
import string
from collections import defaultdict
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import preprocess_string
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import bigrams
from nltk import pos_tag
from collections import Counter

def preprocess_text(text):
    """
    Preprocess the text input for both dataset articles and queries.

    text: Text in string.
    """
    lemmatizer = WordNetLemmatizer() # define lemmatizer
    stop_words = set(stopwords.words('english')) # use stop_words
    
    tokens = word_tokenize(text)
    tokens = [t.lower() for t in tokens if t.lower() not in stop_words] # eliminate the stopwords
    tokens = [t for t in tokens if t not in string.punctuation] # eliminate the stopwords
    tokens = [t for t in tokens if not t.isnumeric()] # eliminate the single numbers to clean the text, keep the words like COVID-19.
    tokens = [lemmatizer.lemmatize(t) for t in tokens] # lemmatize rest words
    return " ".join(tokens) # return a string after preprocess


def read_and_process_data(file_path, num_samples = 1000):
    """
    Randomly load and process the dataset and save in a dataframe in the end.

    file_path: Path to the folder containing the JSON files
    num_samples: Number of samples to read from the folder (default: 10000)
    """
    files = os.listdir(file_path) # get all the file names in the folder
    sample = random.sample(files, num_samples) # randomly pick files with the number of samples
    df_sample = pd.DataFrame(columns=['paper_id', 'paragraphs'])# create a dataframe to save the aritcle id and paragraphs 
    for file in sample: # read files, extract paper_id and texts in bodytext
        with open(file_path + file ,encoding = 'utf-8') as f:
            data = json.load(f)
        paperid = data['paper_id']
        bodytext = ''
        for text in data['body_text']:
            text = text['text']
            bodytext += text
        df_sample.loc[len(df_sample.index)] = [paperid , bodytext] # add a line with paper_id and all the bodytext in it.
    df_sample["paragraphs"] = df_sample["paragraphs"].apply(preprocess_text)# preprocess all the paragraphs.
    return df_sample
# Here we decided to use a dataframe to save the text as sentences, because we will use word2vec to transform them into word embeddings, 
# so the "distance" between them is very helpful and necessary

### 2. Named Entity Recognition and Knowledge Base

In [67]:
def create_knowledge_base(articles):
    """
    Use NER to create KB, extract all the NEs that are linked by 'also named', 'also called' and 'also known as'

    articles: Articles after pre-process
    """
    nlp = spacy.load("en_core_sci_sm") # more accurate extraction of NER using spcay's medical data
    kb = {}
    def add_to_kb(key: str, value: str): # define adding the detected words to the dictionary
        if key not in kb:
            kb[key] = [value]
        elif value not in kb[key]:
            kb[key].append(value)

    for _, paragraphs in articles:
        doc = nlp(paragraphs)
        for token in doc:
            # check for aliases using "also/sometines known as" pattern
            if token.lower_ in {"also", "sometimes"}:
                conj = token.head
                if conj.lower_ in {"known","called","named"}:
                    subject = conj.head
                    obj = [t for t in conj.rights if t.dep_ in {"dobj", "attr"}]
                    # add the word after lemmatizer and lower
                    add_to_kb(subject.lemma_.lower(), obj[0].lemma_.lower()) 

            # check for aliases using "or" or "cc"pattern
            if token.lower_ == "or" and token.dep_ == "cc":
                left = token.lefts
                right = token.rights
                for left_item in left:
                    for right_item in right:
                        # determine whether the left and right lexicals are consistent with pos_tag
                        if (left_item.dep_ in {"conj", "appos"} and
                            right_item.dep_ in {"conj", "appos"} and
                            left_item.pos_ in {"NOUN", "PROPN"} and
                            right_item.pos_ in {"NOUN", "PROPN"}):
                            # add the word after lemmatizer and lower
                            add_to_kb(left_item.lemma_.lower(), right_item.lemma_.lower())

    return kb

#Here I used the simpler method of finding conjunctions to find aliases, 
#but to be more precise I added a lexical determination so that only nouns would be added to the KB dictionary

### 3. Indexing method

In [63]:
# 3. Article embeddings
def average_word_vectors(words, model, vocabulary, num_features):
    """
    Calculate the average word vectors for a list of words.

    words: The input list of words
    model: Word2Vec model trained on the paragraphs
    vocabulary: Set of words in the model's vocabulary
    num_features: Size of the word embeddings same to the model size
    """
    feature_vector = np.zeros((num_features,), dtype="float32") # initialize an empty feature vector with the same size as the word embeddings
    n_words = 0 # initialize a counter for the number of words in the list that are in the model's vocabulary
    for word in words: # iterate through the words in the list
         
        if word in vocabulary: # if the word is in the model's vocabulary, update the feature vector and counter
            n_words += 1
            feature_vector = np.add(feature_vector, model.wv[word])
            
    # if there are any valid words in the list, divide the feature vector by the number of valid words
    if n_words:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

def create_article_embeddings(processed_data, embedding_size=200):
    """
    Create article embeddings using Word2Vec.

    processed_data: DataFrame containing the processed articles
    embedding_size: Size of the word embeddings (default: 200)
    """
    # train the Word2Vec model
    #model = Word2Vec([paragraph.split() for paragraph in processed_data["paragraphs"]], vector_size=embedding_size, min_count=3,window=5,workers=4)
    model = Word2Vec.load("word2vec.model")
    # compute the average embeddings for each article
    article_embeddings = [average_word_vectors(article.split(), model, set(model.wv.index_to_key), 200) for article in processed_data["paragraphs"]]
    #model.save("word2vec.model")
    return model, article_embeddings
# Here I use Inverted Index to implement article indexing, in average_word_vectors() we define the implementation of converting text into word vectors, 
# then generate the model and define the word vector output of all articles separately in create_article_embeddings() to compare later with word vecters of query words, 
# And I use the content of 20,000 files as the training content of my model, the size of the model is about 650MB.

### 4. Text matching utility

In [86]:
# 4. Text matching utility

    
def match_articles(query, model,processed_data, article_embeddings, kb, top_k=10):
    """
    Match articles based on the given query.

    query: The input query (question)
    model: Word2Vec model trained on the paragraphs
    processed_data: The dataframe of dataset to find the cord_uid
    article_embeddings: Precomputed article embeddings
    kb: Knowledge Base containing named entities and their associated terms
    top_k: Number of top matches to return (default: 3)
    """
    
    # preprocess the query and find related entities from the KB
    query = preprocess_text(query)
    query_tokens = word_tokenize(query) # tokenize words to get pos_tag and detect each if in kb
    query_tokens = pos_tag(query_tokens) # use pos_tag here to extract the "key words" including Noun, verb, adjective in the query
    query_entities = [word[0] for word in query_tokens if word[1] in ['NN','NNP','NNS','NNPS','JJ','JJR','JJS','VB','VBG','VBD','VBN','VBP','VBZ']]
    
    # find related entities from the KB for each token in the query
    for token in query_tokens:
        if token[0] in kb and token[1] in ['NN','NNP','NNS','NNPS']:# detect if the noun in the token is in the KB
            query_entities += kb[token[0]] # add the aliases into the query_entities
    
    # compute the query embedding after combined associated names in kb
    query_embedding = average_word_vectors(query_entities, model, set(model.wv.index_to_key), 200)
    
    # calculate the cosine similarity between the query embedding and the article embeddings
    cos_similarities  = cosine_similarity([query_embedding], article_embeddings)[0]
    
    # get the indices of the top_k most similar articles
    top_n_indices = np.argsort(cos_similarities)[-top_k:][::-1]  # Top 3 most similar articles
    
    # create a list of the top_k most similar articles with their similarity scores
    top_n_similarities = cos_similarities[top_n_indices]   
    
    # extract the snippet from the article
    snippet = []
    for index in top_n_indices:
        # read the file again to extract each part of body text
        file = "C:/Users/17820/Downloads/pmc_json/"+processed_data.iloc[index]['paper_id']+'.xml.json'
        with open(file ,encoding = 'utf-8') as f:
            data = json.load(f)
        paragraph = []
        for text in data['body_text']:
            text = text['text']
            sentences = preprocess_text(text) # reprocess each text in the articles to make them embedding later
            paragraph.append(sentences)
        
        # compute each part of text to snippet embedding
        snippet_embeddings = [average_word_vectors(sentences.split(), model, set(model.wv.index_to_key), 200) for sentences in paragraph]
        
        # calculate again the cosine similarity between the query embedding and the snippet embeddings
        cos_similarities_snippet = cosine_similarity([query_embedding], snippet_embeddings)[0]
        
        # use np.argsort() funtion to let each snippet in the article be sorted according to cos_similarities_snippet from largest to smallest
        snippet_index = np.argsort(cos_similarities_snippet)[-1:][::-1]
        
        # add the most related original version of snippet in each of three articles in the list
        snippet.append(data['body_text'][snippet_index[0]]['text'])
        
        #csv_path = 'C:/Users/17820/Downloads/metadata.csv'
        # prepare the three top articles' paper_ids for output
        paper_ids = [processed_data.iloc[index]['paper_id'] for index in top_n_indices]
        # prepare the three top articles' similarities for output
        similarities = [cos_similarities[index] for index in top_n_indices]
    return snippet,paper_ids,similarities 
    
def extract_answer(query, snippet):
    """
    Extract a specific answer from the snippet based on the question word.

    query: The input query (question)
    snippet: The snippet from the most relevant article
    """
    query_tags = pos_tag(word_tokenize(query))
    question_word = ""
    for word, pos in query_tags:
        if pos in ["WDT", "WP", "WP$", "WRB"]:
            question_word = word.lower()
            break
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(snippet)
    if question_word in ["who", "what", "which"]:
        for ent in doc.ents:
            if ent.label_ in ["PERSON", "ORG"]:
                return ent.text

    elif question_word == "when":
        for ent in doc.ents:
            if ent.label_ in ["DATE", "TIME"]:
                return ent.text

    elif question_word == "where":
        for ent in doc.ents:
            if ent.label_ in ["GPE", "LOC", "FAC"]:
                return ent.text

    elif question_word in ["how", "why"]:
        for sent in doc.sents:
            for token in sent:
                if token.dep_ == "ROOT":
                    return sent.text
    
    

#In the match_articles() function I combined the method of finding the article in the pd and the text in the article, this step is also the most important step of the whole system.
#First we extracted the noun, verb and adjective parts of the query, here I did not choose to use NER but use pos_tag because NER can not identify the key words in the question well, 
#it may be common nouns (face mask) or the most words (larggest), so extract them all can be better to get the semantics.
#Then for each keyword in the noun, I searched in the KB whether it has aliases and added to the tokens of the query, which is very helpful to improve the accuracy of the search and increase the weight of keywords.
#For similarity comparison I used cos_similarities to embedding the query and each atricle to get the three most matching articles, 
#and similarly for each paragraph in the atricle I used this method to find the most relevant snippet, snippet and their scores (similarity) as output

In [None]:
import textwrap
import torch
import pandas as pd
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
#device_available = torch.cuda.is_available()
device_available = False
from IPython.core.display import display, HTML
import seaborn as sns
import matplotlib.pyplot as plt

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
if device_available:
    model.cuda()

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

def get_dataset(csv_path):
    corpus = []
    csv_df = pd.read_csv(csv_path).dropna(subset=['authors', 'abstract']).drop_duplicates(subset='abstract')
    csv_df = csv_df[csv_df['abstract']!='Unknown']
    for ix,row in csv_df.iterrows():
        if row['abstract'] and not pd.isna(row['abstract']):
            temp_dict = dict()
            temp_dict['abstract'] = row['abstract']
            temp_dict['title'] = row['title']
            temp_dict['authors'] = row['authors']
            temp_dict['url'] = row['doi']
            temp_dict['publish_time'] = row['publish_time']
            corpus.append(temp_dict)
    return corpus

wrapper = textwrap.TextWrapper(width=80) 

corpus = get_dataset('https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/2020-03-27/metadata.csv')
def answer_question_dict(question, keyword=None, show_visualization=False):

    '''
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer. Prints them out.
    '''
    # select corpus
    answer_text = corpus

    # Initializing answers list
    answers = {}
    min_score = 0
    counter = 0 # for stopping iterations earlier
    
    for answer_option in answer_text:
        if keyword and keyword not in answer_option['abstract']:
        continue

      # ======== Tokenize ========
      # Apply the tokenizer to the input text, treating them as a text-pair.
        input_ids = tokenizer.encode(question, answer_option['abstract'],max_length=512)

      # Report how long the input sequence is.
      #print('Query has {:,} tokens.\n'.format(len(input_ids)))

      # ======== Set Segment IDs ========
      # Search the input_ids for the first instance of the `[SEP]` token.
        sep_index = input_ids.index(tokenizer.sep_token_id)

      # The number of segment A tokens includes the [SEP] token istelf.
        num_seg_a = sep_index + 1

      # The remainder are segment B.
        num_seg_b = len(input_ids) - num_seg_a

      # Construct the list of 0s and 1s.
        segment_ids = [0]*num_seg_a + [1]*num_seg_b
    

      # There should be a segment_id for every input token.
        assert len(segment_ids) == len(input_ids)

      # ======== Evaluate ========
      # Run our example question through the model.
        
        input_ids_tensor = torch.tensor([input_ids])
        segment_ids_tensor = torch.tensor([segment_ids])
        if device_available:
            input_ids_tensor = input_ids_tensor.to('cuda:0')
            segment_ids_tensor = segment_ids_tensor.to('cuda:0')

        start_scores, end_scores = model(input_ids_tensor, # The tokens representing our input text.
                                  token_type_ids=segment_ids_tensor) # The segment IDs to differentiate question from answer_text
    
      # only review answers with score above threshold
        score = round(torch.max(start_scores).item(), 3)

        if score>min_score and score>0:

        # ======== Reconstruct Answer ========
        
        # Find the tokens with the highest `start` and `end` scores.
            answer_start = torch.argmax(start_scores)
            answer_end = torch.argmax(end_scores)


        # Get the string versions of the input tokens.
            tokens = tokenizer.convert_ids_to_tokens(input_ids)

        # Start with the first token.
            answer = tokens[answer_start]

        # Select the remaining answer tokens and join them with whitespace.
            for i in range(answer_start + 1, answer_end + 1):

                # If it's a subword token, then recombine it with the previous token.
                if tokens[i][0:2] == '##':
                    answer += tokens[i][2:]

                # Otherwise, add a space then the token.
                else:
                    answer += ' ' + tokens[i]

        # ======== Add Answer to best answers list ========

            if len(answers)>4:
                min_score = min([d for d in answers.keys()])

            if len(answers)==10:
                answers.pop(min_score)
            answers[score] = [answer, score, '<a href="https://doi.org/'+str(answer_option['url'])+'" target="_blank">' + str(answer_option['title']) +'</a>', answer_option['abstract'], answer_option['publish_time']]

            visualization_start = max(answer_start-20,0)
            visualization_end = min((answer_end+1)+20,len(tokens))
            # Variables needed for graphs
            s_scores = start_scores.cpu().detach().numpy().flatten()
            e_scores = end_scores.cpu().detach().numpy().flatten()

            # We'll use the tokens as the x-axis labels. In order to do that, they all need
            # to be unique, so we'll add the token index to the end of each one.
            token_labels = []
            for (i, token) in enumerate(tokens):
                token_labels.append('{:} - {:>2}'.format(token, i))
            answers[score] = [answer, score, '<a href="https://doi.org/'+str(answer_option['url'])+'" target="_blank">' + str(answer_option['title']) +'</a>', answer_option['abstract'], answer_option['publish_time'], s_scores, e_scores, token_labels, visualization_start, visualization_end]

    # Return dataframe with relevant data
    df_columns = ['Answer', 'Confidence', 'Title', 'Abstract', 'Published', 's_scores', 'e_scores', 'token_labels', 'visualization_start', 'visualization_end']
    df = pd.DataFrame.from_dict(answers, orient='index',columns = df_columns)
    df.sort_values(by=['Confidence'], inplace=True, ascending=False)
    return df

### 6. Simple user interface

In [87]:
def main():
    # Load data and preprocess
    dataset_folder = "C:/Users/17820/Downloads/pmc_json/"
    processed_data = read_and_process_data(dataset_folder)
    #kb = create_kb(processed_data)
    kb = {}
    model, article_embeddings = create_article_embeddings(processed_data)
    
    # I built a simple user input interface, so that you can freely select the question input and get the result until you type 'quit'
    while True:
        query = input("Enter your question (or 'quit' to exit): \n")
        if query.strip().lower() == 'quit':
            break

        snippet,paper_ids,similarities = match_articles(query, model, processed_data,article_embeddings, kb)
        answer = []
        for s in snippet:
            ans = extract_answer(query,s)
            answer.append(ans)
        answer = Counter(answer)
        answer = list(answer.keys())[0]
        print(f"answer is {answer}")
        print('------------------------------------------------------------------------------')

if __name__ == "__main__":
    main()
    
# For my custom questions, the answer system gave some more relevant answers, in terms of improving performance, 
#using larger data, more accurate data cleaning and using pre-trained models are expected to improve the accuracy and score

Enter your question (or 'quit' to exit): 
Where the first case of COVID-19 was found?
answer is China
------------------------------------------------------------------------------
Enter your question (or 'quit' to exit): 
When the first case of COVID-19 was found?
answer is 26 March 2020
------------------------------------------------------------------------------


KeyboardInterrupt: Interrupted by user

### 7. References

In [None]:
#TREC-COVID Qrels 1, Retrieved from https://ir.nist.gov/covidSubmit/data.html#cumu