# Assignment 2 
### Shan Shan Bianca Tan a1909709

## A. Information Retrieval system (1 person work)

## 1. Reading Datasets

In [3]:
# Importing libraries
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import spacy
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoModelForQuestionAnswering,pipeline
import string

#nltk.download('punkt')

### 1.1 Read Data

In [4]:
# Importing dataset
# Later cells look articles up through the `id` and `article` columns of this frame.
file = 'news_dataset.csv'
df = pd.read_csv(file, encoding = 'latin1')

In [5]:
# Function to reduce dataframe size
def reduce_df(df, sample_size=None):
    """Return a random sample of ``df``, or ``df`` itself when no size is given.

    Args:
        df (DataFrame): Frame to sample from.
        sample_size (int, optional): Number of rows to draw. A falsy value
            (``None`` or ``0``) returns ``df`` unchanged.

    Returns:
        DataFrame: ``sample_size`` rows drawn with a fixed ``random_state`` of 1,
        so repeated calls pick the same rows.
    """
    if not sample_size:
        return df
    return df.sample(n=sample_size, random_state=1)

In [6]:
# Only the preprocessing sanity check reads `rdf`; the question-answering cells query the full `df`.
rdf = reduce_df(df, 150) #  Take 150 samples

### 1.2 Reading question and answer dataset

In [7]:
# The question file is written as UTF-8 (appendix C1), so read it back with the
# same encoding; decoding it as latin1 would garble any non-ASCII character.
test_qa = pd.read_csv('test_questions_and_answers.csv', encoding='utf-8')

# Format questions as (article id, question, expected answer) tuples.
# row[0] is the DataFrame index, so the CSV columns start at row[1].
test_questions_and_answers = [
    (row[1], row[2], row[3]) for row in test_qa.itertuples()
]

# Display top questions to ensure format is correct
test_questions_and_answers[:4]

[(17574, 'Who is the vice chairman of Samsung?', 'Jay Y. Lee'),
 (17298,
  'Which subway is opening in New York City on Sunday?',
  'Second Avenue subway'),
 (17339, 'What amount did Fox News offer?', '20 Million'),
 (17300, "Who is Mr. Roof's lead lawyer?", 'David I. Bruck')]

## 2. Data Pre-processing

Data pre-processing is done to ensure that the data fed to the model is of high quality. This ensures that the model can efficiently extract the relevant information.

In [9]:
# Code from (Germec, 2023)
# Function to preprocess the text for better results
# Load spaCy English model
# preprocess() below reads this module-level `nlp` pipeline.
nlp = spacy.load("en_core_web_sm")
def preprocess(text):
    """Lower-case ``text``, drop question marks and lemmatize it.

    The cleaned text is run through the module-level spaCy pipeline ``nlp`` and
    the lemma of every token is joined back together with single spaces.

    Args:
        text (str): Raw article or question text.

    Returns:
        str: Space-separated lemmas of the cleaned text.
    """
    cleaned = text.lower().replace("?", "")
    return " ".join(token.lemma_ for token in nlp(cleaned))

# Function to split an article into individual sentences
def split_article(article):
    """Split an article into fragments at every full stop.

    Question marks are removed first and the text is then cut on ".". The
    fragments keep their surrounding whitespace, and text that ends with "."
    yields a trailing empty string.

    Args:
        article (str): Article text.

    Returns:
        list[str]: The fragments between full stops, in order.
    """
    return article.replace("?", "").split(".")

In [10]:
# Testing to ensure preprocess function is working
# `test` (third article of the sampled frame) is reused later as the context
# for the pre-trained model sanity check.
test = rdf.article.iloc[2]
print("Original: ", test[:1000]) # Print first 1000 characters to check
print()
print("Processed: ", preprocess(test)[:1000]) # Print first 1000 characters to check

Original:  One night nearly 140 years ago, Samuel Clemens told his young daughters Clara and Susie a bedtime story about a poor boy who eats a magic flower that gives him the ability to talk to animals. Storytelling was a nightly ritual in the Clemens home. But something about this particular tale must have stuck with Clemens, better known as Mark Twain, because he decided to jot down some notes about it. The story might have ended there, lost to history. But decades later, the scholar John Bird was searching the Twain archives at the University of California, Berkeley, when he came across the notes for the story, which Twain titled ?Oleomargarine. ? Mr. Bird was astonished to find a richly imagined fable, in Twain?s inimitable voice. He and other scholars believe it may be the only written remnant of a children?s fairy tale from Twain, though he told his daughters stories constantly. It?s impossible to know why Twain did not finish the tale, or if he ever intended it for a wider audie

## 3 Selection of Model

Section 3.1 will discuss the hybrid model while section 3.2 will discuss the direct usage of a pre-trained model.

### 3.1 Hybrid Model (Self built functions + Pre-trained Models)

#### 3.1.1 Coreference Resolution Utility

In [11]:
# Code from Workshop
# NOTE(review): the same spaCy model is already loaded in the pre-processing cell;
# this line loads it a second time and rebinds the module-level `nlp`.
nlp = spacy.load("en_core_web_sm")
def resolve_coreferences(sentence):
    """Replace selected pronouns with the nearest preceding noun or named entity.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Each token
    that is tagged as a pronoun and whose text is one of he/him/she/her/it/
    they/them/his is replaced by the closest earlier token tagged NOUN or
    PROPN. When that token is the root of a PERSON, ORG or GPE entity, the
    full entity text is used instead of the single token. No gender or number
    agreement is checked.

    Args:
        sentence (str): Text to resolve.

    Returns:
        str: The text with the matched pronouns substituted; all other tokens
        are copied with their original trailing whitespace.
    """
    doc = nlp(sentence)
    entity_mentions = {}

    # Iterate through named entities and store mentions
    # Maps the text of the entity's root token to the full entity text
    for ent in doc.ents:
        if ent.label_ in ["PERSON", "ORG", "GPE"]:
            entity_mentions[ent.root.text] = ent.text

    # Resolve pronouns to named entity mentions
    resolved_tokens = []
    corefs = {}  # Dictionary to store coreference relations (written below, never read in this function)

    # Iterate over each word in the sentence
    for i, token in enumerate(doc):
        # If the word is a pronoun
        if token.pos_ == 'PRON' and token.text.lower() in ["he", "him", "she", "her", "it", "they", "them","his"]:
            # Iterate over each token before the pronoun
            for j in range(i - 1, -1, -1):
                # If the token is a noun or a proper noun (part of named entity), it is a possible antecedent
                if doc[j].pos_ in ['NOUN', 'PROPN']:
                    # Save the antecedent as the coreference of the pronoun
                    corefs[i] = j
                    resolved_tokens.append(entity_mentions.get(doc[j].text, doc[j].text))
                    # A single space always follows the replacement, whatever
                    # whitespace the pronoun originally had after it
                    resolved_tokens.append(' ')
                    break
            else:
                # for/else: runs only when the backward scan finished without a break
                # If no antecedent found, keep the pronoun as is
                resolved_tokens.append(token.text_with_ws)
        else:
            resolved_tokens.append(token.text_with_ws)

    # Join resolved tokens to form the resolved sentence
    resolved_sentence = ''.join(resolved_tokens)
    return resolved_sentence

# Test Coreference Resolution on a simple sentence to ensure it works
# The pronoun "he" is resolved to "Jessica" because the heuristic picks the
# nearest preceding noun without checking gender.
sentence = "Jessica is sick and he did not go to school."
resolved_sentence = resolve_coreferences(sentence)
print("Original sentence:", sentence)
print("Resolved sentence:", resolved_sentence)

Original sentence: Jessica is sick and he did not go to school.
Resolved sentence: Jessica is sick and Jessica did not go to school.


#### 3.1.2 Extract Entities

In [2]:
# Use a pre-trained model to obtain the most relevant answer from the phrase.
# The appendix includes attempts at self-written entity extraction functions,
# but they did not provide accurate results.
# Code from (Chan et al. 2024)
# NOTE(review): these names are already imported in the first cell; the import is repeated here.
from transformers import AutoTokenizer, AutoModelForQuestionAnswering,pipeline
model_name = "deepset/tinyroberta-squad2"

def extract_most_relevant_entity(question, relevant_sentence):
    """Extract the answer to ``question`` from ``relevant_sentence``.

    Runs a Hugging Face question-answering pipeline built from the
    module-level ``model_name``. The pipeline is created on first use and
    reused afterwards, because loading the model on every call is slow.

    Args:
        question (str): Question to answer.
        relevant_sentence (str): Text used as the context for the answer.

    Returns:
        The pipeline result for the question; in this notebook a dict with the
        keys ``score``, ``start``, ``end`` and ``answer``.
    """
    # One pipeline per model name, stored on the function object itself
    pipelines = extract_most_relevant_entity.__dict__.setdefault('_pipelines', {})
    if model_name not in pipelines:
        pipelines[model_name] = pipeline('question-answering', model=model_name, tokenizer=model_name)
    nlp_extraction = pipelines[model_name]

    # Use the sentence passed in; the previous code read the module-level
    # `sentence` variable and ignored this parameter.
    QA_input = {
        'question': question,
        'context': relevant_sentence}

    return nlp_extraction(QA_input)

# Test extraction of relevant answer on a simple sentence to ensure it works
# The value of the last expression is shown as the cell output.
question = 'Who fell down?'
sentence = "Jack fell down and John helped him."
extract_most_relevant_entity(question, sentence)

{'score': 0.9785180687904358, 'start': 0, 'end': 4, 'answer': 'Jack'}

#### 3.1.3 Text matching utility
- Find most relevant sentence and its confidence score in the article based on the user question

In [12]:
# Code from Workshop
# Text matching utility
# Find most relevant sentence in article through computing similarity score between question and sentence
def find_most_relevant_sentence(user_question, article_sentences):
    """Pick the article sentence that is most similar to the question.

    The sentences and the question are vectorized together with TF-IDF, and
    the question vector is compared with every sentence by cosine similarity.

    Args:
        user_question (str): Question asked by the user.
        article_sentences (list[str]): Candidate sentences of one article.

    Returns:
        tuple: ``(sentence, similarity)`` for the best-scoring sentence.
    """
    # The question goes last so it ends up as the final row of the matrix
    corpus = article_sentences + [user_question]
    matrix = TfidfVectorizer().fit_transform(corpus)
    question_vector, sentence_vectors = matrix[-1], matrix[:-1]

    scores = cosine_similarity(question_vector, sentence_vectors).flatten()
    best_index = scores.argmax()

    return article_sentences[best_index], scores[best_index]

# To apply find_most_relevant_sentence
def get_answer_with_relevant_sentence(sentences, question):
    """Return ``(similarity_score, most_relevant_sentence)`` for a question.

    Thin wrapper around ``find_most_relevant_sentence`` that swaps both the
    argument order and the order of the returned pair.
    """
    # find_most_relevant_sentence returns (sentence, score); reverse the pair
    return find_most_relevant_sentence(question, sentences)[::-1]

#### 3.1.4 Test utility

In [13]:
# Test text matching utility
# Code by self
predicted_answers = []
for testid, question_text, expected_answer in test_questions_and_answers:
    # Look the article up directly instead of scanning every row of df
    matching_articles = df.loc[df['id'] == testid, 'article']
    if matching_articles.empty:
        # A missing id used to silently reuse the sentences of the previous
        # article (or fail with a NameError on the first question)
        raise ValueError(f"No article with id {testid} found in df")

    # Preprocess, resolve pronouns and split the article into sentences
    current_article = preprocess(matching_articles.iloc[-1])
    current_article = resolve_coreferences(current_article)
    sentences = split_article(current_article)

    # Obtain the most relevant sentence
    score, sentence = get_answer_with_relevant_sentence(sentences, question_text)
    print(question_text, expected_answer)

    # Obtain the most relevant entity
    answer = extract_most_relevant_entity(question_text, sentence)
    predicted_answers.append(answer['answer'])
    print(answer)
    print()

Who is the vice chairman of Samsung? Jay Y. Lee
{'score': 0.952017605304718, 'start': 1, 'end': 4, 'answer': 'lee'}

Which subway is opening in New York City on Sunday? Second Avenue subway
{'score': 0.5400621891021729, 'start': 5, 'end': 25, 'answer': 'second avenue subway'}

What amount did Fox News offer? 20 Million
{'score': 0.9534481763839722, 'start': 81, 'end': 93, 'answer': '$ 20 million'}

Who is Mr. Roof's lead lawyer? David I. Bruck
{'score': 0.917976438999176, 'start': 21, 'end': 28, 'answer': 'david i'}

Who is the spokesman? Numan Kurtulmus
{'score': 9.806988998661836e-09, 'start': 307, 'end': 316, 'answer': 'caliphate'}

Where is the gunman from? Kyrgyzstan or elsewhere in Central Asia
{'score': 9.353278151991162e-09, 'start': 153, 'end': 166, 'answer': 'islamic state'}

Where is Megyn Kelly moving to from Fox News? NBC
{'score': 4.120644714333821e-09, 'start': 59, 'end': 67, 'answer': 'fox news'}

What salary was Megyn Kelly offered by the Murdoch family? More than $20 

#### 3.1.5 Evaluation

In [14]:
# Expected answers, in the same order as the test questions
true_answers = [expected for _, _, expected in test_questions_and_answers]
true_answers

['Jay Y. Lee',
 'Second Avenue subway',
 '20 Million',
 'David I. Bruck',
 'Numan Kurtulmus',
 'Kyrgyzstan or elsewhere in Central Asia',
 'NBC',
 'More than $20 million a year',
 '2,800 students',
 "People's Daily",
 '83rd',
 'sharks',
 "Amazon, Apple, Facebook, Microsoft, and Alphabet (Google's parent company)",
 'Engagement ring',
 'San Francisco',
 'Wall Street lawyer',
 'A bright yellow hard hat',
 'Canada']

In [15]:
# Code by self
# Function to compute exact match score
def calculate_exact_match(predicted_answers, true_answers):
    """Compute the exact match (EM) score of the predictions.

    Predictions and true answers are paired by position and compared
    case-insensitively. The number of matches is divided by the number of
    predictions.

    Args:
        predicted_answers (list[str]): Answers produced by the model.
        true_answers (list[str]): Expected answers, in the same order.

    Returns:
        float: Proportion of exact matches; ``0.0`` when there are no
        predictions (previously this raised ``ZeroDivisionError``).
    """
    total = len(predicted_answers)
    if total == 0:
        return 0.0

    # zip stops at the shorter list, so unpaired predictions count as misses
    matches = sum(
        pred.lower() == true.lower()
        for pred, true in zip(predicted_answers, true_answers)
    )
    return matches / total

# Exact match score of the hybrid model, rounded to two decimals for display
em_score = calculate_exact_match(predicted_answers, true_answers)
print("Exact Match (EM) Score:", round(em_score,2))

Exact Match (EM) Score: 0.17


### 3.2 Pre-trained Model

Compared with the hybrid model above, the answers from the pre-trained model make much more sense.

In [16]:
# Code from (Chan et al. 2024)
# NOTE(review): these names are already imported in the first cell; the import is repeated here.
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/tinyroberta-squad2"

# Testing on preprocessed test
# `nlp_qa` stays at module level and is read again by calculate_mrr_and_map.
nlp_qa = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = {
    'question': 'Who are the daughters?',
    'context':  preprocess(test)}
res = nlp_qa(QA_input)
res

{'score': 0.991371750831604,
 'start': 70,
 'end': 85,
 'answer': 'clara and susie'}

In [17]:
# Code from (Chan et al. 2024) 
def answer_questions(test_questions_and_answers, df, k=5, print_output = 'N'):
    """Answer each test question with the pre-trained question-answering model.

    For every question the matching article is looked up by id, preprocessed
    (cleaning, lemmatization, lower-casing) and passed to the model as context.

    Args:
        test_questions_and_answers (list): Tuples of (article id, question,
            expected answer).
        df (DataFrame): Articles, with an ``id`` and an ``article`` column.
        k (int): Number of top answers to retrieve for every question.
        print_output (str): 'Y' prints the question, the expected answer and
            the top-k answers with their scores; any other value prints nothing.

    Returns:
        list: The highest-scoring answer text for every question, in input order.

    Raises:
        ValueError: If no article in ``df`` has the requested id. Previously a
            missing id silently reused the context of the previous question.
    """
    model_name = "deepset/tinyroberta-squad2"
    nlp_qa = pipeline('question-answering', model=model_name, tokenizer=model_name)
    verbose = print_output == 'Y'
    predicted_answers = []

    for entry in test_questions_and_answers:
        testid, qn, expected_ans = entry[0], entry[1], entry[2]

        # Get context: direct lookup instead of scanning every row
        matching_articles = df.loc[df['id'] == testid, 'article']
        if matching_articles.empty:
            raise ValueError(f"No article with id {testid} found in df")

        # Preprocess text (Cleaning/Lemmatization/Lowercasing)
        context = preprocess(matching_articles.iloc[-1])

        if verbose:
            print("Question:", qn)
            print('Expected answer:     ', expected_ans, '\n')

        # Answer the question; `top_k` replaces the deprecated `topk` keyword
        answers = nlp_qa({'question': qn, 'context': context}, top_k=k)
        if isinstance(answers, dict):
            # With a single requested answer the pipeline returns one dict, not a list
            answers = [answers]

        if verbose:
            # Print the top-k answers and their confidence scores
            print(f"Top {k} Answers and Confidence:")
            print("1:", answers[0]['answer'], round(answers[0]['score'], 4))
            print("-------------------------------")
            print("Other Answers:")
            for rank, answer in enumerate(answers[1:], start=2):
                print(f"{rank}:", answer['answer'], round(answer['score'], 4))
            print()

        predicted_answers.append(answers[0]['answer'])
    return predicted_answers  # List of predicted answers
    
# Answer every test question with the pre-trained model (default k=5, no printing).
# This rebinds `predicted_answers`, replacing the hybrid model's predictions.
predicted_answers = answer_questions(test_questions_and_answers, df)



In [18]:
# Exact match score of the pre-trained model (`predicted_answers` now holds its predictions)
em_score = calculate_exact_match(predicted_answers, true_answers)
print("Exact Match (EM) Score:", em_score)

Exact Match (EM) Score: 0.5


## 4 Evaluation of selected model

I chose the pre-trained model because its exact match score is 0.5, compared with 0.17 for the hybrid model. This indicates that the pre-trained model produces much more accurate answers.

### 4.1 MRR and MAP

In [19]:
# Code from ChatGPT 3.5 (OpenAI, 2022)
    # Prompt: Write me a function to obtain MRR and MAP from my model?
    # Relevant edits were made to suit my needs
def calculate_mrr_and_map(test_questions_and_answers, df, top_k=5, qa_pipeline=None):
    """Compute Mean Reciprocal Rank and Mean Average Precision over the test set.

    For every question the model's top-k answers are compared
    case-insensitively with the single expected answer; an answer counts as
    relevant only when it matches exactly.

    Args:
        test_questions_and_answers (list): Tuples of (article id, question,
            expected answer).
        df (DataFrame): Articles, with an ``id`` and an ``article`` column.
        top_k (int): Number of ranked answers to request per question.
        qa_pipeline (callable, optional): Question-answering pipeline to use.
            Defaults to the module-level ``nlp_qa``.

    Returns:
        tuple: ``(mrr, map_score)``, both rounded to two decimals.
        ``(0.0, 0.0)`` when there are no questions.

    Raises:
        ValueError: If no article in ``df`` has the requested id.
    """
    total_questions = len(test_questions_and_answers)
    if total_questions == 0:
        # Avoid dividing by zero when there is nothing to evaluate
        return 0.0, 0.0
    if qa_pipeline is None:
        qa_pipeline = nlp_qa

    total_mrr = 0
    total_map = 0

    for entry in test_questions_and_answers:
        article_id, question_text, ground_truth_answer = entry[0], entry[1], entry[2]

        matching_articles = df.loc[df['id'] == article_id, 'article']
        if matching_articles.empty:
            raise ValueError(f"No article with id {article_id} found in df")

        # Get the top-K answers from the model
        qna_input = {
            'question': question_text,
            'context': preprocess(matching_articles.values[0])
        }
        # `top_k` replaces the deprecated `topk` keyword
        answers = qa_pipeline(qna_input, top_k=top_k)
        if isinstance(answers, dict):
            # With a single requested answer the pipeline returns one dict, not a list
            answers = [answers]

        # 1-based ranks at which the expected answer appears
        target = ground_truth_answer.lower()
        hit_ranks = [
            rank for rank, answer in enumerate(answers, 1)
            if answer['answer'].lower() == target
        ]
        if not hit_ranks:
            continue  # contributes 0 to both totals

        # MRR: reciprocal of the first rank that matches
        total_mrr += 1 / hit_ranks[0]

        # MAP: mean of precision@rank over the ranks that match
        precision_sum = sum(hits / rank for hits, rank in enumerate(hit_ranks, 1))
        total_map += precision_sum / min(top_k, len(hit_ranks))

    mrr = round(total_mrr / total_questions, 2)
    map_score = round(total_map / total_questions, 2)

    return mrr, map_score

# Evaluate the ranked answers of the pre-trained model on all test questions (default top_k=5)
mrr_score, map_score = calculate_mrr_and_map(test_questions_and_answers, df)
print("Mean Reciprocal Rank (MRR):", mrr_score)
print("Mean Average Precision (MAP):", map_score)

Mean Reciprocal Rank (MRR): 0.59
Mean Average Precision (MAP): 0.6


### 4.2 User interaction with the system

In [24]:
def answer_question_with_top_5(question, article_num, df, expected_answer = ''):
    """Print the top 5 model answers for one user question about one article.

    Args:
        question (str): Question asked by the user.
        article_num (int): Id of the article to search for the answer.
        df (DataFrame): Articles, passed through to ``answer_questions``.
        expected_answer (str): Optional reference answer printed next to the
            results. When empty, 'No expected answers' is shown instead.
            Previously a supplied value was always overwritten.

    Returns:
        None. The answers are printed by ``answer_questions``.
    """
    if not expected_answer:
        expected_answer = 'No expected answers'
    # Answer the question using the provided function
    answer_questions([(article_num, question, expected_answer)], df, 5, "Y")

# Ask the user for a question and the id of the article to search.
# input() already returns a str; int() raises ValueError on a non-numeric id.
question = input("Please enter your question:")
article_num = int(input("Please enter your article number:"))

answer_question_with_top_5(question, article_num, df)

Please enter your quesiton: Who is the vice chairman of Samsung?
Please enter your article number: 17574


Question: Who is the vice chairman of Samsung?
Expected answer:      No expected answers 

Top 5 Answers and Confidence:
1: jay y. lee 0.9688
-------------------------------
Other Answers:
2: jay y. lee , 0.024
3: the de facto leader , jay y. lee 0.002
4: lee 0.0009
5: , jay y. lee 0.0007



### 4.3 Print results of 10 test questions

In [23]:
# Print the top 5 answers for the first 10 test questions; the returned list of
# best answers is displayed as the cell output.
answer_questions(test_questions_and_answers[:10], df, 5, 'Y')

Question: Who is the vice chairman of Samsung?
Expected answer:      Jay Y. Lee 

Top 5 Answers and Confidence:
1: jay y. lee 0.9688
-------------------------------
Other Answers:
2: jay y. lee , 0.024
3: the de facto leader , jay y. lee 0.002
4: lee 0.0009
5: , jay y. lee 0.0007

Question: Which subway is opening in New York City on Sunday?
Expected answer:      Second Avenue subway 

Top 5 Answers and Confidence:
1: second avenue subway 0.4899
-------------------------------
Other Answers:
2: second avenue 0.2715
3: the second avenue subway 0.2158
4: second avenue 0.1626
5: the second avenue 0.0716

Question: What amount did Fox News offer?
Expected answer:      20 Million 

Top 5 Answers and Confidence:
1: $ 20 million 0.8697
-------------------------------
Other Answers:
2: $ 20 million 0.8697
3: 20 million 0.1175
4: 20 million 0.1149
5: $ 20 million offer 0.0071

Question: Who is Mr. Roof's lead lawyer?
Expected answer:      David I. Bruck 

Top 5 Answers and Confidence:
1: david 

['jay y. lee',
 'second avenue subway',
 '$ 20 million',
 'david i. bruck',
 'numan kurtulmus',
 'kyrgyzstan',
 'nbc',
 '$ 20 million a year',
 '800',
 'people daily']

## B. References

1. Chan, Branden et al. (Mar. 2024). deepset/tinyroberta-squad2. https://huggingface.co/deepset/tinyroberta-squad2.
2. Germec, M., PhD (2023) Text preprocessing with Natural Language Processing (NLP). https://www.linkedin.com/pulse/text-preprocessing-natural-language-processing-nlp-germec-phd/.
3. OpenAI, 2022, ChatGPT, April 16, 2024, https://chat.openai.com.

## C. Appendix

### C1 Building Question Set

In [124]:
# Define the questions and answers
# Each entry is (article id, question, expected answer); 18 questions are listed so far.
# This list is written to test_questions_and_answers.csv further down in this appendix.
test_questions_and_answers = [
    (17574, "Who is the vice chairman of Samsung?", "Jay Y. Lee"),
    (17298, "Which subway is opening in New York City on Sunday?", "Second Avenue subway"),
    (17339, 'What amount did Fox News offer?', '20 Million'),
    (17300, 'Who is Mr. Roof\'s lead lawyer?', 'David I. Bruck'),
    (17314, 'Who is the spokesman?', 'Numan Kurtulmus'),
    (17314, 'Where is the gunman from?', 'Kyrgyzstan or elsewhere in Central Asia'),
    (17339, 'Where is Megyn Kelly moving to from Fox News?', 'NBC'),
    (17339, 'What salary was Megyn Kelly offered by the Murdoch family?', 'More than $20 million a year'),
    (17341, 'How many students attend the Evergrande Football School?', '2,800 students'),
    (17341, 'What is the main newspaper of the Communist Party in China?', 'People\'s Daily'),
    (17341, 'What rank did the national men\'s soccer team of China recently achieve in FIFA rankings?', '83rd'),
    (17342, 'What is the term used to describe the biggest players in the technology industry?', 'sharks'),
    (17342, 'Who are the "Frightful Five" mentioned in the article?', 'Amazon, Apple, Facebook, Microsoft, and Alphabet (Google\'s parent company)'),
    (17347, "What special item did Mr. Purcell surprise Ms. Bui with?", "Engagement ring"),
    (17347, 'Which city did Mr. Purcell and Ms. Bui get engaged in?', 'San Francisco'),
    (17354, "What is Walter J. Clayton's background?", "Wall Street lawyer"),
    (17360, "What accessory did the author's mother, keep in her car?", "A bright yellow hard hat"),
    (17361, "What country was chosen as the top destination for the 2017 list?", "Canada"),
    # Add more test questions and answers here
]

In [131]:
# Widen the column display so that long article text is not truncated when shown
pd.set_option('display.max_colwidth', 10000)
# The remaining lines are disabled scratch code used to read individual
# articles while writing the questions.
#df[df['id'] == 17346]['article']

#target_id = 17361  # Change this to the desired ID
# Filter the DataFrame to select the article with the matching ID
#selected_article = df[df['id'] == target_id]['article'].values[0]
# Print the selected article
#print(preprocess(selected_article))
#print("Processed: ", preprocess(test)[:1000]) # Print first 1000 characters to check

In [129]:
import csv
# Define the filename
csv_filename = "test_questions_and_answers.csv"

# Write the data to a CSV file
# The handle is named csv_file so that it does not overwrite the module-level
# `file` variable, which holds the news dataset path.
with open(csv_filename, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # Write the header
    writer.writerow(["Article ID", "Question", "Expected Answer"])
    # Write each row of data
    writer.writerows(test_questions_and_answers)

### C2 Extracting Entities Method
- This method was not as effective as the method I used above

In [None]:
# Function to extract named entities from a sentence
def extract_entities(sentence):
    """Return the ``(text, label)`` pair of every named entity found in ``sentence``.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``.
    """
    found = []
    for entity in nlp(sentence).ents:
        found.append((entity.text, entity.label_))
    return found

# Function to extract the correct answer from the sentence
def extract_answer(sentence):
    """Build an answer string from the relevant named entities of ``sentence``.

    Only entities labelled PERSON, ORG, DATE or MONEY are kept; their texts
    are joined with single spaces in the order they were found.
    """
    wanted_labels = ('PERSON', 'ORG', 'DATE', 'MONEY')
    kept_texts = []
    for text, label in extract_entities(sentence):
        if label in wanted_labels:
            kept_texts.append(text)
    return ' '.join(kept_texts)