In [22]:
import pandas as pd
import re
# Load the news articles into a DataFrame; the CSV is decoded as Latin-1.
news_df = pd.read_csv(filepath_or_buffer='news_dataset.csv', encoding='latin1')

# Data cleaning and preprocessing
def preprocess_text(text):
    """Normalise raw article text for downstream processing.

    Every non-word character (anything outside ``[A-Za-z0-9_]`` and the
    Unicode equivalents) is replaced by a space, runs of whitespace are
    collapsed to a single space, and the result is lower-cased.

    Args:
        text: The raw text. Non-string values (e.g. the ``NaN`` pandas
            produces for empty CSV cells, or ``None``) are treated as
            missing.

    Returns:
        The cleaned, lower-cased string; ``""`` when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None; re.sub would raise TypeError.
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\W', ' ', text)  # Replace non-word characters with spaces
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    return text.lower()

# Run every raw article through preprocess_text and keep the result in a new column.
raw_articles = news_df['article']
news_df['cleaned_article'] = raw_articles.apply(preprocess_text)

In [23]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Checkpoint shared by the QA model and its tokenizer.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

# The two loads are independent; both resolve the same checkpoint name.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
import torch

def answer_question(question, article, confidence_threshold=0.5):
    """Extract an answer span for ``question`` from ``article``.

    Relies on the module-level ``tokenizer`` and ``model``. The question and
    article are encoded as one pair (truncated by the tokenizer when too
    long), the most likely start and end token positions are taken from the
    model's logits, and the tokens between them are decoded.

    Args:
        question: The question text.
        article: The context text to search for an answer.
        confidence_threshold: Minimum value of the mean of the start and end
            softmax probabilities required to accept the span. Defaults to
            0.5, the value previously hard-coded here.

    Returns:
        The decoded answer string, or ``"No answer found."`` when the span is
        inverted (end before start), decodes to nothing, or falls below the
        confidence threshold.
    """
    no_answer = "No answer found."

    # Encode the question and article as a single pair
    encoded_input = tokenizer(question, article, truncation=True, padding=True, return_tensors="pt")

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        output = model(**encoded_input)

    # Softmax each head once; batch size is 1, so take row 0.
    start_probs = torch.softmax(output.start_logits, dim=1)[0]
    end_probs = torch.softmax(output.end_logits, dim=1)[0]
    answer_start = int(start_probs.argmax())
    answer_end = int(end_probs.argmax())

    # The two heads are independent, so the best end can precede the best
    # start; such a span is empty and must not be reported as an answer.
    if answer_end < answer_start:
        return no_answer

    confidence_score = (start_probs[answer_start].item() + end_probs[answer_end].item()) / 2
    if confidence_score < confidence_threshold:
        return no_answer

    answer_ids = encoded_input["input_ids"][0][answer_start:answer_end + 1]
    # Drop special tokens so markers such as [CLS]/[SEP] never leak into the answer.
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True)
    return answer if answer.strip() else no_answer

In [27]:
# Smoke test: ask a single question against one known article.
question = "Who is the vice chairman of Samsung"
article_id = 17307
# Select the cleaned text of the row whose id matches; take the first hit.
id_matches = news_df['id'] == article_id
article = news_df.loc[id_matches, 'cleaned_article'].iloc[0]
answer = answer_question(question, article)
print(f"Question: {question}")
print(f"Article ID: {article_id}")
print(f"Answer: {answer}")

Question: Who is the vice chairman of Samsung
Article ID: 17307
Answer: No answer found.


In [31]:
# Read test questions and answers.
# Lines are expected to look like: ('<question>', <article id>) ('<answer>', <number>)
# Lines that do not match this shape are ignored.
test_data = []
with open('test_questions_students_contributed.txt', 'r') as file:
    for line in file:
        match = re.search(r"\('(.*?)', (\d+)\) \('(.*?)', \d+\)", line)
        if match:
            question = match.group(1)
            article_id = int(match.group(2))
            answer = match.group(3)
            test_data.append((question, article_id, answer))


def _normalize_answer(text):
    """Apply the same cleaning used on articles, then trim, for fair comparison."""
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()


# Evaluate the QA system
correct_answers = 0
for question, article_id, true_answer in test_data:
    matching_articles = news_df.loc[news_df['id'] == article_id, 'cleaned_article']
    if matching_articles.empty:
        # An unknown article id counts as a miss instead of raising IndexError.
        print(f"Warning: article {article_id} not found in dataset")
        predicted_answer = "No answer found."
    else:
        article = matching_articles.iloc[0]
        predicted_answer = answer_question(question, article)
    # Print question, predicted answer, and true answer
    print(f"Question: {question}")
    print(f"Predicted Answer: {predicted_answer}")
    print(f"True Answer: {true_answer}")
    # Predictions come from punctuation-stripped, lower-cased text, so the gold
    # answer must be normalised the same way (e.g. "George W. Bush").
    if _normalize_answer(predicted_answer) == _normalize_answer(true_answer):
        correct_answers += 1
        print("Result: Correct\n")
    else:
        print("Result: Incorrect\n")

# Guard against an empty or unparseable test file (would divide by zero).
accuracy = correct_answers / len(test_data) if test_data else 0.0
print(f"Accuracy: {accuracy:.2f}")

Question: Who was the President during the conflict?
Predicted Answer: No answer found.
True Answer: George W. Bush
Result: Incorrect

Question: Who is the Senator of Colorado?
Predicted Answer: No answer found.
True Answer: Cory Gardner
Result: Incorrect

Question: What was the revolt?
Predicted Answer: No answer found.
True Answer: Tea Party Revolt
Result: Incorrect

Question: When did they get control back?
Predicted Answer: 2006
True Answer: 2010
Result: Incorrect

Question: Where is the senior Republican from?
Predicted Answer: oklahoma
True Answer: Oklahoma
Result: Correct

Question: Who was the president during the Iraq War?
Predicted Answer: No answer found.
True Answer: George W. Bush
Result: Incorrect

Question: What amount did Fox News offer?
Predicted Answer: No answer found.
True Answer: 20 Million
Result: Incorrect

Question: Where did Charlie Rose interview Kelly?
Predicted Answer: No answer found.
True Answer: CBS Sunday Morning
Result: Incorrect

Question: When did And