In [3]:
import json
import re
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the JSON file with specified encoding
file_path = 'news.article.json'

#/TR: Load the JSON file containing news articles
with open(file_path, 'r', encoding='utf-8') as file:
    #/TR: Load the JSON content into the 'articles' variable
    articles = json.load(file)

# Preprocess articles
def clean_article(article):
    #/TR: Remove special characters and noise from the article
    article = re.sub(r'\s+', ', article)  # Remove extra whitespace
    article = re.sub(r'[^\w\s]', '', article)  # Remove punctuation
    article = article.lower()  # Convert to lowercase
    return article

#/TR: Initialize an empty list to store cleaned articles
cleaned_articles = []
for article in tqdm(articles):
    #/TR: Access the article text using the 'articleBody' key
    article_text = article.get('articleBody', '')  
    #/TR: Apply basic filtering for relevant articles
    if 'israel' in article_text.lower() or 'hamas' in article_text.lower():  
        #/TR: Clean the article and add it to the list
        cleaned_articles.append(clean_article(article_text))

# Tokenize the articles for BM25
#/TR: Split each cleaned article into individual words
tokenized_articles = [article.split() for article in cleaned_articles]

# Initialize BM25
#/TR: Create a BM25Okapi instance for ranking articles
bm25 = BM25Okapi(tokenized_articles)

def retrieve_articles(query, k=5):
    #/TR: Tokenize the query and convert to lowercase
    tokenized_query = query.lower().split()
    #/TR: Retrieve the top 'k' relevant articles for the query
    results = bm25.get_top_n(tokenized_query, cleaned_articles, n=k)
    return results

# Load T5 model and tokenizer
model_name = 't5-small'  # You can choose a larger model for better performance
#/TR: Load the pre-trained T5 model for conditional generation
model = T5ForConditionalGeneration.from_pretrained(model_name)
#/TR: Load the pre-trained T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_name)

def answer_question(question, articles):
    #/TR: Combine the articles into a single context string
    context = " ".join(articles)
    #/TR: Create the input text for the T5 model
    input_text = f"question: {question} context: {context}"
    #/TR: Tokenize the input text and prepare it for the T5 model
    input_ids = tokenizer.encode(input_text, return_tensors='pt', truncation=True)
    #/TR: Generate the answer using the T5 model
    outputs = model.generate(input_ids)
    #/TR: Decode the answer and remove special tokens
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

def answer_israel_hamas_question(question):
    #/TR: Retrieve relevant articles for the question
    retrieved_articles = retrieve_articles(question)
    #/TR: Answer the question using the retrieved articles
    answer = answer_question(question, retrieved_articles)
    return answer

# Repeatedly ask for questions and provide answers
while True:
    #/TR: Prompt the user to enter a question or type 'exit' to stop
    user_question = input("Please enter your question (or type 'exit' to stop): ")
    if user_question.lower() == 'exit':
        print("Exiting the question-answering system. Goodbye!")
        break
    #/TR: Answer the user's question
    answer = answer_israel_hamas_question(user_question)
    print("Answer:", answer)

100%|███████████████████████████████████████████████████████████████████████████| 37421/37421 [03:47<00:00, 164.51it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Please enter your question (or type 'exit' to stop):  What is happening in the hospital


Answer: protests took place in parts of tel aviv after israeli military announced that


Please enter your question (or type 'exit' to stop):  Exit


Exiting the question-answering system. Goodbye!
