In [1]:
import json
import re
from rank_bm25 import BM25Okapi
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [2]:
with open('news.article.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

for article in data:
    if 'articleBody' in article:
        article['content'] = clean_text(article['articleBody']).lower()

keywords = [
    'israel', 'hamas', 'gaza', 'palestine', 'terrorist', 'attack', 'air strikes', 
    'war', 'conflict', 'casualties', 'ceasefire', 'netanyahu', 'idf', 'rockets', 
    'iron dome', 'hezbollah', 'west bank', 'blockade', 'humanitarian', 'tragedy', 
    'disproportionate', 'occupation', 'militants', 'bombing']

israel_hamas_articles = [article for article in data if any(keyword in article.get('content', '') for keyword in keywords)]

print(f"Number of articles related to Israel-Hamas conflict: {len(israel_hamas_articles)}")

Number of articles related to Israel-Hamas conflict: 36347


In [3]:
tokenized_corpus = [doc['content'].split() for doc in israel_hamas_articles]
bm25 = BM25Okapi(tokenized_corpus)

In [4]:
def retrieve_documents(query, bm25, documents, k=5):
    tokenized_query = query.lower().split()
    doc_scores = bm25.get_scores(tokenized_query)
    top_k_indices = doc_scores.argsort()[-k:][::-1]
    return [documents[i] for i in top_k_indices]

In [5]:
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
def answer_question(question, documents, model, tokenizer):
    context = " ".join([doc['content'] for doc in documents])
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model.generate(input_ids)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

## Test Case 1

In [8]:
user_query = input("Please enter your question: ")

relevant_docs = retrieve_documents(user_query, bm25, israel_hamas_articles)

answer = answer_question(user_query, relevant_docs, model, tokenizer)
print(f"Answer: {answer}")

Please enter your question: What happened at the Al-Shifa Hospital?




Answer: a protester lit themselves on fire outside the israeli consulate in atlanta


## Test Case 2

In [9]:
user_query = input("Please enter your question: ")

relevant_docs = retrieve_documents(user_query, bm25, israel_hamas_articles)

answer = answer_question(user_query, relevant_docs, model, tokenizer)
print(f"Answer: {answer}")

Please enter your question: How many people have been killed and wounded in the recent conflict according to Gaza's health ministry?
Answer: hamas health ministry says the death toll in the territory since oct


## Test Case 3

In [10]:
user_query = input("Please enter your question: ")

relevant_docs = retrieve_documents(user_query, bm25, israel_hamas_articles)

answer = answer_question(user_query, relevant_docs, model, tokenizer)
print(f"Answer: {answer}")

Please enter your question: What did President Biden suggest might have motivated Hamas to conduct attacks on Israel?
Answer: israelsaudi arabia normalisation


## Test Case 4

In [11]:
user_query = input("Please enter your question: ")

relevant_docs = retrieve_documents(user_query, bm25, israel_hamas_articles)

answer = answer_question(user_query, relevant_docs, model, tokenizer)
print(f"Answer: {answer}")

Please enter your question: Who was Adam Samer al-Ghoul, and what happened to him in Jenin?
Answer: adam samer alghoul


## Test Case 5

In [12]:
user_query = input("Please enter your question: ")

relevant_docs = retrieve_documents(user_query, bm25, israel_hamas_articles)

answer = answer_question(user_query, relevant_docs, model, tokenizer)
print(f"Answer: {answer}")

Please enter your question: How many trucks were in the aid convoy to northern Gaza?
Answer: israeli military said it had facilitated the passage of the 31 trucks


## Questions

1. What happened at the Al-Shifa Hospital?
2. How many people have been killed and wounded in the recent conflict according to Gaza's health ministry?
3. What did President Biden suggest might have motivated Hamas to conduct attacks on Israel?
4. Who was Adam Samer al-Ghoul, and what happened to him in Jenin?
5. How many trucks were in the aid convoy to northern Gaza?