In [1]:
import json
import re
from rank_bm25 import BM25Okapi
from transformers import pipeline
import torch

In [2]:
with open('news.article.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

for article in data:
    if 'articleBody' in article:
        article['content'] = clean_text(article['articleBody']).lower()

keywords = [
    'israel', 'hamas', 'gaza', 'palestine', 'terrorist', 'attack', 'air strikes', 
    'war', 'conflict', 'casualties', 'ceasefire', 'netanyahu', 'idf', 'rockets', 
    'iron dome', 'hezbollah', 'west bank', 'blockade', 'humanitarian', 'tragedy', 
    'disproportionate', 'occupation', 'militants', 'bombing']

israel_hamas_articles = [article for article in data if any(keyword in article.get('content', '') for keyword in keywords)]
print(f"Number of articles related to Israel-Hamas conflict: {len(israel_hamas_articles)}")

Number of articles related to Israel-Hamas conflict: 36347


In [3]:
tokenized_corpus = [doc['content'].split() for doc in israel_hamas_articles]
bm25 = BM25Okapi(tokenized_corpus)

In [4]:
from transformers import BertForQuestionAnswering, BertTokenizer
model_name = "bert-base-uncased"
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_out

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

In [5]:
def answer_question(question, documents, tokenizer, model):
    max_len = tokenizer.model_max_length  # usually 512 for DistilBERT
    best_answer = ""
    best_score = float('-inf')
    
    for doc in documents:
        context = doc['content']
        # Split the context into smaller chunks
        inputs = tokenizer(question, context, return_tensors="pt", truncation="only_second", max_length=max_len, stride=50, return_overflowing_tokens=True)
        
        for i in range(inputs.input_ids.shape[0]):
            input_ids = inputs.input_ids[i]
            attention_mask = inputs.attention_mask[i]
            
            # Run the model on each chunk
            outputs = model(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            answer_start = torch.argmax(start_logits)
            answer_end = torch.argmax(end_logits) + 1
            score = start_logits[0, answer_start].item() + end_logits[0, answer_end-1].item()
            answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
            
            if score > best_score:
                best_score = score
                best_answer = answer
                
    return best_answer

In [6]:
def retrieve_documents(query, bm25, documents, k=5):
    tokenized_query = query.lower().split()
    doc_scores = bm25.get_scores(tokenized_query)
    top_k_indices = doc_scores.argsort()[-k:][::-1]
    return [documents[i] for i in top_k_indices]

## Test Case 1

In [7]:
# Retrieve documents using BM25
user_query = input("Please enter your question: ")
relevant_docs = retrieve_documents(user_query, bm25, israel_hamas_articles)

# Answer the question using the retrieved documents
answer = answer_question(user_query, relevant_docs, tokenizer, model)
print(f"Answer: {answer}")

Please enter your question: What happened at the Al-Shifa Hospital?
Answer: confirm if it was wrapped around the person when they ignited themselves the protester is in critical condition at a nearby hospital and the atlanta


## Test Case 2

In [8]:
# Retrieve documents using BM25
user_query = input("Please enter your question: ")
relevant_docs = retrieve_documents(user_query, bm25, israel_hamas_articles)

# Answer the question using the retrieved documents
answer = answer_question(user_query, relevant_docs, tokenizer, model)
print(f"Answer: {answer}")

Please enter your question: How many people have been killed and wounded in the recent conflict according to Gaza's health ministry?
Answer: 15000 israel hamas war the ceasefire has ended and israel has now launched airstrikes on gaza killing several people the death toll in palestine has risen above fifteen thousand by ananya srivastava advertisement israel hamas war representative image new delhi israel and palestine are two nations that have been against each other in terms of diplomatic relations and the israel palestine conflict has been going on for years now on october 6 2023 the palestinian islamist organisation hamas fired 5000 rockets from the gaza strip on israel and since then the conflict turned into a fullfledged war the israel hamas war has now being going on for almost two months killing thousands of people on both sides and so many have been kidnapped held hostages a truce between the two nations was declared recently and this temporary ceasefire lasted for close to a 

## Test Case 3

In [9]:
# Retrieve documents using BM25
user_query = input("Please enter your question: ")
relevant_docs = retrieve_documents(user_query, bm25, israel_hamas_articles)

# Answer the question using the retrieved documents
answer = answer_question(user_query, relevant_docs, tokenizer, model)
print(f"Answer: {answer}")

Please enter your question: What did President Biden suggest might have motivated Hamas to conduct attacks on Israel?
Answer: motivated hamas to conduct those attacks white house national security council nsc spokesman john kirby was responding to a question on president biden hinting that one of the reasons behind hamas october 7 attack on israel was the recent announcement of the indiamiddle easteurope economic corridor that integrates the entire region with a network of rail road and ports kirby


## Test Case 4

In [10]:
# Retrieve documents using BM25
user_query = input("Please enter your question: ")
relevant_docs = retrieve_documents(user_query, bm25, israel_hamas_articles)

# Answer the question using the retrieved documents
answer = answer_question(user_query, relevant_docs, tokenizer, model)
print(f"Answer: {answer}")

Please enter your question: Who was Adam Samer al-Ghoul, and what happened to him in Jenin?
Answer: believe he was a doctor so they kept him away from his son but suleiman knew instantly from the first sight of basil i knew that he was a martyr praise be to god basil and adam young boys playing in jenin were shot dead by israeli soldiers during the jenin raid in which two adults were also killed a video that captures the boys being shot has since gone viral the israeli army arrested 15 others from the refugee camp which has been a central focus of battles between them and palestinian resistance fighters the boys were among more than 260 palestinians in the occupied west bank who have been killed by israeli forces or settlers since the hamas attack on southern israel on october 7 israeli


## Test Case 4

In [11]:
# Retrieve documents using BM25
user_query = input("Please enter your question: ")
relevant_docs = retrieve_documents(user_query, bm25, israel_hamas_articles)

# Answer the question using the retrieved documents
answer = answer_question(user_query, relevant_docs, tokenizer, model)
print(f"Answer: {answer}")

Please enter your question: How many trucks were in the aid convoy to northern Gaza?
Answer: 150 according to the gaza strips health ministry which is managed by hamas the investigation discovered that while civilians were waiting for the relief vehicles armed palestinians opened fire one hour before the convoy reached an idfestablished corridor image courtesy afp the israeli army denied accusations by the health ministry of the hamasruled enclave that israeli soldiers were to blame saying on friday that armed palestinians opened fire on people in northern gaza who were waiting for humanitarian relief armed palestinians opened fire while gazan civilians were awaiting the arrival of the aid convoy in gaza city on thursday and then continued to shoot as the crowd of gazans began looting the


## Questions

1. What happened at the Al-Shifa Hospital?
2. How many people have been killed and wounded in the recent conflict according to Gaza's health ministry?
3. What did President Biden suggest might have motivated Hamas to conduct attacks on Israel?
4. Who was Adam Samer al-Ghoul, and what happened to him in Jenin?
5. How many trucks were in the aid convoy to northern Gaza?