#Akash Approach using roberta
#Q.1) Create a rudimentary Question Answering system that can answer questions related to the Israel Hamas war.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import json

with open('/content/drive/MyDrive/ML_Drooid/news.article.json', 'r') as f:
    data = json.load(f)


In [4]:
import re

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    return text.strip()

for article in data:
    if 'content' in article:
        article['content'] = clean_text(article['content'])


In [5]:
keywords = ['Israel', 'Hamas', 'Gaza', 'war', 'conflict']

def is_relevant(article):
    if 'content' in article:
        return any(keyword.lower() in article['content'].lower() for keyword in keywords)
    return False

filtered_articles = [article for article in data if is_relevant(article)]


In [6]:
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [7]:
def answer_question(question, articles):
    answers = []
    for article in articles:
        answer = qa_pipeline(question=question, context=article['content'])
        answers.append((answer['answer'], answer['score'], article['source']))

    # Sort answers by score
    answers = sorted(answers, key=lambda x: x[1], reverse=True)
    return answers[0]  # Return the best answer


In [8]:
!pip install wikipedia-api


Collecting wikipedia-api
  Downloading Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.6.0


In [9]:
import wikipediaapi

# Define a user agent string
user_agent = "YourAppName/1.0 (https://yourwebsite.com; your-email@example.com)"

# Initialize the Wikipedia object with the user agent
headers = {
    'User-Agent': user_agent
}
wiki_wiki = wikipediaapi.Wikipedia('en', headers=headers)

def fetch_wikipedia_summary(topic):
    page = wiki_wiki.page(topic)
    if page.exists():
        return page.summary
    return None

# Fetch additional information
additional_info = fetch_wikipedia_summary('2023 Israel-Hamas war')

print(additional_info)


An armed conflict between Israel and Hamas-led Palestinian militant groups has been taking place chiefly in the Gaza Strip since 7 October 2023. Clashes have also occurred in the Israeli-occupied West Bank and with Hezbollah along the Israel–Lebanon–Golan Heights border. The fifth war of the Gaza–Israel conflict since 2008, it is part of the broader Israeli–Palestinian conflict, and the most significant military engagement in the region since the Yom Kippur War 50 years earlier. The war is the deadliest conflict for the Palestinians in the entirety of the Arab-Israeli conflict.
The war began when Hamas-led militant groups launched a surprise attack on Israel on 7 October. An estimated 3,000 militants breached the Gaza–Israel barrier and attacked Israeli civilian communities and military bases. Several thousand rockets were concurrently launched into Israel. During this attack, 1,139 Israelis and foreign nationals were killed, including 766 civilians and 373 security personnel. In addit

In [10]:
def answer_question_with_augmented_info(question, articles, additional_info):
    # Combine articles content with additional information
    combined_articles = articles + [{'content': additional_info, 'source': 'Wikipedia'}]
    return answer_question(question, combined_articles)


In [11]:
question = "What happened at the Al-Shifa Hospital?"
best_answer = answer_question_with_augmented_info(question, filtered_articles, additional_info)
print(f"Answer: {best_answer[0]} (Source: {best_answer[2]})")


Answer: Large protests (Source: Wikipedia)


In [12]:
import json
import re
from transformers import pipeline
import wikipediaapi

# Define a user agent string for Wikipedia API
user_agent = "YourAppName/1.0 (https://yourwebsite.com; your-email@example.com)"
headers = {'User-Agent': user_agent}

# Initialize the Wikipedia object
wiki_wiki = wikipediaapi.Wikipedia('en', headers=headers)

# Function to clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    return text.strip()

# Function to check if the article is relevant
keywords = ['Israel', 'Hamas', 'Gaza', 'war', 'conflict']

def is_relevant(article):
    if 'content' in article:
        return any(keyword.lower() in article['content'].lower() for keyword in keywords)
    return False

# Function to fetch Wikipedia summary
def fetch_wikipedia_summary(topic):
    page = wiki_wiki.page(topic)
    if page.exists():
        return page.summary
    return None

# Initialize the QA pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Function to answer the question
def answer_question(question, articles, additional_info):
    combined_articles = articles + [{'content': additional_info, 'source': 'Wikipedia'}]
    answers = []
    for article in combined_articles:
        answer = qa_pipeline(question=question, context=article['content'])
        answers.append((answer['answer'], answer['score'], article['source']))

    # Sort answers by score
    answers = sorted(answers, key=lambda x: x[1], reverse=True)
    return answers[0]  # Return the best answer

# Main function to handle user input and provide the best answer
def main():
    # Load the dataset
    with open('/content/drive/MyDrive/ML_Drooid/news.article.json', 'r') as f:
        data = json.load(f)

    # Clean the data
    for article in data:
        if 'content' in article:
            article['content'] = clean_text(article['content'])

    # Filter relevant articles
    filtered_articles = [article for article in data if is_relevant(article)]

    # Get additional information from Wikipedia
    additional_info = fetch_wikipedia_summary('2023 Israel-Hamas war')

    while True:
        question = input("Enter your question (or type 'exit' to quit): ")
        if question.lower() == 'exit':
            break
        best_answer = answer_question(question, filtered_articles, additional_info)
        print(f"Answer: {best_answer[0]} (Source: {best_answer[2]})")

# Run the main function
if __name__ == "__main__":
    main()


Enter your question (or type 'exit' to quit): what happened at the al-shifa hospital?
Answer: releasing the hostages (Source: Wikipedia)
Enter your question (or type 'exit' to quit): when did hamas war started ?
Answer: 27 October (Source: Wikipedia)
Enter your question (or type 'exit' to quit): Why did Hamas attack Israel in 2023?
Answer: continued Israeli occupation of the Palestinian territories (Source: Wikipedia)
Enter your question (or type 'exit' to quit): Why did Hamas invade Israel?
Answer: response to the continued Israeli occupation of the Palestinian territories (Source: Wikipedia)
Enter your question (or type 'exit' to quit): Is Hamas still fighting?
Answer: An armed conflict between Israel and Hamas-led Palestinian militant groups (Source: Wikipedia)
Enter your question (or type 'exit' to quit): What religion is Israel?
Answer: Islamic (Source: Wikipedia)
Enter your question (or type 'exit' to quit): How big is Gaza?
Answer: 2.3 million (Source: Wikipedia)
Enter your ques

In [13]:
import json
import re
from transformers import pipeline
import wikipediaapi

# Define a user agent string for Wikipedia API
user_agent = "YourAppName/1.0 (https://yourwebsite.com; your-email@example.com)"
headers = {'User-Agent': user_agent}

# Initialize the Wikipedia object
wiki_wiki = wikipediaapi.Wikipedia('en', headers=headers)

# Function to clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    return text.strip()

# Function to check if the article is relevant
keywords = ['Israel', 'Hamas', 'Gaza', 'war', 'conflict']

def is_relevant(article):
    if 'content' in article:
        return any(keyword.lower() in article['content'].lower() for keyword in keywords)
    return False

# Function to fetch Wikipedia summary
def fetch_wikipedia_summary(topic):
    page = wiki_wiki.page(topic)
    if page.exists():
        return page.summary
    return None

# Initialize the QA pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Function to answer the question
def answer_question(question, articles, additional_info):
    combined_articles = articles + [{'content': additional_info, 'source': 'Wikipedia'}]
    answers = []
    for article in combined_articles:
        chunks = [article['content'][i:i+500] for i in range(0, len(article['content']), 500)]  # Chunking large articles
        for chunk in chunks:
            answer = qa_pipeline(question=question, context=chunk)
            answers.append((answer['answer'], answer['score'], article['source']))

    # Sort answers by score
    answers = sorted(answers, key=lambda x: x[1], reverse=True)
    return answers[0]  # Return the best answer

# Main function to handle user input and provide the best answer
def main():
    # Load the dataset
    with open('/content/drive/MyDrive/ML_Drooid/news.article.json', 'r') as f:
        data = json.load(f)

    # Clean the data
    for article in data:
        if 'content' in article:
            article['content'] = clean_text(article['content'])

    # Filter relevant articles
    filtered_articles = [article for article in data if is_relevant(article)]

    # Get additional information from Wikipedia
    additional_info = fetch_wikipedia_summary('2023 Israel-Hamas war')

    while True:
        question = input("Enter your question (or type 'exit' to quit): ")
        if question.lower() == 'exit':
            break
        best_answer = answer_question(question, filtered_articles, additional_info)
        print(f"Answer: {best_answer[0]} (Source: {best_answer[2]})")

# Run the main function
if __name__ == "__main__":
    main()


Enter your question (or type 'exit' to quit): when did hamas war started ?
Answer: 7 October (Source: Wikipedia)
Enter your question (or type 'exit' to quit): What is Israel's goal in Gaza?
Answer: destroying Hamas and releasing the hostages (Source: Wikipedia)
Enter your question (or type 'exit' to quit): Who controls Gaza?
Answer: Israeli (Source: Wikipedia)
Enter your question (or type 'exit' to quit): Which countries support Israel?
Answer: Western allies (Source: Wikipedia)
Enter your question (or type 'exit' to quit): Who are the enemies of Hamas?
Answer: Palestinian militant groups (Source: Wikipedia)
Enter your question (or type 'exit' to quit): Is Hamas still fighting?
Answer: its attack was in response to the continued Israeli occupation of the Palestinian territories (Source: Wikipedia)
Enter your question (or type 'exit' to quit):  What religion is Israel?
Answer: Islamic (Source: Wikipedia)
Enter your question (or type 'exit' to quit):  Why did Hamas attack Israel in 2023?