## Full Code

In [None]:
# Step 1: Import the Required Libraries
import json
import re
import nltk
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import wikipediaapi
import time

# Step 2: Load Data from the news-article.json File
# Read the entire article collection from disk and parse it as JSON.
# The context manager closes the file as soon as its content has been read.
with open('news-article.json', mode='r', encoding='utf-8') as news_file:
    data = json.loads(news_file.read())

# Step 3: Initialize Wikipedia API
# Shared Wikipedia client used for every summary lookup.
# NOTE(review): the User-Agent below looks like a placeholder — replace it
# with a value that identifies this application and a contact address.
wiki_wiki = wikipediaapi.Wikipedia(
    'en',
    headers={
        'User-Agent': 'YourCustomUserAgent/1.0 (your-email@example.com)',
    },
)

# Step 4: Define Pre-processing Function
def preprocess(text):
    """Normalize ``text`` for keyword-based retrieval.

    The text is lower-cased and every run of non-word characters
    (anything not matched by ``\\w``) is replaced by a single space.
    Leading or trailing runs therefore become a leading or trailing space;
    the result is not stripped.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)

# Step 5: Filter and Pre-process Articles
def filter_and_preprocess(data):
    """Keep the articles that mention a target keyword and attach normalized text.

    An article is kept when its 'articleBody' contains 'Israel' or 'Hamas'
    (case-sensitive substring match on the raw body). Each kept article is
    mutated in place: its normalized body is stored under the 'text' key.

    Args:
        data: Iterable of article dicts; the raw text is read from the
            'articleBody' key of each one.

    Returns:
        A list with the kept article dicts (the same objects that were
        passed in, now carrying a 'text' key), in their original order.
    """
    keywords = ('Israel', 'Hamas')
    relevant_articles = []
    for article in tqdm(data, desc="Filtering and Preprocessing Articles"):
        article_body = article.get('articleBody')
        # A missing key, a JSON null or any other non-text value cannot
        # mention a keyword; skip it instead of failing on the `in` test.
        if not isinstance(article_body, str):
            continue
        if any(keyword in article_body for keyword in keywords):
            # preprocess() already lower-cases, so no separate .lower() call.
            article['text'] = preprocess(article_body)
            relevant_articles.append(article)
    return relevant_articles

filtered_articles = filter_and_preprocess(data)
print(f"Filtered down to {len(filtered_articles)} relevant articles.")

# Step 6: Tokenize Articles and Initialize BM25 Model
# Each article's normalized 'text' becomes one token list; the BM25 index is
# built over those lists, so index i in the corpus matches filtered_articles[i].
# NOTE(review): nltk.word_tokenize presumably needs the NLTK tokenizer data
# to be installed locally — confirm for the target environment.
tokenized_corpus = list(
    map(nltk.word_tokenize, (doc['text'] for doc in filtered_articles))
)

bm25 = BM25Okapi(tokenized_corpus)

# Step 7: Retrieve Articles Based on Query
def retrieve_articles(query, num_results=5):
    """Return the articles that score highest for ``query`` under BM25.

    Args:
        query: Free-text search query.
        num_results: Maximum number of articles to return. Values of zero
            or less yield an empty list.

    Returns:
        A list of article dicts taken from ``filtered_articles``, ordered
        from the highest to the lowest BM25 score. It holds fewer than
        ``num_results`` entries when the corpus is smaller than that.
    """
    # Without this guard, `[-0:]` would select every article and a negative
    # count would select an arbitrary tail of the ranking.
    if num_results <= 0:
        return []
    # Normalize the query exactly like the indexed article text so that
    # punctuation and contractions produce the same tokens on both sides.
    tokenized_query = nltk.word_tokenize(preprocess(query))
    scores = bm25.get_scores(tokenized_query)
    # argsort is ascending: take the last `num_results` indices and reverse
    # them to get best-first order.
    top_n_indices = scores.argsort()[-num_results:][::-1]
    return [filtered_articles[i] for i in top_n_indices]

# Step 8: Initializing Question Answering Pipeline
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
# The model and its tokenizer are loaded independently from the same
# checkpoint name and then handed to the question-answering pipeline.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_pipeline = pipeline(
    'question-answering',
    model=model,
    tokenizer=tokenizer,
)

# Step 9: Answer Question Using Articles
def answer_question(question, context):
    """Ask the question-answering pipeline for an answer found in ``context``.

    Args:
        question: The question text.
        context: The passage the answer should be extracted from.

    Returns:
        Whatever ``qa_pipeline`` returns for this question/context pair;
        callers in this file read its 'answer', 'score', 'start' and 'end'
        entries.
    """
    return qa_pipeline({'question': question, 'context': context})

# Step 10: Getting Answers from Articles
def get_answer_from_articles(question, num_results=5):
    """Answer ``question`` from the best-matching articles.

    Retrieves up to ``num_results`` articles for the question, extracts one
    answer from each article's normalized text, and keeps the three answers
    with the highest pipeline score.

    Args:
        question: The question text; also used as the retrieval query.
        num_results: How many articles to retrieve and query.

    Returns:
        A list of at most three dicts, best score first, each with the keys
        'article_title', 'article_source', 'answer', 'score', 'start' and
        'end'. Title and source fall back to 'No Title' / 'No Source' when
        the article does not provide them.
    """
    candidates = []
    for doc in retrieve_articles(question, num_results):
        prediction = answer_question(question, doc['text'])
        record = {
            'article_title': doc.get('title', 'No Title'),
            'article_source': doc.get('source', 'No Source'),
        }
        # Copy the pipeline's fields over in a fixed order.
        for field in ('answer', 'score', 'start', 'end'):
            record[field] = prediction[field]
        candidates.append(record)

    # Stable in-place sort, highest confidence first.
    candidates.sort(key=lambda item: item['score'], reverse=True)
    return candidates[:3]

# Step 11: Wikipedia Summary
def get_wikipedia_summary(question, max_retries=3, timeout=10, max_summary_length=1500):
    """Fetch a truncated summary of the Wikipedia page titled ``question``.

    The lookup is retried only when it raises; a page that simply does not
    exist ends the search immediately, because repeating the same lookup
    cannot change that outcome.

    Args:
        question: Text used verbatim as the Wikipedia page title.
        max_retries: Maximum number of lookup attempts when errors occur.
        timeout: Accepted for backward compatibility; this function does
            not use it.
        max_summary_length: Maximum number of summary characters to keep
            (applied before surrounding whitespace is stripped).

    Returns:
        The truncated, stripped summary followed by two newlines, or an
        empty string when the question is blank, the page does not exist,
        or every attempt failed.
    """
    # A blank title can never name a page; skip the network round trip.
    if not question or not question.strip():
        return ""

    for attempt in range(max_retries):
        try:
            page = wiki_wiki.page(question)
            if page.exists():
                return page.summary[:max_summary_length].strip() + "\n\n"
            # The lookup itself worked but found nothing: do not retry.
            return ""
        except Exception as e:
            # Best-effort lookup: report the problem and try again unless
            # this was the final attempt.
            print(f"Error retrieving Wikipedia page for {question}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)
            else:
                print(f"Failed to retrieve Wikipedia page for {question} after {max_retries} attempts.")
    return ""


# Step 12: User Interaction
# Ask once, then answer from both sources: Wikipedia first, articles second.
question = input("Please enter your question: ")

# The summary is only shown when a non-empty one was retrieved.
wiki_summary = get_wikipedia_summary(question)
if wiki_summary:
    print("Wikipedia Summary:\n", wiki_summary)

answers = get_answer_from_articles(question)
if not answers:
    print("No relevant answers found from articles.")
else:
    print("Top 3 Answers from Articles:")
    # Number the answers from 1 for display.
    for rank, answer in enumerate(answers, start=1):
        print(f"Answer {rank}:")
        print(f"Answer: {answer['answer']}")
        print(f"Article Title: {answer['article_title']}")
        print(f"Article Source: {answer['article_source']}")
        print("=" * 50)


 ## Check the Process.docx file for a detailed explanation