## Full Code

In [None]:
import json
import re
import nltk
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import wikipediaapi
import time

# Load the news articles; later code iterates over `data` and reads each item with .get().
with open('news-article.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Wikipedia client used by get_wikipedia_summary.
# NOTE(review): the User-Agent below is a placeholder — replace it with real contact details.
wiki_wiki = wikipediaapi.Wikipedia('en', headers={'User-Agent': 'YourCustomUserAgent/1.0 (your-email@example.com)'})

def preprocess(text):
    """Lowercase ``text`` and replace every run of non-word characters with one space."""
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)

def filter_and_preprocess(data):
    """Keep the articles that mention 'Israel' or 'Hamas' and attach normalized text.

    Args:
        data: Iterable of article dicts; the body is read from the
            'articleBody' key.

    Returns:
        A list of the matching article dicts, in input order. Each matching
        dict is updated in place with a 'text' key holding the output of
        ``preprocess`` for its body.
    """
    keywords = ('Israel', 'Hamas')  # matched case-sensitively against the raw body
    relevant_articles = []
    for article in tqdm(data, desc="Filtering and Preprocessing Articles"):
        # `or ''` also covers an explicit null body, which `.get(key, '')`
        # would hand back as None and make the `in` test raise TypeError.
        article_body = article.get('articleBody') or ''
        if any(keyword in article_body for keyword in keywords):
            # preprocess() lowercases on its own, so no extra .lower() here.
            article['text'] = preprocess(article_body)
            relevant_articles.append(article)
    return relevant_articles

# Reduce the loaded data to the articles every later step works on.
filtered_articles = filter_and_preprocess(data)
print(f"Filtered down to {len(filtered_articles)} relevant articles.")

# One token list per article, in the same order as filtered_articles, so a
# position in the BM25 score array maps back to the article at that index.
# NOTE(review): nltk.word_tokenize presumably relies on NLTK tokenizer data being
# installed; nothing visible here downloads it — confirm the environment provides it.
tokenized_corpus = [nltk.word_tokenize(article['text']) for article in filtered_articles]

# BM25 index over the tokenized corpus; queried through bm25.get_scores.
bm25 = BM25Okapi(tokenized_corpus)

def retrieve_articles(query, num_results=5):
    """Return the ``num_results`` articles with the highest BM25 score for ``query``.

    Args:
        query: Free-text search string.
        num_results: Maximum number of articles to return; values <= 0
            yield an empty list.

    Returns:
        A list of article dicts taken from ``filtered_articles``, highest
        score first.
    """
    # Guard first: with num_results == 0 the slice below would be `[-0:]`,
    # which selects every article instead of none.
    if num_results <= 0:
        return []
    # Normalize the query with the same preprocess() applied to the corpus so
    # punctuation-joined terms (e.g. "israel-hamas") split into matching tokens.
    tokenized_query = nltk.word_tokenize(preprocess(query))
    scores = bm25.get_scores(tokenized_query)
    # argsort is ascending: take the tail, then reverse to get best-first.
    top_n_indices = scores.argsort()[-num_results:][::-1]
    return [filtered_articles[i] for i in top_n_indices]

# Checkpoint used for extractive question answering (a SQuAD fine-tune, per its name).
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# Bundle both into the callable used by answer_question.
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)

def answer_question(question, context):
    """Run the question-answering pipeline for ``question`` over ``context`` and return its result."""
    return qa_pipeline({'question': question, 'context': context})

def get_answer_from_articles(question, num_results=5, max_answers=3):
    """Extract an answer from each retrieved article and return the best ones.

    Args:
        question: The user's question; used both to retrieve articles and
            as the question given to the QA pipeline.
        num_results: Number of articles to retrieve and run the pipeline on.
        max_answers: Maximum number of answers to return (default 3, the
            previously hard-coded cut-off). Negative values are treated as 0.

    Returns:
        A list of dicts sorted by 'score', highest first, each with the keys
        'article_title', 'article_source', 'answer', 'score', 'start', 'end'.
        'start' and 'end' are passed through from the pipeline result
        (presumably offsets into the article's normalized 'text' — confirm).
    """
    answers = []
    for article in retrieve_articles(question, num_results):
        # The pipeline reads the normalized 'text', not the raw article body.
        result = answer_question(question, article['text'])
        answers.append({
            'article_title': article.get('title', 'No Title'),
            'article_source': article.get('source', 'No Source'),
            'answer': result['answer'],
            'score': result['score'],
            'start': result['start'],
            'end': result['end'],
        })
    # Stable sort: answers with equal scores keep their retrieval order.
    answers.sort(key=lambda entry: entry['score'], reverse=True)
    return answers[:max(max_answers, 0)]

def get_wikipedia_summary(question, max_retries=3, timeout=10, max_summary_length=1500):
    """Fetch the summary of the Wikipedia page whose title is ``question``.

    Args:
        question: Text used verbatim as the page title.
        max_retries: Number of attempts made when the lookup raises.
        timeout: Unused by this function; kept so existing callers keep working.
        max_summary_length: Maximum number of summary characters kept.

    Returns:
        The truncated, stripped summary followed by a blank line, or "" when
        the title is blank, the page does not exist, or every attempt failed.
    """
    if not question or not question.strip():
        return ""  # nothing to look up
    for attempt in range(max_retries):
        try:
            page = wiki_wiki.page(question)
            if page.exists():
                return page.summary[:max_summary_length].strip() + "\n\n"
            # A missing page is a definitive answer, not a transient failure,
            # so repeating the request would not change the outcome.
            return ""
        except Exception as e:
            # Best-effort lookup: report the problem and retry after a short pause.
            print(f"Error retrieving Wikipedia page for {question}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)
            else:
                print(f"Failed to retrieve Wikipedia page for {question} after {max_retries} attempts.")
    return ""


# Interactive entry point: read a question, show background, then the extracted answers.
question = input("Please enter your question: ")

# Background section is printed only when a summary was found.
wiki_summary = get_wikipedia_summary(question)
if wiki_summary:
    print("Wikipedia Summary:\n", wiki_summary)

answers = get_answer_from_articles(question)
if not answers:
    print("No relevant answers found from articles.")
else:
    print("Top 3 Answers from Articles:")
    for idx, answer in enumerate(answers):
        # One record per answer, closed by a separator line.
        print(f"Answer {idx + 1}:")
        print(f"Answer: {answer['answer']}")
        print(f"Article Title: {answer['article_title']}")
        print(f"Article Source: {answer['article_source']}")
        print("=" * 50)


## Step 1 : Import Required Libraries

In [None]:
import json
import re
import nltk
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import wikipediaapi
import time

Explanation:

- json: Used for reading JSON data.
- re: Provides support for regular expressions, used for text preprocessing.
- nltk: Natural Language Toolkit for various NLP tasks like tokenization.
- tqdm: Progress bar library for visualizing progress during filtering and preprocessing.
- "BM25Okapi" from "rank_bm25": BM25 algorithm for information retrieval.
- "pipeline", "AutoTokenizer", "AutoModelForQuestionAnswering" from "transformers": Essential components for working with Hugging Face's
   Transformers library, including pre-trained models and pipelines.
- wikipediaapi: API for interacting with Wikipedia.
- time: Used for adding delays between retries when fetching Wikipedia data.

## Step 2 : Load Data from news-article.json File

In [None]:
# Load the news articles; later code iterates over `data` and reads each item with .get().
with open('news-article.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

## Step 3 : Initialize Wikipedia API

In [None]:
# Wikipedia client used by get_wikipedia_summary.
# NOTE(review): the User-Agent below is a placeholder — replace it with real contact details.
wiki_wiki = wikipediaapi.Wikipedia('en', headers={'User-Agent': 'YourCustomUserAgent/1.0 (your-email@example.com)'})


## Step 4 : Define Preprocessing Function

In [None]:
def preprocess(text):
    """Lowercase ``text`` and replace every run of non-word characters with one space."""
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)


## Step 5 : Filter and Preprocess Articles

In [None]:
def filter_and_preprocess(data):
    """Keep the articles that mention 'Israel' or 'Hamas' and attach normalized text.

    Args:
        data: Iterable of article dicts; the body is read from the
            'articleBody' key.

    Returns:
        A list of the matching article dicts, in input order. Each matching
        dict is updated in place with a 'text' key holding the output of
        ``preprocess`` for its body.
    """
    keywords = ('Israel', 'Hamas')  # matched case-sensitively against the raw body
    relevant_articles = []
    for article in tqdm(data, desc="Filtering and Preprocessing Articles"):
        # `or ''` also covers an explicit null body, which `.get(key, '')`
        # would hand back as None and make the `in` test raise TypeError.
        article_body = article.get('articleBody') or ''
        if any(keyword in article_body for keyword in keywords):
            # preprocess() lowercases on its own, so no extra .lower() here.
            article['text'] = preprocess(article_body)
            relevant_articles.append(article)
    return relevant_articles

# Run the filter over the loaded data and report how many articles remain.
filtered_articles = filter_and_preprocess(data)
print(f"Filtered down to {len(filtered_articles)} relevant articles.")

Explanation:

- Filters articles based on whether they contain keywords 'Israel' or 'Hamas' in the article body.
- Preprocesses the filtered articles by converting text to lowercase and removing non-word characters.
- Returns a list of relevant articles.

## Step 6 : Tokenize Articles and Initialize BM25 Model

In [None]:
# One token list per article, in the same order as filtered_articles, so a
# position in the BM25 score array maps back to the article at that index.
# NOTE(review): nltk.word_tokenize presumably relies on NLTK tokenizer data being
# installed; nothing visible here downloads it — confirm the environment provides it.
tokenized_corpus = [nltk.word_tokenize(article['text']) for article in filtered_articles]
# BM25 index over the tokenized corpus; queried through bm25.get_scores.
bm25 = BM25Okapi(tokenized_corpus)

## Step 7 : Retrieve Articles Based on Query

In [None]:
def retrieve_articles(query, num_results=5):
    """Return the ``num_results`` articles with the highest BM25 score for ``query``.

    Args:
        query: Free-text search string.
        num_results: Maximum number of articles to return; values <= 0
            yield an empty list.

    Returns:
        A list of article dicts taken from ``filtered_articles``, highest
        score first.
    """
    # Guard first: with num_results == 0 the slice below would be `[-0:]`,
    # which selects every article instead of none.
    if num_results <= 0:
        return []
    # Normalize the query with the same preprocess() applied to the corpus so
    # punctuation-joined terms (e.g. "israel-hamas") split into matching tokens.
    tokenized_query = nltk.word_tokenize(preprocess(query))
    scores = bm25.get_scores(tokenized_query)
    # argsort is ascending: take the tail, then reverse to get best-first.
    top_n_indices = scores.argsort()[-num_results:][::-1]
    return [filtered_articles[i] for i in top_n_indices]


Explanation:

- Takes a query and retrieves top articles related to the query using BM25 scoring. Returns a list of relevant articles from filtered_articles.

## Step 8 : Initializing Question Answering Pipeline

In [None]:
# Checkpoint used for extractive question answering (a SQuAD fine-tune, per its name).
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# Bundle both into the callable used by answer_question.
qa_pipeline = pipeline('question-answering', model=model, tokenizer=tokenizer)

Explanation:

- Specifies a fine-tuned BERT model (bert-large-uncased-whole-word-masking-finetuned-squad) for question answering using AutoTokenizer and AutoModelForQuestionAnswering from Transformers.

## Step 9 : Answer Question Using Articles

In [None]:
def answer_question(question, context):
    """Run the question-answering pipeline for ``question`` over ``context`` and return its result."""
    return qa_pipeline({'question': question, 'context': context})


## Step 10 : Getting Answers from Articles

In [None]:
def get_answer_from_articles(question, num_results=5, max_answers=3):
    """Extract an answer from each retrieved article and return the best ones.

    Args:
        question: The user's question; used both to retrieve articles and
            as the question given to the QA pipeline.
        num_results: Number of articles to retrieve and run the pipeline on.
        max_answers: Maximum number of answers to return (default 3, the
            previously hard-coded cut-off). Negative values are treated as 0.

    Returns:
        A list of dicts sorted by 'score', highest first, each with the keys
        'article_title', 'article_source', 'answer', 'score', 'start', 'end'.
        'start' and 'end' are passed through from the pipeline result
        (presumably offsets into the article's normalized 'text' — confirm).
    """
    answers = []
    for article in retrieve_articles(question, num_results):
        # The pipeline reads the normalized 'text', not the raw article body.
        result = answer_question(question, article['text'])
        answers.append({
            'article_title': article.get('title', 'No Title'),
            'article_source': article.get('source', 'No Source'),
            'answer': result['answer'],
            'score': result['score'],
            'start': result['start'],
            'end': result['end'],
        })
    # Stable sort: answers with equal scores keep their retrieval order.
    answers.sort(key=lambda entry: entry['score'], reverse=True)
    return answers[:max(max_answers, 0)]


Explanation:

- Retrieves top articles related to a given question using retrieve_articles.
- For each article, applies answer_question to extract the best answer and stores relevant information (article_title, article_source, answer, score, start, end) in answers.
- Sorts answers based on the answer's score in descending order and returns the top 3 answers.


## Step 11 : Wikipedia Summary

In [None]:
def get_wikipedia_summary(question, max_retries=3, timeout=10, max_summary_length=1500):
    """Fetch the summary of the Wikipedia page whose title is ``question``.

    Args:
        question: Text used verbatim as the page title.
        max_retries: Number of attempts made when the lookup raises.
        timeout: Unused by this function; kept so existing callers keep working.
        max_summary_length: Maximum number of summary characters kept.

    Returns:
        The truncated, stripped summary followed by a blank line, or "" when
        the title is blank, the page does not exist, or every attempt failed.
    """
    if not question or not question.strip():
        return ""  # nothing to look up
    for attempt in range(max_retries):
        try:
            page = wiki_wiki.page(question)
            if page.exists():
                return page.summary[:max_summary_length].strip() + "\n\n"
            # A missing page is a definitive answer, not a transient failure,
            # so repeating the request would not change the outcome.
            return ""
        except Exception as e:
            # Best-effort lookup: report the problem and retry after a short pause.
            print(f"Error retrieving Wikipedia page for {question}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)
            else:
                print(f"Failed to retrieve Wikipedia page for {question} after {max_retries} attempts.")
    return ""

Explanation:

- Takes a question, attempts to fetch the corresponding Wikipedia page (page) using wiki_wiki.
- If the page exists, retrieves the summary and limits it to max_summary_length characters.
- Handles exceptions and retries (max_retries) with a delay (time.sleep(2)) if necessary.

## Step 12 : User Interaction

In [None]:
# Interactive entry point: read a question, show background, then the extracted answers.
question = input("Please enter your question: ")

# Background section is printed only when a summary was found.
wiki_summary = get_wikipedia_summary(question)
if wiki_summary:
    print("Wikipedia Summary:\n", wiki_summary)

answers = get_answer_from_articles(question)
if not answers:
    print("No relevant answers found from articles.")
else:
    print("Top 3 Answers from Articles:")
    for idx, answer in enumerate(answers):
        # One record per answer, closed by a separator line.
        print(f"Answer {idx + 1}:")
        print(f"Answer: {answer['answer']}")
        print(f"Article Title: {answer['article_title']}")
        print(f"Article Source: {answer['article_source']}")
        print("=" * 50)


Explanation:

- Prompts the user to input a question.
- Retrieves and prints a Wikipedia summary based on the question.
- Retrieves and prints the top 3 answers from the filtered articles based on the question.