# In [1]:
import json
import re
from transformers import pipeline
import wikipediaapi

# Read the news-article JSON file. The encoding is given explicitly as UTF-8
# so decoding does not depend on the platform's default encoding.
with open('C:\\Users\\hp\\Desktop\\rishu_assignment\\news.article.json', 'r', encoding='utf-8') as file:
    data = json.loads(file.read())

def preprocess_text(text):
    """Strip punctuation from *text* and normalize its whitespace.

    Punctuation is removed first and whitespace is collapsed afterwards, so
    a stand-alone symbol (e.g. the dash in ``"a - b"``) cannot leave a
    double space behind. Leading and trailing whitespace is dropped too.

    Args:
        text: The raw string to clean.

    Returns:
        The text with every character that is neither a word character
        nor whitespace removed, each whitespace run collapsed to a single
        space, and no surrounding whitespace.
    """
    # Remove special characters (anything that is not \w or \s).
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace runs, including the ones created just above.
    return re.sub(r'\s+', ' ', text).strip()

# Attach a cleaned copy of the body text to every article record
for article in data:
    article.update(cleaned_text=preprocess_text(article['articleBody']))

# Filter relevant articles based on specific keywords
_RELEVANCE_KEYWORDS = (
    "Al-Shifa Hospital",
    "Gaza hospital",
    "Israel airstrike",
    "Hamas attack",
    "Gaza conflict",
)

# 'cleaned_text' has had its punctuation stripped, so a keyword that still
# contains punctuation (the hyphen in "Al-Shifa") could never match. Apply the
# same punctuation removal to the keywords once, up front, and lowercase them.
_NORMALIZED_RELEVANCE_KEYWORDS = tuple(
    re.sub(r'[^\w\s]', '', keyword).lower() for keyword in _RELEVANCE_KEYWORDS
)


def is_relevant_article(article):
    """Tell whether an article mentions any of the tracked keywords.

    The comparison is a case-insensitive substring search against the
    article's punctuation-free 'cleaned_text'.

    Args:
        article: Mapping with a 'cleaned_text' string.

    Returns:
        True if at least one normalized keyword occurs in the text,
        otherwise False.

    Raises:
        KeyError: If the article has no 'cleaned_text' entry.
    """
    # Lowercase the text once instead of once per keyword.
    haystack = article['cleaned_text'].lower()
    return any(keyword in haystack for keyword in _NORMALIZED_RELEVANCE_KEYWORDS)

# Keep only the articles that mention one of the tracked keywords
relevant_articles = list(filter(is_relevant_article, data))

# Instantiate the pre-trained question-answering pipeline
qa_pipeline = pipeline(task="question-answering", model="distilbert-base-uncased-distilled-squad")

# Function to get additional context from Wikipedia with a specified user-agent
def get_wikipedia_context(topic):
    """Return the summary of the Wikipedia page for *topic*.

    Args:
        topic: Page name handed to ``Wikipedia.page``.

    Returns:
        The page summary, or an empty string when *topic* is empty/None or
        the page does not exist.
    """
    # Nothing to look up: skip the client setup and the lookup entirely.
    if not topic:
        return ""

    user_agent = 'MyUserAgent/1.0 (myemail@example.com)'  # Replace with your own details
    wiki_wiki = wikipediaapi.Wikipedia('en', headers={'User-Agent': user_agent})
    page = wiki_wiki.page(topic)
    # A missing page contributes no extra context.
    return page.summary if page.exists() else ""

# Function to get combined context
def get_combined_context(relevant_articles, wiki_topic, max_length=5000):
    """Build the QA context from article text plus a Wikipedia summary.

    Cleaned article texts are appended in order, each followed by a space,
    for as long as they fit into ``max_length`` characters. The first
    article that does not fit is truncated to the remaining budget instead
    of being dropped, so a single long leading article no longer produces
    an empty article context. The Wikipedia summary for ``wiki_topic`` is
    appended afterwards and is not counted against ``max_length``.

    Args:
        relevant_articles: Iterable of mappings with a 'cleaned_text' string.
        wiki_topic: Topic forwarded to ``get_wikipedia_context``.
        max_length: Character budget for the article part of the context.

    Returns:
        The article context, a space, and the Wikipedia context, joined
        into one string.
    """
    chunks = []
    used = 0  # characters consumed so far, separator spaces included
    for article in relevant_articles:
        text = article['cleaned_text']
        remaining = max_length - used
        if len(text) > remaining:
            # Keep whatever still fits, then stop: the budget is exhausted.
            if remaining > 0:
                chunks.append(text[:remaining])
            break
        chunks.append(text)
        used += len(text) + 1

    # Collect the pieces and join once rather than growing a string in the loop.
    context = "".join(chunk + " " for chunk in chunks)

    wiki_context = get_wikipedia_context(wiki_topic)
    combined_context = context + " " + wiki_context

    # Debugging: Print combined context
    print("Combined Context: ", combined_context[:1000])  # Print first 1000 characters to check content

    return combined_context

# Function to get answer to a question
def get_answer(question, relevant_articles, wiki_topic):
    """Answer *question* from the articles and the Wikipedia topic.

    The combined context is assembled by ``get_combined_context`` and passed
    with the question to ``qa_pipeline``.

    Args:
        question: The question text.
        relevant_articles: Articles forwarded to ``get_combined_context``.
        wiki_topic: Wikipedia topic forwarded to ``get_combined_context``.

    Returns:
        The 'answer' entry of the pipeline result.
    """
    return qa_pipeline(
        question=question,
        context=get_combined_context(relevant_articles, wiki_topic),
    )['answer']

# Example question
question = "What happened at the Al-Shifa Hospital?"
# get_wikipedia_context() passes this value to Wikipedia.page(), which looks a
# page up by its title rather than running a search. Use the article's title;
# a free-form phrase resolves to a missing page and an empty Wikipedia context.
wiki_topic = "Al-Shifa Hospital"
answer = get_answer(question, relevant_articles, wiki_topic)
print("Answer: ", answer)
