In [None]:
!pip install gdown transformers tabulate

import gdown
import json
import re
from collections import defaultdict
import datetime
import matplotlib.pyplot as plt
from transformers import pipeline, BartForConditionalGeneration, BartTokenizer
from tabulate import tabulate

# Fetch the article dump from Google Drive and store it under a relative
# path, i.e. next to the notebook's working directory.
file_url = "https://drive.google.com/uc?export=download&id=1q6KVw4LD_rnXKVViVkBpdTyedHdQvtkk"
output = 'news_articles.json'
gdown.download(url=file_url, output=output, quiet=False)

In [None]:
# (tokenizer, model) pairs keyed by checkpoint name, so the large BART weights
# are loaded once per kernel session rather than on every summarize call.
_SUMMARIZER_CACHE = {}


def _load_summarizer(model_name):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _SUMMARIZER_CACHE:
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name)
        _SUMMARIZER_CACHE[model_name] = (tokenizer, model)
    return _SUMMARIZER_CACHE[model_name]


def summarize_titles(titles, max_length=20):
    """Summarize each input string with the ``facebook/bart-large-cnn`` model.

    Args:
        titles: List of strings to summarize. Each string is cut to its first
            512 characters before tokenization.
        max_length: Maximum length passed to ``model.generate`` for every
            generated summary.

    Returns:
        A list with one decoded summary string per input, in input order.
        An empty input list yields an empty list without loading the model.
    """
    # Guard first: the previous chunk size of min(5, len(titles)) became 0 for
    # an empty list and made range() raise ValueError.
    if not titles:
        return []

    tokenizer, model = _load_summarizer("facebook/bart-large-cnn")

    summaries = []
    chunk_size = 5
    for start in range(0, len(titles), chunk_size):
        title_chunk = [title[:512] for title in titles[start:start + chunk_size]]
        # Calling the tokenizer directly replaces the deprecated
        # prepare_seq2seq_batch helper.
        inputs = tokenizer(title_chunk, truncation=True, padding='longest', return_tensors="pt")
        summary_ids = model.generate(
            inputs['input_ids'],
            # Shorter titles in a chunk are padded; the mask keeps the model
            # from attending to those pad tokens.
            attention_mask=inputs['attention_mask'],
            num_beams=4,
            max_length=max_length,
            length_penalty=2.0,
            early_stopping=True,
        )
        summaries.extend(
            tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for g in summary_ids
        )

    return summaries
# Named-entity-recognition pipeline used by the event-extraction step.
# aggregation_strategy="simple" is forwarded unchanged to the pipeline.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
nlp = pipeline(
    task="ner",
    model=model_name,
    aggregation_strategy="simple",
)

In [None]:
# Read the downloaded article dump. The encoding is pinned to UTF-8 (the
# standard JSON encoding) instead of relying on the platform's locale default,
# which can mis-decode or reject non-ASCII article text.
with open('news_articles.json', encoding='utf-8') as f:
    articles = json.load(f)
print(f"Loaded {len(articles)} articles.")

In [None]:
# Matches every character that is not an ASCII letter, a digit, or whitespace.
_NON_ALNUM_OR_SPACE = re.compile(r'[^A-Za-z0-9\s]')


def clean_text(text):
    """Return ``text`` stripped to ASCII letters, digits and whitespace, lower-cased.

    Disallowed characters are deleted (not replaced by a space), so words
    joined by punctuation are fused together. Stripping happens before
    lower-casing.
    """
    return _NON_ALNUM_OR_SPACE.sub('', text).lower()

# Store a normalized copy of every article body under 'cleaned_text',
# leaving the original 'articleBody' untouched.
for article in articles:
    article.update(cleaned_text=clean_text(article['articleBody']))

print("Text cleaning complete.")

In [None]:
# Topic keywords, lower-case to match the lower-cased 'cleaned_text' field.
keywords = ["israel", "hamas", "gaza", "palestine", "war","airstrikes","rafah","yemen"]

def is_relevant(article):
    """Return True if the article's cleaned text contains a word starting with a keyword.

    Keywords are anchored at the start of a word. A plain substring test
    made "war" match unrelated words such as "toward", "award" or
    "software". Inflected forms that extend a keyword ("israeli", "gazas",
    "wars") still match. Words that merely begin with a keyword (e.g.
    "warning") are still accepted.

    Args:
        article: Mapping with a 'cleaned_text' string entry.

    Returns:
        bool: whether any keyword starts a word in the cleaned text.
    """
    if not keywords:
        return False
    # Built per call so that later edits to `keywords` are honoured; the re
    # module caches the compiled pattern internally.
    pattern = r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')'
    return re.search(pattern, article['cleaned_text']) is not None

# Keep only the articles that pass the keyword filter, preserving their order.
relevant_articles = list(filter(is_relevant, articles))

print(f"Found {len(relevant_articles)} relevant articles.")


In [None]:


def extract_events(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and return its result unchanged."""
    return nlp(text)

# Run NER on the original article body rather than 'cleaned_text'. The
# pipeline uses a *cased* CoNLL-03 checkpoint, and 'cleaned_text' is
# lower-cased with punctuation removed, which strips the capitalization and
# sentence boundaries that entity recognition relies on. 'cleaned_text' is
# still used for keyword filtering.
for article in relevant_articles:
    article['events'] = extract_events(article['articleBody'])

print("Event extraction complete.")

In [None]:
# Maps a day (datetime at midnight) to a list of (article title, entity) pairs.
timeline = defaultdict(list)

for article in relevant_articles:
    # Use the modification date when present, otherwise the scrape date;
    # articles with neither are left out of the timeline.
    for date_field in ('dateModified', 'scrapedDate'):
        if date_field in article and '$date' in article[date_field]:
            # Only the leading YYYY-MM-DD part of the timestamp is kept.
            date = article[date_field]['$date'][:10]
            break
    else:
        continue

    date_obj = datetime.datetime.strptime(date, '%Y-%m-%d')
    # Append per entity (instead of touching timeline[date_obj] up front) so
    # that an article without entities does not create an empty day entry.
    for event in article['events']:
        event_name = article['title']  # the article title labels every entity
        timeline[date_obj].append((event_name, event))

print("Timeline creation complete.")

In [None]:
timeline_table = []

# Emit rows chronologically; `timeline` is keyed in the order articles were
# encountered, which is not necessarily sorted by date.
for date, events in sorted(timeline.items(), key=lambda item: item[0]):
    date_str = date.strftime('%Y-%m-%d')
    num_events = len(events)
    # The timeline stores one (title, entity) pair per entity, so a title is
    # repeated once per entity. Keep each distinct title once, in first-seen
    # order, so the summarizer input (which summarize_titles cuts to 512
    # characters) covers all of the day's articles rather than one repeated
    # title.
    titles = list(dict.fromkeys(event[0] for event in events))
    combined_title = ", ".join(titles)
    summary = summarize_titles([combined_title])[0]
    timeline_table.append([date_str, num_events, summary])

print(tabulate(timeline_table, headers=['Date', 'Number of Events',"Event Occured"], tablefmt='grid'))
