In [None]:
import pandas as pd
# Load the news-event table from news.csv (path is relative to the working directory).
data = pd.read_csv("news.csv")
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,Company Name,Event Title,URL,Date,Event Description
0,Reliance Industries,First Dividend Post-Bonus Issue Announcement,https://www.angelone.in/news/market-updates/re...,2025-09-01,Reliance Industries declared ₹5.50 dividend pe...
1,Reliance Industries,Annual General Meeting 2025,https://upstox.com/news/business-news/latest-u...,2025-08-29,RIL AGM 2025 highlighted key announcements inc...
2,Reliance Industries,Partnership with Google Cloud for AI Solutions,https://www.ril.com,2025-09-04,Reliance and Google Cloud expanded partnership...
3,Reliance Industries,Institutional Investors Meeting - Goldman Sach...,https://economictimes.com/reliance-industries-...,2025-09-04,Company executives participated in Goldman Sac...
4,Reliance Industries,Upcoming CITIC CLSA Investors Forum 2025,https://economictimes.com/reliance-industries-...,2025-09-08,RIL executives scheduled to participate in ins...


In [None]:
# (rows, columns) of the loaded table.
data.shape

(120, 5)

In [None]:
# One free-text description per row of the table.
events = data["Event Description"]
# Show the last description by position, so this keeps working when the CSV
# has a different number of rows or a non-default index (a hard-coded label
# such as 119 would raise KeyError in that case).
events.iloc[-1]

'Bajaj Auto scheduled analyst conference call for September 2, 2025, for investor communication and business performance discussions.'

In [None]:
# Article links, one per row, taken from the "URL" column.
urls = data["URL"]
# Preview the first three links.
urls.head(3)

Unnamed: 0,URL
0,https://www.angelone.in/news/market-updates/re...
1,https://upstox.com/news/business-news/latest-u...
2,https://www.ril.com


In [None]:
# Make sure you have the required libraries installed:
# pip install transformers
# pip install torch
# pip install sentencepiece
# pip install requests
# pip install beautifulsoup4

from transformers import pipeline
import json
import requests
from bs4 import BeautifulSoup

# Seconds to wait for a server before giving up, so one stalled host cannot hang the run.
_REQUEST_TIMEOUT_SECONDS = 30

# Pipelines that have already been built, keyed by model name, so the model is
# loaded once instead of once per article.
_EXTRACTOR_CACHE = {}


def _get_extractor(model_name='google/pegasus-cnn_dailymail'):
    """Return the text2text-generation pipeline for ``model_name``, building it on first use."""
    if model_name not in _EXTRACTOR_CACHE:
        print(f"Initializing the text-to-text generation model ({model_name})...")
        _EXTRACTOR_CACHE[model_name] = pipeline('text2text-generation', model=model_name)
        print("Model initialized successfully.")
    return _EXTRACTOR_CACHE[model_name]


def get_event_from_url(url, headline):
    """
    Fetches a news article from a URL, extracts the main event using a
    text2text-generation model, and returns the details as a dictionary.

    Args:
        url (str): The URL of the news article.
        headline (str): The headline of the news article.

    Returns:
        dict: On success, ``news_url``, ``news_headline`` and ``news_event``.
              On failure, ``news_url``, ``news_headline`` and ``error`` (a
              human-readable message). No exception is propagated to the caller.
    """
    def _failure(message):
        # Keep the url/headline on failures so a bad row can be traced afterwards.
        return {"news_url": url, "news_headline": headline, "error": message}

    try:
        # 1. Fetch the article content from the web.
        print(f"Fetching article from: {url}...")
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # 2. Parse the HTML and extract the main article text.
        # This part is generic and might need adjustments for different website structures.
        soup = BeautifulSoup(response.content, 'html.parser')
        article_text = ' '.join(p.get_text() for p in soup.find_all('p')).strip()

        # Pages whose <p> tags hold only whitespace have nothing to summarise either.
        if not article_text:
            return _failure("Could not find any paragraph text on the page.")

        print("Article text extracted successfully.")

        # 3. Get the (cached) language model.
        extractor = _get_extractor()

        # 4. Create a prompt to ask the model to extract the event.
        prompt = f"""
        Read the following news article and extract the main event described in it.

        Article: "{article_text}"

        Extracted Event:
        """

        # 5. Get the event from the model.
        # truncation=True makes the pipeline cut over-long inputs down to the
        # model's maximum input length instead of feeding it an oversized sequence.
        print("Extracting the news event from the text...")
        outputs = extractor(
            prompt,
            max_new_tokens=512,
            clean_up_tokenization_spaces=True,
            truncation=True,
        )
        event = outputs[0]['generated_text'].strip()
        print("Event extracted successfully.")

        # 6. Assemble and return the final result.
        return {
            "news_url": url,
            "news_headline": headline,
            "news_event": event
        }

    except requests.exceptions.RequestException as e:
        return _failure(f"Failed to fetch the URL: {e}")
    except Exception as e:
        return _failure(f"An unexpected error occurred: {e}")

# --- Example Usage ---
if __name__ == "__main__":
    # Walk both columns in lockstep instead of indexing with a hard-coded
    # range(120): this follows the data's real length and does not depend on
    # the Series having a default 0..N-1 index.
    # Note: the value passed as "headline" comes from the Event Description column.
    for sample_url, sample_headline in zip(urls, events):
        # Get the structured event data
        event_data = get_event_from_url(sample_url, sample_headline)

        # Print the final, structured JSON object in a readable format
        print("\n--- Extracted Event (JSON Format) ---")
        print(json.dumps(event_data, indent=4))




In [None]:
from transformers import pipeline, AutoTokenizer
import json
import requests
from bs4 import BeautifulSoup

# Load the summarisation model and its matching tokenizer once at cell level,
# so they can be passed into get_event_from_url for every URL instead of being rebuilt.
model_name = 'mrm8488/t5-base-finetuned-summarize-news'
extractor = pipeline('text2text-generation', model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_length = 512  # token budget for model input; the tokenizer warning in this notebook's output reports 512 as this model's maximum

# Seconds to wait on a server before abandoning the request.
_FETCH_TIMEOUT_SECONDS = 30

# Instruction wrapper placed around the article text. The wording and whitespace
# are the same as the prompt this cell previously built inline.
_PROMPT_TEMPLATE = """
        Read the following news article and extract the main event described in it.

        Article: "{article}"

        Extracted Event:
        """


def get_event_from_url(url, headline, extractor, tokenizer, max_length):
    """
    Fetch a page, join the text of its <p> tags, and ask ``extractor`` for the main event.

    Args:
        url (str): Address of the article to download.
        headline (str): Label stored alongside the result; not sent to the model.
        extractor: Callable text2text-generation pipeline; called as
            ``extractor(prompt, max_new_tokens=150, truncation=True)`` and expected
            to return a list whose first item has a ``generated_text`` key.
        tokenizer: Tokenizer matching ``extractor``; used to size and truncate the article.
        max_length (int): Maximum number of input tokens for the whole prompt.

    Returns:
        dict: ``news_url``, ``news_headline`` and either ``news_event`` (success)
        or ``error`` (failure). No exception is propagated to the caller.
    """
    def _failure(message):
        return {"news_url": url, "news_headline": headline, "error": message}

    try:
        print(f"Fetching article from: {url}...")
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=_FETCH_TIMEOUT_SECONDS)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        article_text = ' '.join(p.get_text() for p in soup.find_all('p')).strip()

        # Whitespace-only paragraphs count as "no text" too.
        if not article_text:
            return _failure("Could not find any paragraph text on the page.")

        # The instruction wrapper consumes tokens as well. Truncating the article
        # to the full max_length and then wrapping it pushes the final prompt past
        # the model limit, so reserve room for the wrapper plus one end-of-sequence
        # token before truncating the article.
        wrapper_ids = tokenizer(
            _PROMPT_TEMPLATE.format(article=""), add_special_tokens=False
        )["input_ids"]
        article_budget = max(1, max_length - len(wrapper_ids) - 1)

        # Plain Python lists are enough here; no tensors are needed just to decode.
        article_ids = tokenizer(
            article_text,
            max_length=article_budget,
            truncation=True,
            add_special_tokens=False,
        )["input_ids"]
        truncated_text = tokenizer.decode(article_ids, skip_special_tokens=True)

        prompt = _PROMPT_TEMPLATE.format(article=truncated_text)

        # truncation=True is a backstop: decoding and re-tokenizing can shift the
        # token count slightly, and the pipeline should never receive an oversized input.
        outputs = extractor(prompt, max_new_tokens=150, truncation=True)
        event = outputs[0]['generated_text'].strip()

        return {
            "news_url": url,
            "news_headline": headline,
            "news_event": event
        }

    except requests.exceptions.RequestException as e:
        return _failure(f"Failed to fetch the URL: {e}")
    except Exception as e:
        return _failure(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    # Pair each URL with its description before filtering, so the two can never
    # drift out of alignment. Only real strings with an http(s) scheme are kept;
    # the isinstance check skips missing cells (NaN), which have no .startswith.
    valid_pairs = [
        (url, headline)
        for url, headline in zip(urls, events)
        if isinstance(url, str) and url.startswith(('http://', 'https://'))
    ]

    all_event_data = []

    for sample_url, sample_headline in valid_pairs:
        event_data = get_event_from_url(sample_url, sample_headline, extractor, tokenizer, max_length)
        print(json.dumps(event_data, indent=4))
        all_event_data.append(event_data)

    # Save all event data to JSON file
    with open("extracted_events.json", "w", encoding="utf-8") as f:
        json.dump(all_event_data, f, ensure_ascii=False, indent=2)

    print("All extracted events saved to extracted_events.json")


Device set to use cpu


Fetching article from: https://www.angelone.in/news/market-updates/reliance-industries-dividend-2025-first-5-50-payout-post-bonus-issue-confirmed-at-agm...
{
    "news_url": "https://www.angelone.in/news/market-updates/reliance-industries-dividend-2025-first-5-50-payout-post-bonus-issue-confirmed-at-agm",
    "news_headline": "Reliance Industries declared \u20b95.50 dividend per share for FY25, marking its first dividend after 1:1 bonus issue in October 2024, with overwhelming shareholder support of 99.9994%.",
    "news_event": "Reliance Industries has announced its first dividend payout of 5.50 per share for FY 2024-25, marking its initial return following the 1:1 bonus issue in October 2024. The final dividend of 5.50 per equity share of 10 face value for the year ending March 31, 2025 will be declared within a week. The stock registered a gain of 1.37% over the past week. However, it has declined by 0.90% over the past month and 3 months.nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn

Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors


{
    "news_url": "https://www.ril.com",
    "news_headline": "Reliance and Google Cloud expanded partnership to accelerate India's growth with AI, focusing on enterprise AI solutions and digital transformation.",
    "news_event": "Reliance Retail is the largest producer of petrochemicals in the country and among the top 10 in the world. Jio, our digital services business, democratises access to digital services for millions of people. Reliance is committed to helping India achieve a net-zero carbon target by 2035. The media and entertainment business is pan-India."
}
Fetching article from: https://economictimes.com/reliance-industries-ltd/stocksupdate/companyid-13215.cms...
{
    "news_url": "https://economictimes.com/reliance-industries-ltd/stocksupdate/companyid-13215.cms",
    "news_headline": "Company executives participated in Goldman Sachs conference with one-on-one investor meetings, with no unpublished price sensitive information shared.",
    "news_event": "the combined mark