In [None]:
!pip install feedparser TextBlob
!pip install nltk transformers torch

To do:
1. Verify that the database is storing every fetched news item correctly
2. Implement the data analysis part


In [None]:
import sqlite3
import pandas as pd

def check_database_entries(db_path='news_sentiment.db'):
    """Print a sanity report for the ``sentiment_scores`` table.

    The report contains the row count, the first rows, summary statistics of
    the ``score`` column, a per-column null check and the min/max of the
    ``date`` column.

    Args:
        db_path: Path of the SQLite database file to inspect. Defaults to
            ``'news_sentiment.db'`` so existing no-argument calls keep working.

    Raises:
        sqlite3.OperationalError: If the ``sentiment_scores`` table cannot be
            queried (the error from the COUNT query is propagated).
    """
    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    try:
        # Get the number of entries in the database
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM sentiment_scores")
        count = cursor.fetchone()[0]
        print(f"Total number of entries in the database: {count}")

        # Fetch all entries into memory; everything below works on the frame only
        df = pd.read_sql_query("SELECT * FROM sentiment_scores", conn)
    finally:
        # Always release the connection, even when a query above fails
        conn.close()

    # Display the first few rows
    print("\nFirst few entries:")
    print(df.head())

    # Display summary statistics
    print("\nSummary statistics of sentiment scores:")
    print(df['score'].describe())

    # Check for any null values
    null_counts = df.isnull().sum()
    if null_counts.sum() > 0:
        print("\nWarning: Null values found in the following columns:")
        print(null_counts[null_counts > 0])
    else:
        print("\nNo null values found in the database.")

    # Check the date range of the entries
    # (dates are compared as stored; the ISO "YYYY-MM-DD" text written by the
    # ingestion code sorts chronologically)
    print("\nDate range of entries:")
    print(f"Earliest date: {df['date'].min()}")
    print(f"Latest date: {df['date'].max()}")

# Run the database check (executes whenever this cell runs and prints the report to stdout)
check_database_entries()

Total number of entries in the database: 209

First few entries:
         date      time                                              title  \
0  2024-10-09  07:01:27  'Jalebi teri, Haryana mera': 'Sweet' turn of e...   
1  2024-10-09  12:35:05  'Art of turning victory into defeat ... ': Shi...   
2  2024-10-09  12:34:34  Exclusive: 'Lack of intent from India in T20 W...   
3  2024-10-09  12:05:30  ‘Analysing unexpected results’: Rahul’s first ...   
4  2024-10-09  13:50:42  Hiring at engineering colleges to jump 25%: TC...   

                                             summary     score  
0  BJP's delight in the Haryana poll results saw ...  5.712750  
1  Shiv Sena's Saamana editorial criticizes Congr...  2.437750  
2  India's Women's T20 World Cup campaign faces a...  5.733875  
3  Rahul Gandhi thanked Jammu and Kashmir residen...  2.451250  
4  India's IT sector is set to see a 20-25% incre...  2.265750  

Summary statistics of sentiment scores:
count    209.000000
mean       4.68

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import requests
import feedparser
import sqlite3
from datetime import datetime

# Download necessary NLTK data
# (the 'vader_lexicon' resource is fetched before the analyzer below is built;
# quiet=True keeps the download from cluttering the cell output)
nltk.download('vader_lexicon', quiet=True)

# Initialize NLTK's VADER sentiment analyzer
# `sia` is a module-level object used by analyze_sentiment().
sia = SentimentIntensityAnalyzer()

# Load pre-trained FinBERT model and tokenizer
# `tokenizer` / `model` are read as globals by analyze_sentiment().
# NOTE(review): presumably downloaded from the Hugging Face Hub on first use and
# cached afterwards — confirm for offline runs.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Load the additional transformers models
# `roberta_tokenizer` / `roberta_model` are read as globals by roberta_sentiment_analysis().
roberta_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
roberta_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Summarization pipeline used by generate_summary() to shorten long texts.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Function to perform sentiment analysis using RoBERTa
def roberta_sentiment_analysis(text):
    """Score ``text`` with the module-level RoBERTa sentiment model.

    Args:
        text: The text to classify. It is truncated to 512 tokens.

    Returns:
        A ``(score, confidence)`` tuple. ``score`` is the predicted class
        index multiplied by 5; ``confidence`` is the softmax probability of
        that class.
        NOTE(review): ``index * 5`` assumes the classes are ordered
        negative / neutral / positive — confirm against the model card.
    """
    inputs = roberta_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Pure inference: disable autograd so no gradient graph is built or kept alive
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    roberta_probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    roberta_sentiment = torch.argmax(roberta_probs).item()
    return roberta_sentiment * 5, torch.max(roberta_probs).item()

# BART summarization for long texts
def generate_summary(text):
    """Return a shortened version of ``text`` when it is long.

    Args:
        text: The text to (possibly) summarize.

    Returns:
        ``text`` unchanged when it has at most 1000 characters; otherwise the
        ``summary_text`` produced by the module-level ``summarizer`` pipeline
        (30 to 100 tokens, deterministic decoding).
    """
    if len(text) > 1000:
        # truncation=True clips inputs that exceed the summarization model's
        # maximum input length instead of letting the pipeline fail on them.
        summary = summarizer(
            text,
            max_length=100,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        return summary[0]['summary_text']
    return text

# Main sentiment analysis function
def analyze_sentiment(text):
    """Combine VADER, FinBERT and RoBERTa into one sentiment score in [0, 10].

    Each model's score is put on a 0-10 scale and the three are averaged with
    weights proportional to each model's confidence. The result is clamped to
    [0, 10] and then passed through ``post_process_sentiment_score``.

    Args:
        text: The text to score (may be ``None`` or empty).

    Returns:
        ``5.0`` (neutral) when ``text`` is missing or shorter than 10
        characters after stripping; otherwise the post-processed combined score.
    """
    # Handle missing or short summaries
    if not text or len(text.strip()) < 10:
        return 5.0  # Neutral score for missing or insufficient text

    # Generate summary if the text is too long
    summarized_text = generate_summary(text)

    # VADER sentiment analysis: compound is in [-1, 1], rescale to [0, 10]
    vader_score = sia.polarity_scores(summarized_text)
    normalized_vader_score = (vader_score['compound'] + 1) * 5

    # FinBERT sentiment analysis (inference only, so autograd is disabled)
    inputs = tokenizer(summarized_text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    finbert_probs = torch.nn.functional.softmax(outputs.logits, dim=1)
    finbert_sentiment = torch.argmax(finbert_probs).item()

    # Map the predicted class through the model's own label names rather than
    # assuming index order: ProsusAI/finbert does not order its classes
    # negative/neutral/positive, so `index * 5` would mis-score its predictions.
    label_scores = {'negative': 0, 'neutral': 5, 'positive': 10}
    id2label = getattr(model.config, 'id2label', None) or {}
    finbert_label = str(id2label.get(finbert_sentiment, '')).lower()
    # Fall back to the index-based scale only when the label name is unknown
    normalized_finbert_score = label_scores.get(finbert_label, finbert_sentiment * 5)

    # RoBERTa sentiment analysis
    roberta_score, roberta_confidence = roberta_sentiment_analysis(summarized_text)

    # Confidence-based adjustment
    finbert_confidence = torch.max(finbert_probs).item()  # Confidence of the highest FinBERT class
    vader_confidence = abs(vader_score['compound'])  # VADER confidence derived from compound score
    # The softmax maxima are strictly positive, so this sum is never zero
    total_confidence = finbert_confidence + vader_confidence + roberta_confidence

    # Assign weights based on confidence
    finbert_weight = finbert_confidence / total_confidence
    vader_weight = vader_confidence / total_confidence
    roberta_weight = roberta_confidence / total_confidence

    # Combine the scores based on weighted confidence
    combined_score = (
        (vader_weight * normalized_vader_score) +
        (finbert_weight * normalized_finbert_score) +
        (roberta_weight * roberta_score)
    )

    # Ensure the combined score remains between 0 and 10
    combined_score = max(0, min(combined_score, 10))

    return post_process_sentiment_score(summarized_text, combined_score)

# Post-processing to handle edge cases
def post_process_sentiment_score(text, score):
    """Adjust ``score`` when ``text`` mentions health-related keywords.

    Any negative keyword caps the score at 4.0. Afterwards, any positive
    keyword lifts a score that is still below 5 up to 6.0, so a text that
    contains both kinds of keyword ends at 6.0.

    Keywords are matched case-insensitively and must begin at a word
    boundary. Inflected forms such as "hospitals" or "critically" still
    match, but a keyword embedded in another word ("unhealthy") does not.

    Args:
        text: The text that was scored.
        score: The combined sentiment score to adjust.

    Returns:
        The adjusted score, or ``score`` unchanged when no keyword applies.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    # Check for certain keywords that may skew sentiment (e.g., hospital, critical, death)
    negative_keywords = ['hospital', 'intensive care', 'critical', 'death', 'emergency']
    positive_keywords = ['good spirits', 'recovery', 'improving', 'healthy']

    lowered = text.lower()  # lower-case once instead of once per keyword

    def _mentions(keyword):
        # Anchor at a word start so "healthy" is not found inside "unhealthy"
        return re.search(r'\b' + re.escape(keyword), lowered) is not None

    if any(_mentions(word) for word in negative_keywords):
        score = min(score, 4.0)  # Cap the score on the lower side

    if score < 5 and any(_mentions(word) for word in positive_keywords):
        score = max(score, 6.0)  # Raise the score slightly if positive keywords found

    return score

# Database setup
def setup_database(db_path='news_sentiment.db'):
    """Open the SQLite database and make sure ``sentiment_scores`` exists.

    Args:
        db_path: Database file to open. Defaults to ``'news_sentiment.db'`` so
            existing no-argument calls behave as before.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for closing it.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS sentiment_scores
                 (date TEXT, time TEXT, title TEXT, summary TEXT, score REAL)''')
        conn.commit()
    except Exception:
        # Do not leak the handle when the schema cannot be created
        conn.close()
        raise
    return conn

# Store the sentiment score in the database
def store_score(conn, date, time, title, summary, score):
    """Insert one scored news item into ``sentiment_scores`` and commit.

    Args:
        conn: Open SQLite connection whose database has the ``sentiment_scores`` table.
        date: Value for the ``date`` column.
        time: Value for the ``time`` column.
        title: Value for the ``title`` column.
        summary: Value for the ``summary`` column.
        score: Value for the ``score`` column.
    """
    row = (date, time, title, summary, score)
    cursor = conn.cursor()
    # Positional placeholders: the values are bound, never interpolated into the SQL
    cursor.execute("INSERT INTO sentiment_scores VALUES (?, ?, ?, ?, ?)", row)
    conn.commit()

# Fetch news and analyze sentiment
def fetch_news_and_analyze(url, conn, timeout=30):
    """Download an RSS feed, score every entry and store the results.

    For each feed entry the summary is scored with ``analyze_sentiment``, a
    short report is printed, and a row is written via ``store_score``. The
    stored date and time are taken from the entry's publication timestamp,
    expressed in the UTC offset the feed itself supplies.

    Args:
        url: URL of the RSS feed.
        conn: Open SQLite connection passed through to ``store_score``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            HTTP response.
    """
    # Local import: the notebook's import cell does not provide this name
    from email.utils import parsedate_to_datetime

    # Without a timeout a stalled server would hang the cell indefinitely
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as a feed
    response.raise_for_status()
    feed = feedparser.parse(response.content)

    for entry in feed.entries:
        # Entries are not guaranteed to carry every field; default to empty text
        title = entry.get('title', '')
        summary = entry.get('summary', '')
        published = entry.get('published', '')

        # Parse the published date and time. parsedate_to_datetime accepts the
        # RFC 2822 variants used by feeds (numeric offsets as well as "GMT").
        try:
            dt = parsedate_to_datetime(published)
        except (TypeError, ValueError):
            print(f"Skipping entry with unparseable publication date {published!r}: {title}")
            continue
        date = dt.strftime("%Y-%m-%d")
        time_of_day = dt.strftime("%H:%M:%S")

        score = analyze_sentiment(summary)

        print(f"Title: {title}")
        print(f"Published: {published}")
        print(f"Summary: {summary}")
        print(f"Sentiment Score: {score:.2f}")
        print("---")

        # Store in the database
        store_score(conn, date, time_of_day, title, summary, score)

# Get the daily average sentiment score
def get_daily_average(conn, date):
    """Return the mean ``score`` of all rows stored for ``date``.

    Args:
        conn: Open SQLite connection whose database has the ``sentiment_scores`` table.
        date: Date string compared for equality with the ``date`` column.

    Returns:
        The average score, or ``None`` when no row matches ``date``.
    """
    query = "SELECT AVG(score) FROM sentiment_scores WHERE date = ?"
    # An aggregate query always yields exactly one row with one column
    (average,) = conn.cursor().execute(query, (date,)).fetchone()
    return average

# Main script: fetch the feed, score and store every entry, then report today's average
url = "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"

# Setup the database
conn = setup_database()

try:
    # Fetch news, analyze sentiment, and store in database
    fetch_news_and_analyze(url, conn)

    # Get today's date
    # NOTE: this is the local clock's date, whereas stored dates come from each
    # entry's published timestamp, so the two may disagree around midnight or
    # across time zones.
    today = datetime.now().strftime("%Y-%m-%d")

    # Calculate and print daily average
    daily_avg = get_daily_average(conn, today)
    if daily_avg is None:
        # No row matched today's date; formatting None with ':.2f' would raise TypeError
        print(f"No sentiment scores stored for today ({today}).")
    else:
        print(f"Average sentiment score for today ({today}): {daily_avg:.2f}")
finally:
    # Close the database connection even when fetching or scoring fails
    conn.close()

print("Data has been stored in the 'news_sentiment.db' database.")




Title: ‘Anyone threatens India, he’s a total killer’: Trump on PM Modi
Published: Wed, 09 Oct 2024 20:49:22 +0530
Summary: During an appearance on the Flagrant Podcast, Donald Trump spoke highly of Indian Prime Minister Narendra Modi, calling him a "friend" and praising his leadership as stabilising India. Trump described Modi as a "killer" in political dealings despite his calm exterior. Reflecting on the 2019 Howdy Modi event in Houston, Trump recalled the large turnout and their shared resolve during crises. He also noted that when India was threatened, Modi would adopt a determined stance, assuring Trump that he would "do anything necessary" to defend the country.
Sentiment Score: 7.00
---
Title: Congress's 'alliance of convenience' comes under fire after Haryana debacle
Published: Wed, 09 Oct 2024 21:16:30 +0530
Summary: 
Sentiment Score: 5.00
---
Title: Delhi CM office says Atishi's belongings removed from official house on LG's order
Published: Wed, 09 Oct 2024 18:29:29 +0530
Su