In [8]:
!pip install feedparser TextBlob
!pip install nltk transformers torch



In [13]:
import nltk
import torch
import requests
import feedparser
import sqlite3
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline
)
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
import logging
from typing import List, Tuple, Set
from wordcloud import WordCloud
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

# Configure logging: root logger at INFO, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
# SQLite database file holding the `sentiment_scores` table.
DATABASE_NAME = 'news_sentiment.db'
# RSS feed URL (Times of India top stories).
NEWS_FEED_URL = "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"
# strftime/strptime pattern for calendar dates, e.g. "2024-01-31".
DATE_FORMAT = "%Y-%m-%d"
# strftime/strptime pattern such as "Wed, 31 Jan 2024 10:15:00 +0530".
# NOTE(review): presumably matches the feed's publication timestamps - confirm
# against the code that parses feed entries.
DATETIME_FORMAT = "%a, %d %b %Y %H:%M:%S %z"

class DatabaseManager:
    """SQLite-backed store for per-headline sentiment scores.

    A single persistent connection is opened by the constructor and used for
    schema creation, writes and reads. Call :meth:`close` when finished.
    """

    def __init__(self, db_name: str):
        """Open the database at ``db_name`` and make sure the schema exists.

        Args:
            db_name: Database path handed to ``sqlite3.connect``.
        """
        self.db_name = db_name
        self.connection = None
        # Connect first and create the schema on that same connection. Using a
        # second, throw-away connection for the schema leaked it
        # (``with sqlite3.connect(...)`` commits but never closes) and left
        # ":memory:" databases without the table.
        self._setup_connection()
        self._setup_database()

    def _setup_database(self):
        """Create the ``sentiment_scores`` table and its indexes if missing."""
        self._setup_connection()
        with self.connection:
            self.connection.execute('''
                CREATE TABLE IF NOT EXISTS sentiment_scores (
                    date TEXT,
                    time TEXT,
                    title TEXT,
                    summary TEXT,
                    score REAL
                )
            ''')
            # Add indexes for better query performance
            self.connection.execute('CREATE INDEX IF NOT EXISTS idx_date ON sentiment_scores(date)')
            self.connection.execute('CREATE INDEX IF NOT EXISTS idx_title ON sentiment_scores(title)')
            self.connection.execute('CREATE INDEX IF NOT EXISTS idx_score ON sentiment_scores(score)')

    def _setup_connection(self):
        """Open the persistent connection (once) and apply tuning PRAGMAs."""
        if self.connection is not None:
            return
        self.connection = sqlite3.connect(self.db_name, check_same_thread=False)
        self.connection.execute('PRAGMA journal_mode=WAL')  # Write-Ahead Logging
        self.connection.execute('PRAGMA synchronous=NORMAL')
        self.connection.execute('PRAGMA cache_size=-2000')  # 2MB cache
        self.connection.execute('PRAGMA temp_store=MEMORY')

    def store_score(self, date: str, time: str, title: str, summary: str, score: float):
        """Insert one scored headline.

        Best-effort: failures are logged and swallowed so that a single bad
        row does not abort the caller.
        """
        try:
            # The connection context manager commits on success and rolls the
            # transaction back if the insert raises.
            with self.connection:
                self.connection.execute(
                    "INSERT INTO sentiment_scores VALUES (?, ?, ?, ?, ?)",
                    (date, time, title, summary, score)
                )
        except Exception as e:
            logging.error(f"Error storing score: {e}")

    def get_daily_average(self, date: str) -> float:
        """Return the mean score of all rows stored for ``date``.

        Deliberately not memoized: the average changes whenever a row is
        stored, so a cache keyed on ``date`` would serve stale values (and
        would also remember the ``0.0`` error fallback).

        Returns:
            The average score, or ``0.0`` when there are no rows for ``date``
            or the query fails.
        """
        try:
            result = self.connection.execute(
                "SELECT AVG(score) FROM sentiment_scores WHERE date = ?",
                (date,)
            ).fetchone()
            return result[0] if result[0] is not None else 0.0
        except Exception as e:
            logging.error(f"Error getting daily average: {e}")
            return 0.0

    def get_headlines_with_scores(self, limit: int = 10) -> pd.DataFrame:
        """Retrieve recent headlines with their sentiment scores.

        Args:
            limit: Maximum number of rows to return.

        Returns:
            DataFrame with columns ``date``, ``time``, ``title``, ``score``
            ordered newest first; an empty DataFrame if the query fails.
        """
        try:
            query = """
                SELECT date, time, title, score
                FROM sentiment_scores
                ORDER BY date DESC, time DESC
                LIMIT ?
            """
            return pd.read_sql_query(query, self.connection, params=(limit,))
        except Exception as e:
            logging.error(f"Error retrieving headlines: {e}")
            return pd.DataFrame()

    def _headlines_ordered_by_score(self, descending: bool) -> pd.DataFrame:
        """Return the five rows with the highest (or lowest) score."""
        # The direction is chosen from two fixed literals, never from input.
        direction = 'DESC' if descending else 'ASC'
        query = f"""
            SELECT date, time, title, score
            FROM sentiment_scores
            ORDER BY score {direction}
            LIMIT 5
        """
        return pd.read_sql_query(query, self.connection)

    def get_extreme_sentiment_headlines(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Get headlines with highest and lowest sentiment scores.

        Returns:
            ``(most_positive, most_negative)``, each holding up to five rows;
            two empty DataFrames if either query fails.
        """
        try:
            most_positive = self._headlines_ordered_by_score(descending=True)
            most_negative = self._headlines_ordered_by_score(descending=False)
            return most_positive, most_negative
        except Exception as e:
            logging.error(f"Error retrieving extreme headlines: {e}")
            return pd.DataFrame(), pd.DataFrame()

    def get_headline_stats(self) -> dict:
        """Get statistical information about headlines.

        Returns:
            Dict with ``average_sentiment`` (``None`` when the table is
            empty), ``total_headlines`` and ``date_range``
            (``"<earliest> to <latest>"``); an empty dict if a query fails.
        """
        try:
            stats = {}

            # Get average sentiment score
            avg_query = "SELECT AVG(score) FROM sentiment_scores"
            stats['average_sentiment'] = self.connection.execute(avg_query).fetchone()[0]

            # Get total number of headlines
            count_query = "SELECT COUNT(*) FROM sentiment_scores"
            stats['total_headlines'] = self.connection.execute(count_query).fetchone()[0]

            # Get date range
            range_query = """
                SELECT
                    MIN(date) as earliest_date,
                    MAX(date) as latest_date
                FROM sentiment_scores
            """
            earliest, latest = self.connection.execute(range_query).fetchone()
            stats['date_range'] = f"{earliest} to {latest}"

            return stats
        except Exception as e:
            logging.error(f"Error getting headline stats: {e}")
            return {}

    def close(self):
        """Close the persistent connection, if one was opened."""
        if self.connection:
            self.connection.close()

class SentimentAnalyzer:
    """Ensemble sentiment scorer combining VADER, FinBERT and Twitter-RoBERTa.

    Every model places a text on a 0-10 scale (0 negative, 5 neutral,
    10 positive) and reports a confidence; the final score is the
    confidence-weighted mean of the three.
    """

    # NLTK package name -> resource path probed with ``nltk.data.find``. Each
    # package lives in its own category directory, so probing
    # ``tokenizers/<name>`` for all of them never finds the lexicon or the
    # stopword list and forces a download on every start.
    _NLTK_RESOURCES = {
        'vader_lexicon': 'sentiment/vader_lexicon.zip',
        'punkt': 'tokenizers/punkt',
        'stopwords': 'corpora/stopwords',
    }

    # Named sentiment class -> position on the 0-10 scale.
    _LABEL_SCORES = {'negative': 0.0, 'neutral': 5.0, 'positive': 10.0}

    def __init__(self):
        """Load all models, move them to the device and set up the cache."""
        self._initialize_models()
        self._setup_device()
        # Per-instance memo of successful results. A class-level ``lru_cache``
        # on the method would keep every instance alive and would also
        # remember the neutral fallback returned after a transient failure.
        self._cached_score = lru_cache(maxsize=1024)(self._compute_score)

    def _initialize_models(self):
        """Fetch missing NLTK data and load the VADER and transformer models."""
        # Download NLTK data only if not already present
        for resource, resource_path in self._NLTK_RESOURCES.items():
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(resource, quiet=True)

        self.sia = SentimentIntensityAnalyzer()

        self.finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
        self.finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
        self.roberta_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
        self.roberta_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    def _setup_device(self):
        """Setup device and move models to it"""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.finbert_model.to(self.device)
        self.roberta_model.to(self.device)

    def analyze_sentiment(self, text: str) -> float:
        """Score ``text`` on a 0-10 scale.

        Args:
            text: Text to score.

        Returns:
            The combined score as a float. ``5.0`` (neutral) is returned for
            empty or very short text (fewer than 10 non-blank characters) and
            when any model fails; failures are logged and not cached.
        """
        if not text or len(text.strip()) < 10:
            return 5.0

        try:
            return self._cached_score(text)
        except Exception as e:
            logging.error(f"Error in sentiment analysis: {e}")
            return 5.0

    def _compute_score(self, text: str) -> float:
        """Run the three models concurrently and blend their scores."""
        scorers = (self._get_vader_score, self._get_finbert_score, self._get_roberta_score)
        with ThreadPoolExecutor(max_workers=len(scorers)) as executor:
            futures = [executor.submit(scorer, text) for scorer in scorers]
            results = [future.result() for future in futures]

        scores = np.array([score for score, _ in results], dtype=float)
        weights = np.array([confidence for _, confidence in results], dtype=float)
        weights = weights / weights.sum()  # Normalize weights

        combined_score = float(np.dot(weights, scores))
        return max(0.0, min(combined_score, 10.0))

    @staticmethod
    def _label_to_score(label: str) -> float:
        """Map a class name (``negative``/``neutral``/``positive``) to 0/5/10.

        Raises:
            ValueError: If ``label`` is not one of the three known names.
        """
        try:
            return SentimentAnalyzer._LABEL_SCORES[label.strip().lower()]
        except KeyError:
            raise ValueError(f"Unrecognized sentiment label: {label!r}") from None

    def _classify(self, tokenizer, model, text: str) -> Tuple[int, float]:
        """Return ``(class index, probability)`` of the top class for ``text``."""
        with torch.no_grad():
            inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)
            logits = model(**inputs).logits
            probs = torch.nn.functional.softmax(logits, dim=1)
            confidence, index = torch.max(probs, dim=1)
        return int(index.item()), float(confidence.item())

    def _get_vader_score(self, text: str) -> Tuple[float, float]:
        """Return VADER's compound score rescaled to 0-10 and its magnitude."""
        compound = self.sia.polarity_scores(text)['compound']
        return (compound + 1) * 5, abs(compound)

    def _get_finbert_score(self, text: str) -> Tuple[float, float]:
        """Return FinBERT's ``(score, confidence)`` for ``text``."""
        index, confidence = self._classify(self.finbert_tokenizer, self.finbert_model, text)
        # FinBERT's class indices are not ordered negative -> positive (its
        # published config maps index 0 to "positive"), so ``index * 5`` would
        # score positive text as 0. Resolve the class name instead.
        label = self.finbert_model.config.id2label[index]
        return self._label_to_score(label), confidence

    def _get_roberta_score(self, text: str) -> Tuple[float, float]:
        """Return Twitter-RoBERTa's ``(score, confidence)`` for ``text``."""
        index, confidence = self._classify(self.roberta_tokenizer, self.roberta_model, text)
        # This checkpoint only exposes generic LABEL_<n> names; per its model
        # card the indices run 0=negative, 1=neutral, 2=positive, so the index
        # maps linearly onto the 0-10 scale.
        return float(index * 5), confidence


class DataVisualizer:
    """Builds the 3x3 Plotly dashboard from the ``sentiment_scores`` table.

    Every ``_add_*`` helper draws one panel into the shared figure. They
    expect ``df`` to hold a datetime ``date`` column and an ``hour`` column,
    both prepared by :meth:`create_visualizations`, and they leave ``df``
    unmodified.
    """

    # Calendar order for the heatmap rows; grouping alone sorts alphabetically.
    _WEEKDAY_ORDER = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                      'Friday', 'Saturday', 'Sunday')

    @staticmethod
    def _add_daily_counts(fig, df):
        """Add daily article count visualization (row 1, col 1)."""
        daily_counts = df.groupby('date').size()
        fig.add_trace(
            go.Scatter(
                x=daily_counts.index,
                y=daily_counts.values,
                mode='lines+markers',
                name='Daily Entries',
                line=dict(width=2),
                marker=dict(size=6)
            ),
            row=1, col=1
        )
        fig.update_xaxes(title_text="Date", row=1, col=1)
        fig.update_yaxes(title_text="Number of Articles", row=1, col=1)

    @staticmethod
    def _add_hourly_distribution(fig, df):
        """Add hourly distribution visualization (row 1, col 2)."""
        hourly_counts = df['hour'].value_counts().sort_index()
        fig.add_trace(
            go.Bar(
                x=hourly_counts.index,
                y=hourly_counts.values,
                name='Hourly Distribution',
                marker_color='lightblue'
            ),
            row=1, col=2
        )
        fig.update_xaxes(title_text="Hour of Day", row=1, col=2)
        fig.update_yaxes(title_text="Number of Articles", row=1, col=2)

    @staticmethod
    def _add_sentiment_timeline(fig, df):
        """Add sentiment timeline visualization (row 1, col 3)."""
        daily_sentiment = df.groupby('date')['score'].mean()
        fig.add_trace(
            go.Scatter(
                x=daily_sentiment.index,
                y=daily_sentiment.values,
                mode='lines',
                name='Daily Sentiment',
                line=dict(color='green', width=2)
            ),
            row=1, col=3
        )
        fig.update_xaxes(title_text="Date", row=1, col=3)
        fig.update_yaxes(title_text="Average Sentiment Score", row=1, col=3)

    @staticmethod
    def _add_summary_distribution(fig, df):
        """Add summary length distribution visualization (row 2, col 1)."""
        fig.add_trace(
            go.Histogram(
                x=df['summary'].str.len(),
                name='Summary Lengths',
                nbinsx=30,
                marker_color='lightgreen'
            ),
            row=2, col=1
        )
        fig.update_xaxes(title_text="Summary Length (characters)", row=2, col=1)
        fig.update_yaxes(title_text="Frequency", row=2, col=1)

    @staticmethod
    def _add_sentiment_distribution(fig, df):
        """Add sentiment score distribution visualization (row 2, col 2)."""
        fig.add_trace(
            go.Histogram(
                x=df['score'],
                name='Sentiment Distribution',
                nbinsx=20,
                histnorm='probability',
                marker_color='coral'
            ),
            row=2, col=2
        )
        fig.update_xaxes(title_text="Sentiment Score", row=2, col=2)
        fig.update_yaxes(title_text="Probability", row=2, col=2)

    @staticmethod
    def _weekly_hour_matrix(df):
        """Return mean score per (weekday, hour), rows in calendar order.

        Works on a private frame so the caller's ``df`` gains no columns.
        Weekdays that never occur in the data are omitted.
        """
        frame = pd.DataFrame({
            'weekday': df['date'].dt.day_name(),
            'hour': df['hour'],
            'score': df['score'],
        })
        matrix = frame.pivot_table(
            values='score',
            index='weekday',
            columns='hour',
            aggfunc='mean'
        )
        present_days = [day for day in DataVisualizer._WEEKDAY_ORDER if day in matrix.index]
        return matrix.reindex(present_days)

    @staticmethod
    def _add_weekly_patterns(fig, df):
        """Add weekly sentiment patterns visualization (row 2, col 3)."""
        weekly_sentiment = DataVisualizer._weekly_hour_matrix(df)

        fig.add_trace(
            go.Heatmap(
                z=weekly_sentiment.values,
                x=weekly_sentiment.columns,
                y=weekly_sentiment.index,
                colorscale='RdYlBu',
                name='Weekly Patterns'
            ),
            row=2, col=3
        )
        fig.update_xaxes(title_text="Hour of Day", row=2, col=3)
        fig.update_yaxes(title_text="Day of Week", row=2, col=3)

    @staticmethod
    def _add_sentiment_moving_average(fig, df):
        """Add sentiment moving average visualization (row 3, col 1)."""
        daily_sentiment = df.groupby('date')['score'].mean().reset_index()
        # A full 7-row window is required, so the first six days are NaN.
        daily_sentiment['MA7'] = daily_sentiment['score'].rolling(window=7).mean()

        fig.add_trace(
            go.Scatter(
                x=daily_sentiment['date'],
                y=daily_sentiment['MA7'],
                mode='lines',
                name='7-Day Moving Average',
                line=dict(color='purple', width=3)
            ),
            row=3, col=1
        )
        fig.update_xaxes(title_text="Date", row=3, col=1)
        fig.update_yaxes(title_text="7-Day Moving Average Score", row=3, col=1)

    @staticmethod
    def _add_headline_length_vs_sentiment(fig, df):
        """Add headline length vs sentiment visualization (row 3, col 2)."""
        # Kept local rather than stored on ``df`` to avoid mutating the input.
        title_lengths = df['title'].str.len()

        fig.add_trace(
            go.Scatter(
                x=title_lengths,
                y=df['score'],
                mode='markers',
                name='Headline Length vs Sentiment',
                marker=dict(
                    size=8,
                    color=df['score'],
                    colorscale='Viridis',
                    showscale=True,
                    colorbar=dict(
                        title="Sentiment Score",
                        x=1.05,  # Positioning to the right of the subplot
                        y=0.5,
                        len=0.5,
                        thickness=15
                    )
                )
            ),
            row=3, col=2
        )
        fig.update_xaxes(title_text="Headline Length (characters)", row=3, col=2)
        fig.update_yaxes(title_text="Sentiment Score", row=3, col=2)

    @staticmethod
    def _add_time_of_day_sentiment(fig, df):
        """Add time of day sentiment visualization (row 3, col 3)."""
        hourly_sentiment = df.groupby('hour')['score'].mean()

        fig.add_trace(
            go.Scatter(
                x=hourly_sentiment.index,
                y=hourly_sentiment.values,
                mode='lines+markers',
                name='Hourly Sentiment',
                line=dict(shape='spline', color='orangered')
            ),
            row=3, col=3
        )
        fig.update_xaxes(title_text="Hour of Day", row=3, col=3)
        fig.update_yaxes(title_text="Average Sentiment Score", row=3, col=3)

    @staticmethod
    def create_visualizations(db_name: str):
        """Load every stored row and show the nine-panel dashboard.

        Logs a warning and returns without plotting when the table is empty.

        Args:
            db_name: Path of the SQLite database to read.
        """
        # ``with sqlite3.connect(...)`` only manages the transaction, so the
        # connection is closed explicitly once the rows are loaded.
        conn = sqlite3.connect(db_name)
        try:
            df = pd.read_sql_query("SELECT * FROM sentiment_scores", conn)
        finally:
            conn.close()

        if df.empty:
            logging.warning("No data available for visualization")
            return

        # Convert dates and create hour column
        df['date'] = pd.to_datetime(df['date'])
        df['hour'] = pd.to_datetime(df['time']).dt.hour

        # Create figure with subplots
        fig = make_subplots(
            rows=3, cols=3,
            subplot_titles=(
                'Daily Entry Counts',
                'Hourly Distribution',
                'Sentiment Timeline',
                'Summary Length Distribution',
                'Sentiment Distribution',
                'Weekly Patterns',
                'Sentiment Moving Average',
                'Headline Length vs Sentiment',
                'Time of Day Sentiment'
            ),
            specs=[[{'type': 'scatter'}, {'type': 'bar'}, {'type': 'scatter'}],
                   [{'type': 'histogram'}, {'type': 'histogram'}, {'type': 'heatmap'}],
                   [{'type': 'scatter'}, {'type': 'scatter'}, {'type': 'scatter'}]],
            horizontal_spacing=0.05,
            vertical_spacing=0.1
        )

        # Add all visualizations, left to right and top to bottom
        panel_builders = (
            DataVisualizer._add_daily_counts,
            DataVisualizer._add_hourly_distribution,
            DataVisualizer._add_sentiment_timeline,
            DataVisualizer._add_summary_distribution,
            DataVisualizer._add_sentiment_distribution,
            DataVisualizer._add_weekly_patterns,
            DataVisualizer._add_sentiment_moving_average,
            DataVisualizer._add_headline_length_vs_sentiment,
            DataVisualizer._add_time_of_day_sentiment,
        )
        for add_panel in panel_builders:
            add_panel(fig, df)

        # Update layout
        fig.update_layout(
            height=1200,
            width=1600,
            showlegend=True,
            title_text="Enhanced News Analysis Dashboard",
            template="plotly_white"
        )

        # Show plot
        fig.show()

def _format_headline_stats(stats: dict) -> str:
    """Render the one-line statistics banner shown above the tables.

    Missing entries, and an average of ``None`` (no rows stored yet), are
    shown as ``N/A`` instead of failing the float formatting.
    """
    average = stats.get('average_sentiment')
    average_text = f"{average:.2f}" if average is not None else 'N/A'
    return (f"Total Headlines: {stats.get('total_headlines', 'N/A')} | "
            f"Average Sentiment: {average_text} | "
            f"Date Range: {stats.get('date_range', 'N/A')}")


def _headline_table(frame, header_color, cell_color, header_font=None, cell_font=None):
    """Build a date/time/headline/score table trace from ``frame``.

    An empty frame, including the column-less one the database layer returns
    after a failed query, yields a table with headers only.
    """
    if frame.empty:
        cell_values = [[], [], [], []]
    else:
        cell_values = [frame['date'],
                       frame['time'],
                       frame['title'],
                       frame['score'].round(2)]

    header = dict(
        values=['Date', 'Time', 'Headline', 'Sentiment Score'],
        fill_color=header_color,
        align='left'
    )
    cells = dict(values=cell_values, fill_color=cell_color, align='left')
    if header_font is not None:
        header['font'] = header_font
    if cell_font is not None:
        cells['font'] = cell_font
    return go.Table(header=header, cells=cells)


def analyze_headlines():
    """Analyze and display recent headlines with their sentiment scores.

    Shows one figure with three tables (recent, most positive and most
    negative headlines) topped by a statistics banner.

    Raises:
        Exception: Any failure is logged and re-raised.
    """
    # Bound before the try block so the finally clause can never hit an
    # unbound name and mask the real error when opening the database fails.
    db_manager = None
    try:
        db_manager = DatabaseManager(DATABASE_NAME)

        # Get headline statistics
        stats = db_manager.get_headline_stats()

        # Get recent headlines
        recent_headlines = db_manager.get_headlines_with_scores(limit=10)
        most_positive, most_negative = db_manager.get_extreme_sentiment_headlines()

        # Create a figure for headlines analysis
        fig = make_subplots(
            rows=3, cols=1,
            subplot_titles=(
                'Recent Headlines with Sentiment Scores',
                'Most Positive Headlines',
                'Most Negative Headlines'
            ),
            specs=[[{"type": "table"}],
                   [{"type": "table"}],
                   [{"type": "table"}]],
            vertical_spacing=0.1
        )

        # Add statistics as annotations
        fig.add_annotation(
            text=_format_headline_stats(stats),
            xref="paper", yref="paper",
            x=0, y=1.1,
            showarrow=False,
            font=dict(size=12)
        )

        # Add recent headlines table
        fig.add_trace(
            _headline_table(
                recent_headlines, 'paleturquoise', 'lavender',
                header_font=dict(size=12), cell_font=dict(size=11)
            ),
            row=1, col=1
        )

        # Add most positive headlines
        fig.add_trace(
            _headline_table(most_positive, 'lightgreen', 'honeydew'),
            row=2, col=1
        )

        # Add most negative headlines
        fig.add_trace(
            _headline_table(most_negative, 'lightcoral', 'mistyrose'),
            row=3, col=1
        )

        # Update layout
        fig.update_layout(
            height=1000,
            title_text="Headlines Analysis Dashboard",
            showlegend=False
        )

        # Show plot
        fig.show()

    except Exception as e:
        logging.error(f"Error analyzing headlines: {e}")
        raise
    finally:
        if db_manager is not None:
            db_manager.close()

def run_analysis():
    """Run the full reporting pipeline: main dashboard, then headline tables.

    Progress is reported through the logging module. Any exception raised by
    a stage is logged and then propagated to the caller.
    """
    try:
        logging.info("Starting analysis...")

        # Each stage is announced in the log immediately before it runs.
        stages = (
            ("Generating main dashboard...",
             lambda: DataVisualizer.create_visualizations(DATABASE_NAME)),
            ("Analyzing headlines...",
             lambda: analyze_headlines()),
        )
        for announcement, run_stage in stages:
            logging.info(announcement)
            run_stage()

        logging.info("Analysis complete!")
    except Exception as exc:
        logging.error(f"Error during analysis: {exc}")
        raise


if __name__ == "__main__":
    run_analysis()