<a href="https://colab.research.google.com/github/abhy-kumar/NLPulse/blob/main/NLPulse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install feedparser TextBlob
!pip install nltk transformers torch



In [14]:
import sqlite3
from difflib import SequenceMatcher
import pandas as pd
from typing import List, Tuple
import logging

def initialize_database(db_path: str = '/content/news_sentiment.db') -> None:
    """Create the database and required tables if they don't exist."""
    try:
        with sqlite3.connect(db_path) as conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS sentiment_scores (
                    date TEXT,
                    time TEXT,
                    title TEXT,
                    summary TEXT,
                    score REAL
                )
            ''')
            # Add indexes for better query performance
            conn.execute('CREATE INDEX IF NOT EXISTS idx_date ON sentiment_scores(date)')
            conn.execute('CREATE INDEX IF NOT EXISTS idx_title ON sentiment_scores(title)')
            conn.execute('CREATE INDEX IF NOT EXISTS idx_score ON sentiment_scores(score)')

        logging.info(f"Database initialized successfully at {db_path}")
    except Exception as e:
        logging.error(f"Error initializing database: {e}")
        raise

def calculate_similarity(text1: str, text2: str) -> float:
    """Calculate similarity ratio between two texts using SequenceMatcher."""
    return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()

def find_similar_entries(conn: sqlite3.Connection, similarity_threshold: float = 0.85) -> List[Tuple[int, int, float]]:
    """Find pairs of similar entries in the database."""
    # Get all entries
    df = pd.read_sql_query("SELECT rowid, title, summary FROM sentiment_scores", conn)
    similar_pairs = []

    # Compare each entry with others
    for i in range(len(df)):
        for j in range(i + 1, len(df)):
            # Check title similarity
            title_similarity = calculate_similarity(df.iloc[i]['title'], df.iloc[j]['title'])

            # If titles are similar, check summary similarity
            if title_similarity > similarity_threshold:
                summary_similarity = calculate_similarity(
                    df.iloc[i]['summary'] or '',
                    df.iloc[j]['summary'] or ''
                )

                # If both title and summary are similar, add to pairs
                if summary_similarity > similarity_threshold:
                    similar_pairs.append((
                        df.iloc[i]['rowid'],
                        df.iloc[j]['rowid'],
                        (title_similarity + summary_similarity) / 2
                    ))

    return similar_pairs

def remove_duplicates(db_path: str = '/content/news_sentiment.db', similarity_threshold: float = 0.85) -> None:
    """Remove duplicate or very similar entries from the database."""
    try:
        with sqlite3.connect(db_path) as conn:
            # Find similar pairs
            similar_pairs = find_similar_entries(conn, similarity_threshold)

            if not similar_pairs:
                print("No duplicate or similar entries found.")
                return

            # Get entries to remove (keep the earlier entry in each pair)
            entries_to_remove = set()
            for _, later_entry, similarity in similar_pairs:
                entries_to_remove.add(later_entry)

            # Remove duplicates
            cursor = conn.cursor()
            cursor.execute(
                "DELETE FROM sentiment_scores WHERE rowid IN ({})".format(
                    ','.join('?' * len(entries_to_remove))
                ),
                tuple(entries_to_remove)
            )

            print(f"Removed {len(entries_to_remove)} duplicate/similar entries.")
            print(f"Remaining entries: {cursor.execute('SELECT COUNT(*) FROM sentiment_scores').fetchone()[0]}")

            # Optimize database
            conn.execute("VACUUM")

    except Exception as e:
        logging.error(f"Error removing duplicates: {e}")
        raise

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize database and remove duplicates
def setup_and_clean_database(db_path: str = '/content/news_sentiment.db'):
    """Initialize database if it doesn't exist and clean up duplicates."""
    initialize_database(db_path)
    remove_duplicates(db_path)

if __name__ == "__main__":
    setup_and_clean_database()

ERROR:root:Error removing duplicates: cannot VACUUM from within a transaction


Removed 3 duplicate/similar entries.
Remaining entries: 755


OperationalError: cannot VACUUM from within a transaction

In [16]:
import nltk
import torch
import requests
import feedparser
import sqlite3
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline
)
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
import logging
from typing import List, Tuple, Set
from wordcloud import WordCloud
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
DATABASE_NAME = 'news_sentiment.db'
NEWS_FEED_URL = "https://timesofindia.indiatimes.com/rssfeedstopstories.cms"
DATE_FORMAT = "%Y-%m-%d"
DATETIME_FORMAT = "%a, %d %b %Y %H:%M:%S %z"

class DatabaseManager:
    def __init__(self, db_name: str):
        self.db_name = db_name
        self._setup_database()
        self.connection = None
        self._setup_connection()

    def _setup_database(self):
        with sqlite3.connect(self.db_name) as conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS sentiment_scores (
                    date TEXT,
                    time TEXT,
                    title TEXT,
                    summary TEXT,
                    score REAL
                )
            ''')
            # Add indexes for better query performance
            conn.execute('CREATE INDEX IF NOT EXISTS idx_date ON sentiment_scores(date)')
            conn.execute('CREATE INDEX IF NOT EXISTS idx_title ON sentiment_scores(title)')
            conn.execute('CREATE INDEX IF NOT EXISTS idx_score ON sentiment_scores(score)')

    def _setup_connection(self):
        """Setup persistent connection with optimized parameters"""
        if not self.connection:
            self.connection = sqlite3.connect(self.db_name, check_same_thread=False)
            self.connection.execute('PRAGMA journal_mode=WAL')  # Write-Ahead Logging
            self.connection.execute('PRAGMA synchronous=NORMAL')
            self.connection.execute('PRAGMA cache_size=-2000')  # 2MB cache
            self.connection.execute('PRAGMA temp_store=MEMORY')

    def store_score(self, date: str, time: str, title: str, summary: str, score: float):
        try:
            self.connection.execute(
                "INSERT INTO sentiment_scores VALUES (?, ?, ?, ?, ?)",
                (date, time, title, summary, score)
            )
            self.connection.commit()
        except Exception as e:
            logging.error(f"Error storing score: {e}")

    @lru_cache(maxsize=128)
    def get_daily_average(self, date: str) -> float:
        try:
            result = self.connection.execute(
                "SELECT AVG(score) FROM sentiment_scores WHERE date = ?",
                (date,)
            ).fetchone()
            return result[0] if result[0] is not None else 0.0
        except Exception as e:
            logging.error(f"Error getting daily average: {e}")
            return 0.0

    def get_headlines_with_scores(self, limit: int = 10) -> pd.DataFrame:
        """Retrieve recent headlines with their sentiment scores"""
        try:
            query = """
                SELECT date, time, title, score
                FROM sentiment_scores
                ORDER BY date DESC, time DESC
                LIMIT ?
            """
            return pd.read_sql_query(query, self.connection, params=(limit,))
        except Exception as e:
            logging.error(f"Error retrieving headlines: {e}")
            return pd.DataFrame()

    def get_extreme_sentiment_headlines(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Get headlines with highest and lowest sentiment scores"""
        try:
            most_positive = pd.read_sql_query("""
                SELECT date, time, title, score
                FROM sentiment_scores
                ORDER BY score DESC
                LIMIT 5
            """, self.connection)

            most_negative = pd.read_sql_query("""
                SELECT date, time, title, score
                FROM sentiment_scores
                ORDER BY score ASC
                LIMIT 5
            """, self.connection)

            return most_positive, most_negative
        except Exception as e:
            logging.error(f"Error retrieving extreme headlines: {e}")
            return pd.DataFrame(), pd.DataFrame()

    def get_headline_stats(self) -> dict:
        """Get statistical information about headlines"""
        try:
            stats = {}

            # Get average sentiment score
            avg_query = "SELECT AVG(score) FROM sentiment_scores"
            stats['average_sentiment'] = self.connection.execute(avg_query).fetchone()[0]

            # Get total number of headlines
            count_query = "SELECT COUNT(*) FROM sentiment_scores"
            stats['total_headlines'] = self.connection.execute(count_query).fetchone()[0]

            # Get date range
            range_query = """
                SELECT
                    MIN(date) as earliest_date,
                    MAX(date) as latest_date
                FROM sentiment_scores
            """
            earliest, latest = self.connection.execute(range_query).fetchone()
            stats['date_range'] = f"{earliest} to {latest}"

            return stats
        except Exception as e:
            logging.error(f"Error getting headline stats: {e}")
            return {}

    def close(self):
        if self.connection:
            self.connection.close()

class SentimentAnalyzer:
    def __init__(self):
        self._initialize_models()
        self._setup_device()

    def _initialize_models(self):
        # Download NLTK data only if not already present
        for resource in ['vader_lexicon', 'punkt', 'stopwords']:
            try:
                nltk.data.find(f'tokenizers/{resource}')
            except LookupError:
                nltk.download(resource, quiet=True)

        self.sia = SentimentIntensityAnalyzer()

        # Initialize models with better memory management
        self.finbert_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
        self.finbert_model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
        self.roberta_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
        self.roberta_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

    def _setup_device(self):
        """Setup device and move models to it"""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.finbert_model.to(self.device)
        self.roberta_model.to(self.device)

    @lru_cache(maxsize=1024)
    def analyze_sentiment(self, text: str) -> float:
        if not text or len(text.strip()) < 10:
            return 5.0

        try:
            # Run sentiment analysis in parallel
            with ThreadPoolExecutor(max_workers=3) as executor:
                vader_future = executor.submit(self._get_vader_score, text)
                finbert_future = executor.submit(self._get_finbert_score, text)
                roberta_future = executor.submit(self._get_roberta_score, text)

                vader_score, vader_confidence = vader_future.result()
                finbert_score, finbert_confidence = finbert_future.result()
                roberta_score, roberta_confidence = roberta_future.result()

            # Calculate weighted average
            weights = np.array([vader_confidence, finbert_confidence, roberta_confidence])
            weights = weights / weights.sum()  # Normalize weights
            scores = np.array([vader_score, finbert_score, roberta_score])

            combined_score = np.dot(weights, scores)
            return max(0, min(combined_score, 10))

        except Exception as e:
            logging.error(f"Error in sentiment analysis: {e}")
            return 5.0

    def _get_vader_score(self, text: str) -> Tuple[float, float]:
        scores = self.sia.polarity_scores(text)
        return (scores['compound'] + 1) * 5, abs(scores['compound'])

    def _get_finbert_score(self, text: str) -> Tuple[float, float]:
        with torch.no_grad():
            inputs = self.finbert_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)
            outputs = self.finbert_model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=1)
            sentiment = torch.argmax(probs).item()
            return sentiment * 5, torch.max(probs).item()

    def _get_roberta_score(self, text: str) -> Tuple[float, float]:
        with torch.no_grad():
            inputs = self.roberta_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(self.device)
            outputs = self.roberta_model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=1)
            sentiment = torch.argmax(probs).item()
            return sentiment * 5, torch.max(probs).item()


class DataVisualizer:
    @staticmethod
    def create_visualizations(db_name: str):
        """Create enhanced visualizations with proper layout"""
        with sqlite3.connect(db_name) as conn:
            df = pd.read_sql_query("SELECT * FROM sentiment_scores", conn)

        if df.empty:
            logging.warning("No data available for visualization")
            return

        # Convert dates and create hour column
        df['date'] = pd.to_datetime(df['date'])
        df['hour'] = pd.to_datetime(df['time']).dt.hour

        # Create figure with subplots
        fig = make_subplots(
            rows=3, cols=3,
            subplot_titles=(
                'Daily Entry Counts',
                'Hourly Distribution',
                'Sentiment Timeline',
                'Summary Length Distribution',
                'Sentiment Distribution',
                'Weekly Patterns',
                'Sentiment Moving Average',
                'Headline Length vs Sentiment',
                'Time of Day Sentiment'
            ),
            specs=[
                [{'type': 'scatter'}, {'type': 'bar'}, {'type': 'scatter'}],
                [{'type': 'histogram'}, {'type': 'histogram'}, {'type': 'heatmap'}],
                [{'type': 'scatter'}, {'type': 'scatter'}, {'type': 'scatter'}]
            ],
            horizontal_spacing=0.12,  # Increased spacing between columns
            vertical_spacing=0.15     # Increased spacing between rows
        )

        # Daily Entry Counts
        daily_counts = df.groupby('date').size()
        fig.add_trace(
            go.Scatter(
                x=daily_counts.index,
                y=daily_counts.values,
                mode='lines+markers',
                name='Daily Entries',
                line=dict(width=2, color='royalblue'),
                marker=dict(size=6)
            ),
            row=1, col=1
        )

        # Hourly Distribution
        hourly_counts = df['hour'].value_counts().sort_index()
        fig.add_trace(
            go.Bar(
                x=hourly_counts.index,
                y=hourly_counts.values,
                name='Hourly Distribution',
                marker_color='lightblue'
            ),
            row=1, col=2
        )

        # Sentiment Timeline
        daily_sentiment = df.groupby('date')['score'].mean()
        fig.add_trace(
            go.Scatter(
                x=daily_sentiment.index,
                y=daily_sentiment.values,
                mode='lines',
                name='Daily Sentiment',
                line=dict(color='green', width=2)
            ),
            row=1, col=3
        )

        # Summary Length Distribution
        fig.add_trace(
            go.Histogram(
                x=df['summary'].str.len(),
                name='Summary Lengths',
                nbinsx=30,
                marker_color='lightgreen'
            ),
            row=2, col=1
        )

        # Sentiment Distribution
        fig.add_trace(
            go.Histogram(
                x=df['score'],
                name='Sentiment Distribution',
                nbinsx=20,
                histnorm='probability',
                marker_color='coral'
            ),
            row=2, col=2
        )

        # Weekly Patterns
        df['weekday'] = pd.to_datetime(df['date']).dt.day_name()
        weekly_sentiment = df.pivot_table(
            values='score',
            index='weekday',
            columns='hour',
            aggfunc='mean'
        )

        # Ensure consistent weekday order
        weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        weekly_sentiment = weekly_sentiment.reindex(weekday_order)

        fig.add_trace(
            go.Heatmap(
                z=weekly_sentiment.values,
                x=weekly_sentiment.columns,
                y=weekly_sentiment.index,
                colorscale='RdYlBu',
                name='Weekly Patterns',
                colorbar=dict(
                    title="Sentiment",
                    thickness=10,
                    len=0.3,
                    yanchor="middle",
                    y=0.5,
                    xanchor="left",
                    x=1.02
                )
            ),
            row=2, col=3
        )

        # Sentiment Moving Average
        daily_sentiment = df.groupby('date')['score'].mean().reset_index()
        daily_sentiment['MA7'] = daily_sentiment['score'].rolling(window=7).mean()
        fig.add_trace(
            go.Scatter(
                x=daily_sentiment['date'],
                y=daily_sentiment['MA7'],
                mode='lines',
                name='7-Day Moving Average',
                line=dict(color='purple', width=2)
            ),
            row=3, col=1
        )

        # Headline Length vs Sentiment
        df['title_length'] = df['title'].str.len()
        fig.add_trace(
            go.Scatter(
                x=df['title_length'],
                y=df['score'],
                mode='markers',
                name='Length vs Sentiment',
                marker=dict(
                    size=6,
                    color=df['score'],
                    colorscale='Viridis',
                    showscale=False  # Removed redundant colorbar
                )
            ),
            row=3, col=2
        )

        # Time of Day Sentiment
        hourly_sentiment = df.groupby('hour')['score'].mean()
        fig.add_trace(
            go.Scatter(
                x=hourly_sentiment.index,
                y=hourly_sentiment.values,
                mode='lines+markers',
                name='Hourly Sentiment',
                line=dict(shape='spline', color='orangered', width=2)
            ),
            row=3, col=3
        )

        # Update layout and axes
        fig.update_layout(
            height=1200,
            width=1600,
            showlegend=True,
            legend=dict(
                orientation="h",
                yanchor="bottom",
                y=-0.2,
                xanchor="center",
                x=0.5
            ),
            title_text="Enhanced News Analysis Dashboard",
            template="plotly_white"
        )

        # Update all x and y axis labels
        axis_labels = {
            (1,1): ('Date', 'Number of Articles'),
            (1,2): ('Hour of Day', 'Number of Articles'),
            (1,3): ('Date', 'Average Sentiment Score'),
            (2,1): ('Summary Length (characters)', 'Frequency'),
            (2,2): ('Sentiment Score', 'Probability'),
            (2,3): ('Hour of Day', 'Day of Week'),
            (3,1): ('Date', '7-Day Moving Average'),
            (3,2): ('Headline Length (characters)', 'Sentiment Score'),
            (3,3): ('Hour of Day', 'Average Sentiment Score')
        }

        for (row, col), (xlabel, ylabel) in axis_labels.items():
            fig.update_xaxes(title_text=xlabel, row=row, col=col)
            fig.update_yaxes(title_text=ylabel, row=row, col=col)

        # Show plot
        fig.show()
class NewsFetcher:
    def __init__(self, db_manager, sentiment_analyzer):
        self.db_manager = db_manager
        self.sentiment_analyzer = sentiment_analyzer
        self.processed_titles = set()  # Track processed articles to avoid duplicates

    def fetch_and_process_news(self):
        """Fetch news from RSS feed and process them"""
        try:
            logging.info("Fetching news from RSS feed...")
            feed = feedparser.parse(NEWS_FEED_URL)

            for entry in feed.entries:
                # Skip if already processed
                if entry.title in self.processed_titles:
                    continue

                try:
                    # Parse date and time
                    published = datetime.strptime(entry.published, DATETIME_FORMAT)
                    date = published.strftime(DATE_FORMAT)
                    time = published.strftime("%H:%M:%S")

                    # Analyze sentiment
                    combined_text = f"{entry.title} {entry.summary}"
                    sentiment_score = self.sentiment_analyzer.analyze_sentiment(combined_text)

                    # Store in database
                    self.db_manager.store_score(
                        date=date,
                        time=time,
                        title=entry.title,
                        summary=entry.summary,
                        score=sentiment_score
                    )

                    # Mark as processed
                    self.processed_titles.add(entry.title)

                    logging.info(f"Processed article: {entry.title[:50]}...")

                except Exception as e:
                    logging.error(f"Error processing entry: {e}")
                    continue

        except Exception as e:
            logging.error(f"Error fetching news: {e}")
            raise

# Modified run_analysis function
def run_analysis():
    try:
        logging.info("Starting analysis...")

        # Initialize components
        db_manager = DatabaseManager(DATABASE_NAME)
        sentiment_analyzer = SentimentAnalyzer()
        news_fetcher = NewsFetcher(db_manager, sentiment_analyzer)

        # Fetch and process new articles
        logging.info("Fetching and processing news...")
        news_fetcher.fetch_and_process_news()

        # Generate visualizations
        logging.info("Generating main dashboard...")
        DataVisualizer.create_visualizations(DATABASE_NAME)

        # Analyze headlines
        logging.info("Analyzing headlines...")
        analyze_headlines()

        logging.info("Analysis complete!")

    except Exception as e:
        logging.error(f"Error during analysis: {e}")
        raise
    finally:
        if 'db_manager' in locals():
            db_manager.close()
def analyze_headlines():
    """Analyze and display recent headlines with their sentiment scores"""
    try:
        db_manager = DatabaseManager(DATABASE_NAME)

        # Get headline statistics
        stats = db_manager.get_headline_stats()

        # Get recent headlines
        recent_headlines = db_manager.get_headlines_with_scores(limit=10)
        most_positive, most_negative = db_manager.get_extreme_sentiment_headlines()

        # Create a figure for headlines analysis
        fig = make_subplots(
            rows=3, cols=1,
            subplot_titles=(
                'Recent Headlines with Sentiment Scores',
                'Most Positive Headlines',
                'Most Negative Headlines'
            ),
            specs=[[{"type": "table"}],
                  [{"type": "table"}],
                  [{"type": "table"}]],
            vertical_spacing=0.1
        )

        # Add statistics as annotations
        fig.add_annotation(
            text=(f"Total Headlines: {stats.get('total_headlines', 'N/A')} | "
                  f"Average Sentiment: {stats.get('average_sentiment', 0):.2f} | "
                  f"Date Range: {stats.get('date_range', 'N/A')}"),
            xref="paper", yref="paper",
            x=0, y=1.1,
            showarrow=False,
            font=dict(size=12)
        )

        # Add recent headlines table
        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Date', 'Time', 'Headline', 'Sentiment Score'],
                    fill_color='paleturquoise',
                    align='left',
                    font=dict(size=12)
                ),
                cells=dict(
                    values=[recent_headlines['date'],
                           recent_headlines['time'],
                           recent_headlines['title'],
                           recent_headlines['score'].round(2)],
                    fill_color='lavender',
                    align='left',
                    font=dict(size=11)
                )
            ),
            row=1, col=1
        )

        # Add most positive headlines
        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Date', 'Time', 'Headline', 'Sentiment Score'],
                    fill_color='lightgreen',
                    align='left'
                ),
                cells=dict(
                    values=[most_positive['date'],
                           most_positive['time'],
                           most_positive['title'],
                           most_positive['score'].round(2)],
                    fill_color='honeydew',
                    align='left'
                )
            ),
            row=2, col=1
        )

        # Add most negative headlines
        fig.add_trace(
            go.Table(
                header=dict(
                    values=['Date', 'Time', 'Headline', 'Sentiment Score'],
                    fill_color='lightcoral',
                    align='left'
                ),
                cells=dict(
                    values=[most_negative['date'],
                           most_negative['time'],
                           most_negative['title'],
                           most_negative['score'].round(2)],
                    fill_color='mistyrose',
                    align='left'
                )
            ),
            row=3, col=1
        )

        # Update layout
        fig.update_layout(
            height=1000,
            title_text="Headlines Analysis Dashboard",
            showlegend=False
        )

        # Show plot
        fig.show()

    except Exception as e:
        logging.error(f"Error analyzing headlines: {e}")
        raise
    finally:
        db_manager.close()


if __name__ == "__main__":
    run_analysis()