In [None]:
# -*- coding: utf-8 -*-
"""
Professional Twitter Sentiment Analysis System
============================================

This notebook provides a high-accuracy sentiment analysis of Twitter profiles and hashtags
using state-of-the-art models and optimized processing techniques.

Features:
- Memory-efficient data processing with batch operations
- Ensemble of pre-trained transformer models for high accuracy
- Optimized classical ML models with feature engineering
- Fast execution with GPU acceleration
- Comprehensive visualizations
- Professional error handling

Author: Advanced AI Coder
"""

# @title 1️⃣ Install Required Libraries
# -----------------------------
# Install all necessary libraries for the project
# -----------------------------
!pip install -q transformers torch scikit-learn matplotlib seaborn wordcloud pandas numpy
!pip install -q emoji langdetect tweet-preprocessor
!pip install -q snscrape tweepy
!pip install -q vaderSentiment textblob
!pip install -q kaggle

# Import libraries
import os
import re
import time
import emoji
import logging
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    pipeline, TextClassificationPipeline
)
import torch
from torch.utils.data import Dataset
import preprocessor as tweet_preprocessor
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import warnings
import joblib
import gc
import ipywidgets as widgets
from IPython.display import display, clear_output

# Create this module's logger and let INFO-and-above records through
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Seed NumPy and PyTorch with the same value so repeated runs match
for seed_fn in (np.random.seed, torch.manual_seed):
    seed_fn(42)

# Prefer the GPU when CUDA is usable, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# @title 2️⃣ Define All Helper Functions
# -----------------------------
# Define all helper functions upfront to avoid any NameError issues
# -----------------------------
def preprocess_text(text):
    """Normalise a raw tweet into lowercase, letters-only text.

    Args:
        text: The raw tweet. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` for non-string input. The text is
        lowercased, passed through ``tweet_preprocessor.clean`` (which, per
        the original author's note, removes URLs, mentions, hashtags, etc.),
        stripped of every character that is not an ASCII letter or
        whitespace, and has runs of whitespace collapsed to single spaces.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    without_twitter_markup = tweet_preprocessor.clean(lowered)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_twitter_markup)
    return re.sub(r'\s+', ' ', letters_only).strip()

def _get_vader_analyzer():
    """Return one shared VADER analyzer, creating it on first use."""
    analyzer = getattr(_get_vader_analyzer, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        _get_vader_analyzer._analyzer = analyzer
    return analyzer


def extract_features(text):
    """Compute lexicon-based sentiment scores and simple length statistics.

    Args:
        text: The (already cleaned) text to describe.

    Returns:
        A dict with the keys ``vader_neg``, ``vader_neu``, ``vader_pos``,
        ``vader_compound``, ``textblob_polarity``, ``textblob_subjectivity``,
        ``char_count``, ``word_count`` and ``avg_word_length``, always in that
        order. Non-string or empty input yields neutral defaults
        (``vader_neu`` is 1.0, everything else is zero).
    """
    if not isinstance(text, str) or text == "":
        # Neutral defaults for missing text
        return {
            'vader_neg': 0.0,
            'vader_neu': 1.0,
            'vader_pos': 0.0,
            'vader_compound': 0.0,
            'textblob_polarity': 0.0,
            'textblob_subjectivity': 0.0,
            'char_count': 0,
            'word_count': 0,
            'avg_word_length': 0.0
        }

    # Reuse a single analyzer: this function runs once per tweet, and building
    # a fresh SentimentIntensityAnalyzer for every call repeats its set-up work.
    vader_scores = _get_vader_analyzer().polarity_scores(text)

    # TextBlob polarity / subjectivity
    blob_sentiment = TextBlob(text).sentiment

    # Split once and reuse the word list for both statistics
    words = text.split()
    word_count = len(words)
    avg_word_length = float(np.mean([len(word) for word in words])) if words else 0.0

    return {
        'vader_neg': vader_scores['neg'],
        'vader_neu': vader_scores['neu'],
        'vader_pos': vader_scores['pos'],
        'vader_compound': vader_scores['compound'],
        'textblob_polarity': blob_sentiment.polarity,
        'textblob_subjectivity': blob_sentiment.subjectivity,
        'char_count': len(text),
        'word_count': word_count,
        'avg_word_length': avg_word_length
    }

def sparse_to_dense_batches(sparse_matrix, batch_size=1000):
    """Densify a sparse matrix one row-slice at a time.

    Args:
        sparse_matrix: A 2-D sparse matrix that supports row slicing and
            ``.toarray()``.
        batch_size: Number of rows converted per step.

    Returns:
        A ``numpy`` array of zeros-initialised default dtype with the same
        shape as ``sparse_matrix``, filled with its values.
    """
    n_rows, n_cols = sparse_matrix.shape[0], sparse_matrix.shape[1]
    dense = np.zeros((n_rows, n_cols))

    start = 0
    while start < n_rows:
        stop = min(start + batch_size, n_rows)
        dense[start:stop] = sparse_matrix[start:stop].toarray()

        # Release the temporary slice before converting the next one
        gc.collect()
        start += batch_size

    return dense

# @title 3️⃣ Set Up Kaggle API
# -----------------------------
# Set up Kaggle API credentials
# -----------------------------
# SECURITY: credentials must never be committed with the notebook. They are
# read from the KAGGLE_USERNAME / KAGGLE_KEY environment variables instead.
# The key that used to be embedded here should be treated as leaked and
# regenerated in the Kaggle account settings.
import json

# Resolve the config directory under the current user's home instead of
# assuming the notebook runs as root
kaggle_dir = os.path.expanduser('~/.kaggle')
kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')
os.makedirs(kaggle_dir, exist_ok=True)

kaggle_username = os.environ.get('KAGGLE_USERNAME')
kaggle_key = os.environ.get('KAGGLE_KEY')

if kaggle_username and kaggle_key:
    # Write the credentials file and make it readable by the owner only
    with open(kaggle_json_path, 'w') as f:
        json.dump({"username": kaggle_username, "key": kaggle_key}, f)
    os.chmod(kaggle_json_path, 0o600)
elif not os.path.exists(kaggle_json_path):
    # No environment variables and no existing file: fail here with a clear
    # message rather than later inside the download step
    raise RuntimeError(
        "Kaggle credentials not found. Set the KAGGLE_USERNAME and KAGGLE_KEY "
        "environment variables or place a kaggle.json file in ~/.kaggle."
    )

print("‚úÖ Kaggle API credentials set up successfully")

# @title 4Ô∏è‚É£ Download and Prepare Dataset (Optimized)
# -----------------------------
# Download Sentiment140 dataset from Kaggle
# -----------------------------
# Create data directory
!mkdir -p data

# Download dataset
print("üîÑ Downloading Sentiment140 dataset from Kaggle...")
!kaggle datasets download -d kazanova/sentiment140 -p data --force

# Extract dataset
print("üîÑ Extracting dataset...")
import zipfile
with zipfile.ZipFile('data/sentiment140.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

print("‚úÖ Dataset downloaded and extracted successfully")

# @title 5️⃣ Load and Process Data in Batches (Optimized)
# -----------------------------
# Load and process data in batches to avoid memory issues
# -----------------------------
print("üîÑ Loading dataset in batches...")

# Number of CSV rows read per chunk
BATCH_SIZE = 25000

# Read the dataset lazily, one chunk at a time
cols = ['target', 'ids', 'date', 'flag', 'user', 'text']
chunks = pd.read_csv('data/training.1600000.processed.noemoticon.csv',
                    encoding='latin-1', header=None, names=cols,
                    chunksize=BATCH_SIZE)

processed_chunks = []
total_samples = 0
max_samples = 80000  # Upper bound on rows drawn from the file

# Rows drawn so far per class (counted before empty texts are dropped)
class_counts = {0: 0, 1: 0}
max_per_class = max_samples // 2  # Equal quota for each class

for i, chunk in enumerate(chunks):
    # Stop as soon as both class quotas are filled. total_samples alone is not
    # a reliable stop condition: it is counted after empty texts are removed,
    # so it can stay below max_samples forever and the loop would read the
    # rest of the file without keeping a single row.
    quotas_filled = all(count >= max_per_class for count in class_counts.values())
    if total_samples >= max_samples or quotas_filled:
        break

    print(f"Processing batch {i+1}...")

    # Map target to 0 and 1 (0 = negative, 1 = positive)
    chunk['target'] = chunk['target'].replace(4, 1)

    # Draw up to the remaining quota from each class in this chunk
    sampled_parts = []
    for label in (0, 1):
        candidates = chunk[chunk['target'] == label]
        needed = max_per_class - class_counts[label]
        if needed <= 0 or len(candidates) == 0:
            continue
        to_take = min(needed, len(candidates))
        sampled_parts.append(candidates.sample(n=to_take, random_state=42))
        class_counts[label] += to_take

    # Nothing useful in this chunk (its classes are already full)
    if not sampled_parts:
        continue

    chunk = pd.concat(sampled_parts, ignore_index=True)

    # Preprocess text data
    print(f"  Preprocessing text in batch {i+1}...")
    chunk['clean_text'] = chunk['text'].apply(preprocess_text)

    # Remove rows whose text became empty after cleaning
    chunk = chunk[chunk['clean_text'] != ''].reset_index(drop=True)

    processed_chunks.append(chunk)
    total_samples += len(chunk)

    # Drop the per-chunk temporaries before reading the next chunk
    del chunk, candidates, sampled_parts
    gc.collect()

    print(f"  Processed {len(processed_chunks[-1])} samples (total: {total_samples})")
    print(f"  Class distribution: {class_counts}")

# Combine all processed chunks
df = pd.concat(processed_chunks, ignore_index=True)
print(f"‚úÖ Dataset loaded and processed: {len(df)} samples")
print(f"‚úÖ Final class distribution: {df['target'].value_counts().to_dict()}")

# Clean up
del processed_chunks
gc.collect()

# @title 6️⃣ Feature Engineering (Optimized)
# -----------------------------
# Run extract_features over the cleaned texts, one slice of rows at a time
# -----------------------------
print("üîÑ Extracting features in batches...")
feature_batches = []
batch_size = 8000  # Rows handled per slice

total_batches = (len(df) - 1) // batch_size + 1
for batch_no, start in enumerate(range(0, len(df), batch_size), start=1):
    batch_texts = df['clean_text'].iloc[start:start + batch_size].values

    print(f"  Processing features batch {batch_no}/{total_batches}...")

    # One feature dict per text, collected into a frame for this slice
    batch_features = [extract_features(text) for text in batch_texts]
    feature_batches.append(pd.DataFrame(batch_features))

    # Free the slice-level temporaries
    del batch_texts, batch_features
    gc.collect()

# Stack the per-slice frames into one feature table
features_df = pd.concat(feature_batches, ignore_index=True)
print("‚úÖ Feature extraction complete")

# Clean up
del feature_batches
gc.collect()

# @title 7️⃣ Split Dataset (Optimized)
# -----------------------------
# Hold out 20% of the samples for validation
# -----------------------------
X_text, X_features, y = df['clean_text'], features_df, df['target']

# Stratify on the label so both partitions keep the same class ratio
(
    X_text_train, X_text_val,
    X_features_train, X_features_val,
    y_train, y_val,
) = train_test_split(
    X_text, X_features, y,
    test_size=0.2, random_state=42, stratify=y,
)

train_distribution = y_train.value_counts().to_dict()
val_distribution = y_val.value_counts().to_dict()

print(f"‚úÖ Training data: {len(X_text_train)} samples")
print(f"‚úÖ Training class distribution: {train_distribution}")
print(f"‚úÖ Validation data: {len(X_text_val)} samples")
print(f"‚úÖ Validation class distribution: {val_distribution}")

# The full frames are no longer referenced by name after the split
del df, features_df
gc.collect()

# @title 8️⃣ Train Classical ML Models (Optimized)
# -----------------------------
# Train classical ML models with memory optimization
# -----------------------------
# TF-IDF over unigrams and bigrams, capped at 2500 terms seen in >= 5 documents
tfidf = TfidfVectorizer(max_features=2500, ngram_range=(1, 2), min_df=5)

# Fit on the training texts only, then apply the same vocabulary to validation
print("üîÑ Fitting TF-IDF vectorizer...")
X_text_train_tfidf = tfidf.fit_transform(X_text_train)
X_text_val_tfidf = tfidf.transform(X_text_val)

# Densify in row slices so the TF-IDF block can be stacked with the
# hand-crafted features
print("üîÑ Converting TF-IDF matrices to dense...")
X_text_train_dense = sparse_to_dense_batches(X_text_train_tfidf, batch_size=2000)
X_text_val_dense = sparse_to_dense_batches(X_text_val_tfidf, batch_size=2000)

# Final design matrix: TF-IDF columns first, engineered features after.
# predict_with_classical builds its input in the same column order.
X_train_combined = np.hstack((X_text_train_dense, X_features_train.values))
X_val_combined = np.hstack((X_text_val_dense, X_features_val.values))

# Clean up
del X_text_train_tfidf, X_text_val_tfidf, X_text_train_dense, X_text_val_dense
gc.collect()

# Candidate models
models = {
    'Logistic Regression': LogisticRegression(max_iter=300, random_state=42, C=1.0),
    'Random Forest': RandomForestClassifier(n_estimators=30, random_state=42, max_depth=8, min_samples_split=5),
}

# Start below any reachable F1 so a fitted model is always selected; starting
# at 0 with a strict '>' would leave best_model as None (and pickle None) if
# every candidate scored exactly 0.
best_model = None
best_f1 = -1.0
best_model_name = ""

print("üîÑ Training classical ML models...")
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_combined, y_train)

    # Score on the held-out validation set
    y_pred = model.predict(X_val_combined)
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    print(f"{name} - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

    # Keep the candidate with the highest validation F1
    if f1 > best_f1:
        best_f1 = f1
        best_model = model
        best_model_name = name

print(f"‚úÖ Best classical model ({best_model_name}) with F1: {best_f1:.4f}")

# Persist the winner and the fitted vectorizer for predict_with_classical
os.makedirs('models', exist_ok=True)
joblib.dump(best_model, 'models/best_classical_model.pkl')
joblib.dump(tfidf, 'models/tfidf_vectorizer.pkl')

print("‚úÖ Best classical model and TF-IDF vectorizer saved")

# Clean up
del X_train_combined, X_val_combined, X_features_train, X_features_val
gc.collect()

# @title 9️⃣ Load Transformer Models (Optimized)
# -----------------------------
# Load the pre-trained transformer checkpoints used by the ensemble
# -----------------------------
# Checkpoints to load, in ensemble order
model_names = [
    'distilbert-base-uncased-finetuned-sst-2-english',
    'cardiffnlp/twitter-roberta-base-sentiment-latest',
]

# A checkpoint ends up in `pipelines` when the pipeline could be built,
# otherwise in `transformer_models` + `tokenizers`
transformer_models = {}
tokenizers = {}
pipelines = {}

print("üîÑ Loading pre-trained transformer models...")
last_position = len(model_names) - 1
for i, model_name in enumerate(model_names):
    print(f"Loading {model_name}...")

    try:
        # Preferred path: a ready-made sentiment pipeline on GPU 0 or the CPU
        pipelines[model_name] = pipeline(
            "sentiment-analysis",
            model=model_name,
            tokenizer=model_name,
            device=0 if device.type == 'cuda' else -1,
            framework="pt"
        )
        print(f"‚úÖ {model_name} pipeline loaded successfully")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not create pipeline for {model_name}: {str(e)}")

        # Fallback: load tokenizer and model directly and put the model in
        # inference mode on the selected device
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        model.to(device)
        model.eval()

        tokenizers[model_name] = tokenizer
        transformer_models[model_name] = model

        print(f"‚úÖ {model_name} loaded successfully (manual)")

    # Free memory between checkpoints (skipped after the last one)
    if i != last_position:
        gc.collect()
        torch.cuda.empty_cache()
# @title 🔟 Data Collection Functions (Optimized)
# -----------------------------
# Functions to collect tweets from Twitter
# -----------------------------
# Twitter API Bearer Token.
# SECURITY: the token is read from the TWITTER_BEARER_TOKEN environment
# variable instead of being stored in the notebook. The token that used to be
# embedded here should be treated as leaked and revoked. When the variable is
# unset the token is empty, the API request fails, and fetch_tweets falls back
# to generated sample data.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "")

def fetch_tweets_with_api(query, max_tweets=100, days_ago=7):
    """Fetch recent tweets for a hashtag or a user through the Twitter API.

    Args:
        query: A hashtag (must start with ``#``) or a username without ``@``.
        max_tweets: Maximum number of tweets to return. A single request is
            made, so at most 100 tweets come back.
        days_ago: How far back, in days, the search window starts.

    Returns:
        A DataFrame with the columns ``id``, ``date``, ``text``, ``likes``,
        ``retweets``, ``replies``, ``quotes`` and ``source`` (empty when the
        API returned no tweets), or ``None`` when the user does not exist or
        any error occurred. Errors are printed, not raised, so callers can
        fall back to another data source.
    """
    try:
        import tweepy

        # Set up Twitter API client
        client = tweepy.Client(BEARER_TOKEN)

        # Build the cut-off as a timezone-aware UTC timestamp. A naive local
        # now() would be shifted by the machine's UTC offset once serialised
        # as an API timestamp.
        # NOTE(review): the recent-search endpoint is documented to cover only
        # the last 7 days; larger days_ago values are expected to be rejected
        # for hashtag queries (the error is caught below) - confirm.
        start_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days_ago)

        # Keep the page size inside 10..100: the search endpoint documents 10
        # as its minimum, and 100 is the per-request maximum. Results are
        # trimmed back to max_tweets below.
        page_size = max(10, min(max_tweets, 100))

        if query.startswith('#'):
            # Hashtag search
            tweets = client.search_recent_tweets(
                query=query,
                max_results=page_size,
                tweet_fields=['created_at', 'public_metrics', 'author_id'],
                start_time=start_time
            )
        else:
            # User timeline: resolve the username to an id first
            user = client.get_user(username=query)
            if user.data is None:
                return None

            tweets = client.get_users_tweets(
                id=user.data.id,
                max_results=page_size,
                tweet_fields=['created_at', 'public_metrics'],
                start_time=start_time
            )

        # Flatten each tweet into one row; tweets.data is falsy when nothing matched
        found = (tweets.data or [])[:max(max_tweets, 0)]
        tweets_data = [
            {
                'id': tweet.id,
                'date': tweet.created_at,
                'text': tweet.text,
                'likes': tweet.public_metrics.get('like_count', 0),
                'retweets': tweet.public_metrics.get('retweet_count', 0),
                'replies': tweet.public_metrics.get('reply_count', 0),
                'quotes': tweet.public_metrics.get('quote_count', 0),
                'source': 'api'
            }
            for tweet in found
        ]

        return pd.DataFrame(tweets_data)

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller fall back
        print(f"‚ùå Error fetching tweets with API: {str(e)}")
        return None

def create_sample_tweets(query, max_tweets=100):
    """Generate synthetic tweets about ``query`` for demonstration purposes.

    Tweets alternate between a positive template (even positions) and a
    negative one (odd positions). Dates are drawn from the last 7 days and the
    engagement counters are random.

    Args:
        query: Text inserted into every template.
        max_tweets: Number of rows to generate.

    Returns:
        A DataFrame with the columns ``id``, ``date``, ``text``, ``likes``,
        ``retweets``, ``replies``, ``quotes``, ``source`` (always ``'sample'``)
        and ``actual_sentiment`` (1 for positive templates, 0 for negative).
    """
    print("üîÑ Creating sample tweets...")

    # Template pools keyed by the sentiment they express
    templates_by_sentiment = {
        1: [
            f"I love {query}!",
            f"{query} is amazing!",
            f"Just had a great experience with {query}!",
            f"{query} is the best!",
            f"I'm so happy with {query}!",
            f"{query} exceeded my expectations!",
            f"Highly recommend {query}!",
            f"{query} is fantastic!",
            f"I'm impressed with {query}!",
            f"{query} is wonderful!"
        ],
        0: [
            f"I hate {query}!",
            f"{query} is terrible!",
            f"Just had a bad experience with {query}!",
            f"{query} is the worst!",
            f"I'm so disappointed with {query}!",
            f"{query} did not meet my expectations!",
            f"Would not recommend {query}!",
            f"{query} is awful!",
            f"I'm unimpressed with {query}!",
            f"{query} is horrible!"
        ],
    }

    rows = []
    for position in range(max_tweets):
        # Even positions are positive, odd positions negative
        sentiment = 1 if position % 2 == 0 else 0
        template = np.random.choice(templates_by_sentiment[sentiment])

        # Random day offset within the last week
        date = datetime.datetime.now() - datetime.timedelta(days=np.random.randint(0, 7))

        row = {'id': position + 1, 'date': date, 'text': template}
        row['likes'] = np.random.randint(0, 1000)
        row['retweets'] = np.random.randint(0, 500)
        row['replies'] = np.random.randint(0, 100)
        row['quotes'] = np.random.randint(0, 50)
        row['source'] = 'sample'
        row['actual_sentiment'] = sentiment
        rows.append(row)

    return pd.DataFrame(rows)

def fetch_tweets(query, max_tweets=100, days_ago=7):
    """Return tweets for ``query``, falling back to generated sample data.

    The Twitter API is tried first. When it yields ``None`` or an empty
    result, ``create_sample_tweets`` supplies demonstration rows instead
    (``days_ago`` is not forwarded to it).
    """
    api_result = fetch_tweets_with_api(query, max_tweets, days_ago)
    api_succeeded = api_result is not None and len(api_result) > 0

    if not api_succeeded:
        # Last resort: synthetic tweets
        print("‚ö†Ô∏è Could not fetch tweets, creating sample data")
        return create_sample_tweets(query, max_tweets)

    print(f"‚úÖ Fetched {len(api_result)} tweets using Twitter API")
    return api_result

# @title 1️⃣1️⃣ Sentiment Analysis Functions (Optimized)
# -----------------------------
# Functions to analyze sentiment of tweets
# -----------------------------
def predict_with_transformer_pipeline(text, model_name):
    """Predict binary sentiment with a loaded transformer pipeline.

    The pipeline is asked for the score of every label so that models with a
    ``neutral`` class can be reduced to a positive/negative decision: the
    neutral score is ignored and the two polar scores are compared. Looking
    only at the top label would count every neutral tweet as positive.

    Args:
        text: The text to classify.
        model_name: Key into the module-level ``pipelines`` dict.

    Returns:
        ``(sentiment, confidence)`` where sentiment is 0 (negative) or 1
        (positive) and confidence is the winning polar score divided by the
        sum of both polar scores. For a two-label model this equals the top
        label's score. Ties go to positive; if neither polar label is present
        the result is ``(1, 0.0)``.
    """
    pipeline_obj = pipelines[model_name]

    # top_k=None requests scores for all labels rather than only the best one
    result = pipeline_obj(text, top_k=None)

    # A single input may come back as [dict, ...] or wrapped once more as
    # [[dict, ...]]; normalise to a flat list of {'label', 'score'} dicts
    while isinstance(result, list) and result and isinstance(result[0], list):
        result = result[0]
    if isinstance(result, dict):
        result = [result]

    negative_labels = ('negative', 'neg', 'label_0')
    neutral_labels = ('neutral', 'neu')

    negative_score = 0.0
    positive_score = 0.0
    for entry in result:
        label = entry['label'].lower()
        if label in negative_labels:
            negative_score += entry['score']
        elif label in neutral_labels:
            continue
        else:
            # As before, any other label is treated as positive
            positive_score += entry['score']

    polar_total = negative_score + positive_score
    if polar_total <= 0:
        return 1, 0.0

    if negative_score > positive_score:
        return 0, negative_score / polar_total
    return 1, positive_score / polar_total

def predict_with_transformer_manual(text, model_name):
    """Predict binary sentiment with a manually loaded transformer model.

    Class probabilities are mapped to label names through the model's
    ``config.id2label`` (falling back to ``label_<index>``) and reduced to a
    positive/negative decision. Returning the raw argmax index, as before,
    yields 1 for "neutral" and 2 for "positive" on a three-class model.

    Args:
        text: The text to classify.
        model_name: Key into the module-level ``transformer_models`` and
            ``tokenizers`` dicts.

    Returns:
        ``(sentiment, confidence)`` where sentiment is 0 (negative) or 1
        (positive) and confidence is the winning polar probability divided by
        the sum of both polar probabilities. Ties go to positive; if neither
        polar class exists the result is ``(1, 0.0)``.
    """
    model = transformer_models[model_name]
    tokenizer = tokenizers[model_name]

    # Tokenize (truncated to 512 tokens) and move the tensors to the model's device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)[0].tolist()

    id2label = getattr(getattr(model, 'config', None), 'id2label', None) or {}
    negative_labels = ('negative', 'neg', 'label_0')
    neutral_labels = ('neutral', 'neu')

    negative_score = 0.0
    positive_score = 0.0
    for index, probability in enumerate(probabilities):
        label = str(id2label.get(index, f'label_{index}')).lower()
        if label in negative_labels:
            negative_score += probability
        elif label in neutral_labels:
            continue
        else:
            # Same convention as the pipeline path: anything else is positive
            positive_score += probability

    polar_total = negative_score + positive_score
    if polar_total <= 0:
        return 1, 0.0

    if negative_score > positive_score:
        return 0, negative_score / polar_total
    return 1, positive_score / polar_total

def predict_with_transformer(text, model_name):
    """Dispatch to the pipeline predictor when one exists, else the manual one.

    Returns whatever the chosen predictor returns for ``(text, model_name)``.
    """
    use_pipeline = model_name in pipelines
    predictor = (
        predict_with_transformer_pipeline if use_pipeline
        else predict_with_transformer_manual
    )
    return predictor(text, model_name)

def _load_classical_artifacts():
    """Load the saved classical model and TF-IDF vectorizer once and reuse them."""
    cached = getattr(_load_classical_artifacts, "_cache", None)
    if cached is None:
        cached = (
            joblib.load('models/best_classical_model.pkl'),
            joblib.load('models/tfidf_vectorizer.pkl'),
        )
        _load_classical_artifacts._cache = cached
    return cached


def predict_with_classical(text):
    """Predict sentiment with the saved classical model.

    The model and vectorizer are read from ``models/`` on the first call only.
    Reloading both files for every tweet, as before, repeats the disk reads
    for each prediction. Re-run this cell after re-training to pick up newly
    saved artifacts.

    Args:
        text: The raw text to classify; it is cleaned with ``preprocess_text``
            here.

    Returns:
        ``(prediction, confidence)`` where prediction is the predicted class
        as an ``int`` and confidence is that class's probability as a
        ``float``.
    """
    model, tfidf = _load_classical_artifacts()

    # Same preparation as at training time: clean, then engineered features
    clean_text = preprocess_text(text)
    features = extract_features(clean_text)
    feature_row = np.array([list(features.values())], dtype=float)

    # TF-IDF columns first, engineered features after (training column order)
    text_tfidf = tfidf.transform([clean_text])
    combined_features = np.hstack((text_tfidf.toarray(), feature_row))

    prediction = model.predict(combined_features)[0]
    probabilities = model.predict_proba(combined_features)[0]

    # predict_proba columns follow model.classes_, so look the column up by
    # class rather than assuming the label equals its position
    classes = list(getattr(model, 'classes_', []))
    column = classes.index(prediction) if prediction in classes else int(prediction)
    confidence = probabilities[column]

    return int(prediction), float(confidence)

def analyze_sentiment_ensemble(text):
    """Combine the transformer and classical predictions into one decision.

    Each model's ``(sentiment, confidence)`` pair is converted into a
    probability of "positive": ``confidence`` when it voted positive,
    ``1 - confidence`` when it voted negative. These are averaged with 70% of
    the weight shared equally by the transformer models and 30% on the
    classical model, and the average is thresholded at 0.5.

    The earlier formula summed ``sentiment * confidence * weight``, so a
    negative vote always contributed 0 and agreeing positive models with
    moderate confidence (e.g. 0.7) could still fall below the 0.5 threshold.

    Args:
        text: The raw text to classify.

    Returns:
        ``(ensemble_sentiment, ensemble_confidence)`` where the sentiment is
        0 or 1 and the confidence is the averaged probability of the chosen
        class (always >= 0.5).
    """
    transformer_results = [
        predict_with_transformer(text, model_name) for model_name in model_names
    ]
    classical_sentiment, classical_confidence = predict_with_classical(text)

    if transformer_results:
        # Transformers share 70% equally; the classical model gets 30%
        transformer_weight = 0.7 / len(transformer_results)
        classical_weight = 0.3
    else:
        # No transformer loaded: rely on the classical model alone
        transformer_weight = 0.0
        classical_weight = 1.0

    def positive_probability(sentiment, confidence):
        # Any non-zero label counts as a positive vote
        return confidence if sentiment > 0 else 1.0 - confidence

    weighted_positive = sum(
        transformer_weight * positive_probability(sentiment, confidence)
        for sentiment, confidence in transformer_results
    )
    weighted_positive += classical_weight * positive_probability(
        classical_sentiment, classical_confidence
    )

    ensemble_sentiment = 1 if weighted_positive >= 0.5 else 0
    if ensemble_sentiment == 1:
        ensemble_confidence = weighted_positive
    else:
        ensemble_confidence = 1.0 - weighted_positive

    return ensemble_sentiment, float(ensemble_confidence)

# @title 1️⃣2️⃣ Visualization Functions (Optimized)
# -----------------------------
# Functions to visualize sentiment analysis results
# -----------------------------
class SentimentVisualizer:
    """Plots for a tweet DataFrame that carries sentiment predictions.

    Every method expects a ``sentiment`` column holding 0 (negative) or
    1 (positive). Labels and colours are always derived from the values that
    are actually present, so the charts stay correct when one class is
    missing or when the negative class is the larger one.
    """

    # Display name and colour for each numeric sentiment label
    _LABELS = {0: 'Negative', 1: 'Positive'}
    _COLORS = {0: '#E0245E', 1: '#1DA1F2'}

    def __init__(self):
        # Global plot style shared by all charts
        sns.set(style="whitegrid")
        plt.rcParams.update({'font.size': 12})

    def _present_labels(self, df, order=(0, 1)):
        """Return the sentiment labels from ``order`` that occur in ``df``."""
        counts = df['sentiment'].value_counts()
        return [label for label in order if counts.get(label, 0) > 0]

    def plot_sentiment_distribution(self, df, title):
        """Draw a pie chart of positive vs. negative tweet counts.

        ``value_counts`` orders by frequency, so wedge labels are built from
        the labels themselves rather than from a fixed list.
        """
        counts = df['sentiment'].value_counts()
        present = self._present_labels(df, order=(1, 0))
        if not present:
            print(f"No sentiment data to plot for: {title}")
            return

        plt.figure(figsize=(10, 6))
        plt.pie(
            [counts[label] for label in present],
            labels=[self._LABELS[label] for label in present],
            autopct='%1.1f%%',
            startangle=90,
            colors=[self._COLORS[label] for label in present],
            explode=[0.05] * len(present)
        )

        plt.title(title, fontsize=16, fontweight='bold')
        plt.axis('equal')
        plt.tight_layout()
        plt.show()

    def plot_sentiment_over_time(self, df, title):
        """Draw daily tweet counts per sentiment as a line chart.

        Works on a copy, so the caller's ``date`` column is left untouched.
        """
        # Reduce timestamps to calendar dates without mutating the input
        plot_df = df.assign(date=pd.to_datetime(df['date']).dt.date)
        sentiment_by_date = (
            plot_df.groupby(['date', 'sentiment']).size().unstack(fill_value=0)
        )
        if sentiment_by_date.empty:
            print(f"No sentiment data to plot for: {title}")
            return

        # Guarantee both columns, in a fixed order, with readable names
        sentiment_by_date = (
            sentiment_by_date
            .reindex(columns=[0, 1], fill_value=0)
            .rename(columns=self._LABELS)
        )

        # Plot onto one explicit axes so no empty extra figure is created
        fig, ax = plt.subplots(figsize=(12, 6))
        sentiment_by_date.plot(kind='line', marker='o', ax=ax)

        ax.set_title(title, fontsize=16, fontweight='bold')
        ax.set_xlabel('Date', fontsize=12)
        ax.set_ylabel('Number of Tweets', fontsize=12)
        ax.legend(title='Sentiment')
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

    def plot_engagement_by_sentiment(self, df, title):
        """Draw mean likes/retweets/replies/quotes per sentiment as grouped bars."""
        present = self._present_labels(df)
        if not present:
            print(f"No sentiment data to plot for: {title}")
            return

        # Mean of each engagement metric per sentiment
        engagement_by_sentiment = df.groupby('sentiment').agg({
            'likes': 'mean',
            'retweets': 'mean',
            'replies': 'mean',
            'quotes': 'mean'
        }).reset_index()

        # Use readable names as hue values so the legend labels itself
        engagement_by_sentiment['sentiment'] = (
            engagement_by_sentiment['sentiment'].map(self._LABELS)
        )

        # Long format: one row per (sentiment, metric)
        engagement_melted = pd.melt(
            engagement_by_sentiment,
            id_vars='sentiment',
            value_vars=['likes', 'retweets', 'replies', 'quotes'],
            var_name='metric',
            value_name='value'
        )

        fig, ax = plt.subplots(figsize=(12, 8))
        sns.barplot(
            x='metric',
            y='value',
            hue='sentiment',
            hue_order=[self._LABELS[label] for label in present],
            data=engagement_melted,
            palette={self._LABELS[label]: self._COLORS[label] for label in present},
            ax=ax
        )

        ax.set_title(title, fontsize=16, fontweight='bold')
        ax.set_xlabel('Engagement Metric', fontsize=12)
        ax.set_ylabel('Average Count', fontsize=12)
        ax.legend(title='Sentiment')

        # Add value labels on bars (zero-width helper patches are skipped)
        for p in ax.patches:
            if p.get_width() == 0:
                continue
            ax.annotate(
                f"{p.get_height():.1f}",
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 5),
                textcoords='offset points'
            )

        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

    def create_wordcloud(self, df, sentiment, title):
        """Draw a word cloud from the ``clean_text`` of one sentiment class.

        Nothing is drawn (a short message is printed instead) when the class
        has no text or no usable words.
        """
        texts = df.loc[df['sentiment'] == sentiment, 'clean_text']
        sentiment_text = ' '.join(texts.astype(str)).strip()
        if not sentiment_text:
            print(f"No text available for word cloud: {title}")
            return

        try:
            wordcloud = WordCloud(
                width=800,
                height=400,
                background_color='white',
                max_words=100,
                colormap='Blues' if sentiment == 1 else 'Reds'
            ).generate(sentiment_text)
        except ValueError:
            # Raised when no words remain to draw (e.g. only stopwords)
            print(f"No text available for word cloud: {title}")
            return

        plt.figure(figsize=(10, 6))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(title, fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()

    def plot_confidence_distribution(self, df, title):
        """Draw a histogram of prediction confidence, split by sentiment."""
        present = self._present_labels(df)
        if not present:
            print(f"No sentiment data to plot for: {title}")
            return

        # Readable hue values let seaborn build a correct legend on its own;
        # overriding it with a fixed label list can attach the wrong name to
        # each colour.
        plot_df = df.assign(Sentiment=df['sentiment'].map(self._LABELS))

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.histplot(
            data=plot_df,
            x='confidence',
            hue='Sentiment',
            hue_order=[self._LABELS[label] for label in present],
            palette={self._LABELS[label]: self._COLORS[label] for label in present},
            bins=20,
            alpha=0.7,
            kde=True,
            ax=ax
        )

        ax.set_title(title, fontsize=16, fontweight='bold')
        ax.set_xlabel('Confidence Score', fontsize=12)
        ax.set_ylabel('Count', fontsize=12)
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

# @title 1️⃣3️⃣ Main Analysis Function (Optimized)
# -----------------------------
# Main function to analyze sentiment of tweets
# -----------------------------
def analyze_sentiment(query, analysis_type, max_tweets=100, days_ago=7):
    """Fetch tweets for ``query``, classify them and report the results.

    Args:
        query: Username or ``#hashtag`` to analyse.
        analysis_type: Label shown in the report header (capitalised).
        max_tweets: Passed through to ``fetch_tweets``.
        days_ago: Passed through to ``fetch_tweets``.

    Returns:
        The tweet DataFrame with added ``clean_text``, ``sentiment`` and
        ``confidence`` columns, or ``None`` when no tweets were found.
    """
    # Report header
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"\nTwitter Sentiment Analysis: {query}")
    print(f"Analysis Type: {analysis_type.capitalize()}")
    print(f"Analysis Date: {timestamp}")
    print("-" * 50)

    # Collect the tweets
    print(f"üîÑ Fetching tweets for {query}...")
    tweets_df = fetch_tweets(query, max_tweets, days_ago)

    if len(tweets_df) == 0:
        print("‚ùå No tweets found. Please check the query and try again.")
        return

    print(f"‚úÖ Found {len(tweets_df)} tweets")

    # Cleaned text is used by the word clouds
    print("üîÑ Preprocessing text...")
    tweets_df['clean_text'] = tweets_df['text'].apply(preprocess_text)

    # Classify the raw text of every tweet with the ensemble
    print("üîÑ Analyzing sentiment...")
    predictions = [analyze_sentiment_ensemble(text) for text in tweets_df['text']]
    tweets_df['sentiment'] = [sentiment for sentiment, _ in predictions]
    tweets_df['confidence'] = [confidence for _, confidence in predictions]

    visualizer = SentimentVisualizer()

    # Summary statistics
    total = len(tweets_df)
    positive_rows = tweets_df[tweets_df['sentiment'] == 1]
    negative_rows = tweets_df[tweets_df['sentiment'] == 0]

    print("\nüìä Summary Statistics:")
    print(f"Total tweets: {total}")
    print(f"Positive tweets: {len(positive_rows)} ({len(positive_rows) / total * 100:.1f}%)")
    print(f"Negative tweets: {len(negative_rows)} ({len(negative_rows) / total * 100:.1f}%)")
    print(f"Average confidence: {tweets_df['confidence'].mean():.4f}")

    # Three most confident tweets per class
    print("\nüìù Sample Tweets:")
    positive_tweets = positive_rows.sort_values('confidence', ascending=False)
    negative_tweets = negative_rows.sort_values('confidence', ascending=False)

    print("\nMost Positive Tweets:")
    for rank, (_, row) in enumerate(positive_tweets.head(3).iterrows(), start=1):
        print(f"{rank}. {row['text']} (Confidence: {row['confidence']:.4f})")

    print("\nMost Negative Tweets:")
    for rank, (_, row) in enumerate(negative_tweets.head(3).iterrows(), start=1):
        print(f"{rank}. {row['text']} (Confidence: {row['confidence']:.4f})")

    # Charts, in report order
    print("\nüìà Creating visualizations...")
    visualizer.plot_sentiment_distribution(tweets_df, f"Sentiment Distribution for {query}")
    visualizer.plot_sentiment_over_time(tweets_df, f"Sentiment Over Time for {query}")
    visualizer.plot_engagement_by_sentiment(tweets_df, f"Engagement by Sentiment for {query}")
    visualizer.create_wordcloud(tweets_df, sentiment=1, title=f"Positive Words for {query}")
    visualizer.create_wordcloud(tweets_df, sentiment=0, title=f"Negative Words for {query}")
    visualizer.plot_confidence_distribution(tweets_df, f"Confidence Distribution for {query}")

    print("\n‚úÖ Analysis complete!")
    return tweets_df

# @title 1️⃣4️⃣ Interactive Analysis
# -----------------------------
# Build the input controls for the interactive analysis
# -----------------------------
# Profile vs. hashtag selector
analysis_type = widgets.RadioButtons(
    options=['Profile', 'Hashtag'],
    value='Profile',
    description='Analysis Type:',
    disabled=False,
    layout={'width': 'max-content'}
)

# Free-text username / hashtag
query_input = widgets.Text(
    value='elonmusk',
    placeholder='Enter Twitter username or hashtag',
    description='Input:',
    disabled=False,
    layout={'width': '500px'}
)

# Options shared by both integer sliders
_slider_options = dict(
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)

# How many tweets to request (10-500 in steps of 10)
max_tweets_slider = widgets.IntSlider(
    value=100, min=10, max=500, step=10,
    description='Max Tweets:',
    **_slider_options
)

# How many days back to look (1-30)
days_ago_slider = widgets.IntSlider(
    value=7, min=1, max=30, step=1,
    description='Days Ago:',
    **_slider_options
)

# Button that starts the analysis
analyze_button = widgets.Button(
    description='Analyze',
    disabled=False,
    button_style='success',
    tooltip='Click to analyze',
    icon='check',
    layout={'width': '100px'}
)

# Area that receives everything the analysis prints and plots
output = widgets.Output()

# Define button click event
def _normalize_query(raw_query, analysis_type_value):
    """Return the query in the form expected for the given analysis type.

    Surrounding whitespace and any leading '@' / '#' characters are removed,
    so '@name', '#name' and ' name ' all reduce to 'name'. For a hashtag
    analysis a single '#' is then put back in front.

    Args:
        raw_query: Text exactly as typed by the user.
        analysis_type_value: 'profile' for a profile analysis; any other
            value is treated as a hashtag analysis.

    Returns:
        The cleaned query, or an empty string when nothing usable remains.
    """
    cleaned = raw_query.strip().lstrip('@#').strip()
    if not cleaned:
        return ''
    if analysis_type_value == 'profile':
        return cleaned
    return '#' + cleaned


def on_analyze_button_clicked(b):
    """Handle a click on the Analyze button.

    Reads the current widget values, normalizes the query, and runs
    `analyze_sentiment` with everything it prints or plots captured in the
    `output` widget. If the input is empty after normalization, a short
    message is shown and no analysis is started.

    Args:
        b: The clicked button object supplied by the `on_click` callback;
            unused.
    """
    with output:
        # Drop whatever the previous run left in the output area.
        clear_output(wait=True)

        # Get input values
        analysis_type_value = analysis_type.value.lower()
        query = _normalize_query(query_input.value, analysis_type_value)
        max_tweets = max_tweets_slider.value
        days_ago = days_ago_slider.value

        # Guard against launching an analysis for '' or a bare '#'.
        if not query:
            print("Please enter a Twitter username or hashtag.")
            return

        # Run analysis
        analyze_sentiment(query, analysis_type_value, max_tweets, days_ago)

# Attach the click handler so pressing the button starts an analysis
analyze_button.on_click(on_analyze_button_clicked)

# Show the form controls in order, with the output area last
display(
    analysis_type,
    query_input,
    max_tweets_slider,
    days_ago_slider,
    analyze_button,
    output,
)

[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m981.5/981.5 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m608.4/608.4 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m74.8/74.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m126.0/126.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda
‚úÖ Kaggle API credentials set up successfully
üî

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


‚úÖ distilbert-base-uncased-finetuned-sst-2-english pipeline loaded successfully
Loading cardiffnlp/twitter-roberta-base-sentiment-latest...


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


‚úÖ cardiffnlp/twitter-roberta-base-sentiment-latest pipeline loaded successfully


RadioButtons(description='Analysis Type:', layout=Layout(width='max-content'), options=('Profile', 'Hashtag'),‚Ä¶

Text(value='elonmusk', description='Input:', layout=Layout(width='500px'), placeholder='Enter Twitter username‚Ä¶

IntSlider(value=100, continuous_update=False, description='Max Tweets:', max=500, min=10, step=10)

IntSlider(value=7, continuous_update=False, description='Days Ago:', max=30, min=1)

Button(button_style='success', description='Analyze', icon='check', layout=Layout(width='100px'), style=Button‚Ä¶

Output()