In [None]:
# SENTIMENT ANALYSIS ON X DATASET

import pandas as pd
import numpy as np
import re
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Download the VADER lexicon only when it is not already installed locally, so
# that Restart & Run All does not contact the NLTK server on every execution.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')


In [None]:
# Load dataset
file_path = "X data.csv"  # ensure file is in the same folder
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print("⚠️ File not found. Please check the file path or upload the dataset.")
    # Stop here: continuing without a freshly loaded `df` would either fail
    # with a confusing NameError below or silently reuse a stale `df` left
    # over in the kernel from an earlier run.
    raise

print(f"✅ Dataset loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")

# Quick look at the schema and the first few records
print("\nAvailable Columns:", list(df.columns))
print("\nFirst 5 rows:\n", df.head())


In [None]:
# Clean text
def clean_tweet(text):
    """Normalise a raw tweet into lowercase text made of letters and whitespace.

    Steps, in order: strip URLs, strip @mentions, drop the '#' symbol (the
    hashtag word itself is kept), remove every character that is not an ASCII
    letter or whitespace, then lowercase and trim both ends.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN) are treated as empty.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # Missing cells would otherwise be stringified into the words "none"/"nan"
    # and later be scored as if they were real tweet text.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = re.sub(r"http\S+", "", str(text))     # remove URLs
    text = re.sub(r"@\w+", "", text)             # remove mentions
    text = re.sub(r"#", "", text)                 # drop the '#' symbol, keep the tag word
    text = re.sub(r"[^A-Za-z\s]", "", text)      # remove digits, punctuation, non-ASCII
    return text.lower().strip()

# Note: the dataset already has a 'clean_text' column, so we use it directly.
# df['clean_text'] = df['text'].apply(clean_tweet)  # Kept for reference only: this line would raise a KeyError

# If you want to re-clean the text, uncomment below:
# df['clean_text'] = df['clean_text'].apply(clean_tweet)

In [None]:
# Handle missing values in clean_text
print(f"Missing values in clean_text before cleaning: {df['clean_text'].isna().sum()}")

# Option 1: Drop rows with missing text (recommended if you have enough data)
# .copy() makes the result an independent frame, so the sentiment columns
# assigned to it later do not trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['clean_text']).copy()

# Option 2: Alternative - Fill with empty string (if you want to keep all rows)
# df['clean_text'] = df['clean_text'].fillna('')

print(f"Missing values in clean_text after cleaning: {df['clean_text'].isna().sum()}")
print(f"Dataset shape after cleaning: {df.shape}")

# Sentiment analysis functions
# One shared analyzer instance, reused by get_vader_sentiment for every row.
vader_analyzer = SentimentIntensityAnalyzer()

def get_textblob_sentiment(text):
    """Label a text as 'Positive', 'Negative' or 'Neutral' from TextBlob polarity.

    Polarity above 0.1 is 'Positive', below -0.1 is 'Negative', and anything
    in between (bounds included) is 'Neutral'.

    Args:
        text: The text to score. Non-string or blank input is labelled
            'Neutral' without being scored.

    Returns:
        One of 'Positive', 'Negative' or 'Neutral'. Scoring is best effort:
        if it raises an ordinary exception the label falls back to 'Neutral'.
    """
    try:
        # Handle empty strings and ensure we have valid text
        if not isinstance(text, str) or text.strip() == '':
            return 'Neutral'

        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        if polarity > 0.1:
            return 'Positive'
        elif polarity < -0.1:
            return 'Negative'
        else:
            return 'Neutral'
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate and a long .apply() can be interrupted.
        return 'Neutral'

def get_vader_sentiment(text):
    """Label a text as 'Positive', 'Negative' or 'Neutral' from VADER's compound score.

    A compound score of 0.05 or more is 'Positive', -0.05 or less is
    'Negative', and anything strictly in between is 'Neutral'. Scoring uses
    the module-level ``vader_analyzer``.

    Args:
        text: The text to score. Non-string or blank input is labelled
            'Neutral' without being scored.

    Returns:
        One of 'Positive', 'Negative' or 'Neutral'. Scoring is best effort:
        if it raises an ordinary exception the label falls back to 'Neutral'.
    """
    try:
        # Handle empty strings and ensure we have valid text
        if not isinstance(text, str) or text.strip() == '':
            return 'Neutral'

        score = vader_analyzer.polarity_scores(text)['compound']
        if score >= 0.05:
            return 'Positive'
        elif score <= -0.05:
            return 'Negative'
        else:
            return 'Neutral'
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate and a long .apply() can be interrupted.
        return 'Neutral'

# Score every cleaned tweet with both models, writing one label column per model
for column_name, labeller in (
    ('textblob_sentiment', get_textblob_sentiment),
    ('vader_sentiment', get_vader_sentiment),
):
    df[column_name] = df['clean_text'].apply(labeller)

print("\n✅ Sentiment columns added successfully!")

In [None]:
# Ground truth and model validation
def convert_ground_truth(cat):
    """Translate a category code into a sentiment label.

    A value equal to 1 becomes 'Positive', a value equal to -1 becomes
    'Negative', and every other value falls through to 'Neutral'.
    """
    if cat == 1:
        return 'Positive'
    return 'Negative' if cat == -1 else 'Neutral'

# Turn the category codes into the same label vocabulary the models emit
df['ground_truth'] = df['category'].apply(convert_ground_truth)

# Accuracy is the share of rows whose predicted label equals the ground truth
truth_labels = df['ground_truth']
textblob_acc = df['textblob_sentiment'].eq(truth_labels).mean()
vader_acc = df['vader_sentiment'].eq(truth_labels).mean()

print(f"\n📊 TextBlob Accuracy: {textblob_acc*100:.2f}%")
print(f"📊 VADER Accuracy: {vader_acc*100:.2f}%")


In [None]:
# Visualization
# One count plot per label source. The three plots differ only in the column
# and the title, so they are drawn from a single loop with a fixed class order
# to keep the bars comparable across figures.
for label_column, plot_title in (
    ('ground_truth', "Ground Truth Sentiment Distribution"),
    ('textblob_sentiment', "TextBlob Sentiment Distribution"),
    ('vader_sentiment', "VADER Sentiment Distribution"),
):
    plt.figure(figsize=(7,5))
    sns.countplot(x=label_column, data=df, order=['Positive', 'Neutral', 'Negative'])
    plt.title(plot_title)
    plt.show()

# Confusion matrix of ground-truth labels (first argument) against VADER
# predictions (second argument), using one shared label order for both the
# matrix and its axis tick labels.
sentiment_labels = ['Positive','Neutral','Negative']
cm_vader = confusion_matrix(
    df['ground_truth'],
    df['vader_sentiment'],
    labels=sentiment_labels,
)
vader_cm_display = ConfusionMatrixDisplay(cm_vader, display_labels=sentiment_labels)
vader_cm_display.plot(cmap='Blues')
plt.title("VADER Confusion Matrix")
plt.show()
