# Sentiment Analysis on Social Media Posts (Tweets) — Complete Solution

In [None]:
# Required Libraries
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix, roc_curve, auc)
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this script relies on: the stop-word corpus and
# the WordNet data used by the lemmatizer (only needs to succeed once)
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# 1. Sample Dataset Creation
# Each entry pairs an example tweet with its sentiment label
# (Positive, Negative, Neutral) so the text and its label stay side by side.
labelled_tweets = [
    ("Loving the new features in the latest update! #awesome 😊", "Positive"),
    ("Really tired of the delays and bad service. #frustrated 😠", "Negative"),
    ("Just another day, nothing special happening.", "Neutral"),
    ("The customer support was fantastic! #helpful", "Positive"),
    ("I hate when my phone battery dies so fast!!!", "Negative"),
    ("Meh, the event was okay, nothing great to share.", "Neutral"),
    ("Had a wonderful day at the park, feeling great!", "Positive"),
    ("Worst experience ever. Not coming back again.", "Negative"),
    ("It’s fine I guess, could be better.", "Neutral"),
    ("Amazing performance by the team! Proud fan here.", "Positive"),
]

# Unzip the pairs back into the column-oriented mapping the DataFrame is built from
tweet_texts, tweet_labels = zip(*labelled_tweets)
data = {
    "tweet": list(tweet_texts),
    "sentiment": list(tweet_labels),
}
df = pd.DataFrame(data)

# 2. Preprocessing tailored for tweets
class TweetPreprocessor:
    """Turn raw tweet text into a space-joined string of lemmatized tokens.

    Cleaning is regex based: URLs, @mentions, '#' symbols, punctuation and
    digits are stripped. The cleaned text is then tokenized with NLTK's
    TweetTokenizer, filtered against the English stop-word list, and each
    surviving token is lemmatized with WordNetLemmatizer.
    """

    # Substitutions applied by clean_text, in this exact order. Compiled once
    # at class creation so they are not rebuilt/looked up for every tweet.
    _CLEANUP_PATTERNS = (
        re.compile(r'http\S+'),   # urls
        re.compile(r'@\w+'),      # mentions
        re.compile(r'#'),         # hashtag symbol only, the word is kept
        re.compile(r'[^\w\s]'),   # punctuation (anything that is not a word char or whitespace)
        re.compile(r'\d+'),       # digits
    )

    def __init__(self):
        """Create the tokenizer, stop-word set and lemmatizer used by preprocess."""
        self.tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
        # A set gives constant-time membership tests in the token filter
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Strip URLs, mentions, '#', punctuation and digits from *text*.

        Args:
            text: Raw tweet. ``None`` and NaN (what pandas yields for missing
                cells) are treated as an empty tweet; any other non-string
                value is converted with ``str()`` before cleaning.

        Returns:
            The cleaned text with leading/trailing whitespace removed.
            Whitespace left behind inside the text is not collapsed.
        """
        # NaN is the only float that is not equal to itself
        if text is None or (isinstance(text, float) and text != text):
            return ""
        if not isinstance(text, str):
            text = str(text)
        for pattern in self._CLEANUP_PATTERNS:
            text = pattern.sub('', text)
        return text.strip()

    def preprocess(self, text):
        """Clean, tokenize, filter and lemmatize *text*.

        Args:
            text: Raw tweet (see clean_text for how missing values are handled).

        Returns:
            The lemmatized tokens joined by single spaces; an empty string
            when no token survives the filtering.
        """
        cleaned = self.clean_text(text)
        stop_words = self.stop_words
        lemmatize = self.lemmatizer.lemmatize
        # Drop stop words and single-character tokens, then lemmatize the rest.
        # NOTE(review): the English stop-word list presumably contains
        # negations such as "not" — confirm that dropping them is acceptable
        # for sentiment classification.
        return " ".join(
            lemmatize(token)
            for token in self.tokenizer.tokenize(cleaned)
            if len(token) > 1 and token not in stop_words
        )

preprocessor = TweetPreprocessor()
df['processed_tweet'] = df['tweet'].apply(preprocessor.preprocess)

# 3. Encode labels
# LabelEncoder maps each sentiment string to an integer code; the fitted
# encoder is kept so predictions can be mapped back to names later.
label_encoder = LabelEncoder()
df['sentiment_label'] = label_encoder.fit_transform(df['sentiment'])

# 4. Train-Test Split with stratification
# A stratified split needs at least one test sample per class. On a tiny
# dataset a 20% hold-out can fall below that (10 rows -> 2 test rows for
# 3 classes), which makes train_test_split raise a ValueError, so the
# hold-out is widened to the number of classes when necessary.
n_sentiment_classes = len(label_encoder.classes_)
n_test_samples = max(int(np.ceil(0.2 * len(df))), n_sentiment_classes)
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_tweet'], df['sentiment_label'],
    test_size=n_test_samples, random_state=42, stratify=df['sentiment_label']
)

# 5. Model training and evaluation framework
from sklearn.base import clone

# Candidate classifiers, keyed by the name used in the printed report and
# in evaluation_results
models = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM_Linear': SVC(kernel='linear', probability=True),
    'RandomForest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

# Template vectorizer (unigrams + bigrams, at most 1000 features); every
# pipeline gets its own unfitted copy of it
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000)

# Encoded label values for every known class. Passing them explicitly keeps
# the confusion matrix and the report aligned with label_encoder.classes_
# even when a class is missing from the test labels or the predictions.
class_indices = np.arange(len(label_encoder.classes_))

# name -> fitted pipeline plus its metrics on the test split
evaluation_results = {}

print("Model training and evaluation:")

for name, model in models.items():
    # Build pipeline. The vectorizer is cloned so that the pipelines stored
    # in evaluation_results do not all share (and re-fit) one vectorizer.
    pipeline = Pipeline([
        ('vect', clone(vectorizer)),
        ('clf', model)
    ])
    # Train
    pipeline.fit(X_train, y_train)
    # Predict
    y_pred = pipeline.predict(X_test)
    # Metrics (macro average: every class counts equally)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    report = classification_report(y_test, y_pred, labels=class_indices,
                                   target_names=label_encoder.classes_, zero_division=0)
    cm = confusion_matrix(y_test, y_pred, labels=class_indices)

    evaluation_results[name] = {
        'pipeline': pipeline,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'classification_report': report,
        'confusion_matrix': cm
    }

    print(f"\n{name} --- Accuracy: {acc:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}, F1-Score: {f1:.3f}")
    print("Confusion Matrix:")
    print(cm)

# 6. Visualization of Confusion Matrix for best model (example with LogisticRegression)
best_model_name = 'LogisticRegression'
best_result = evaluation_results[best_model_name]

# Rows are the true classes, columns the predicted ones; both axes are
# labelled with the original sentiment names
class_names = label_encoder.classes_
plt.figure(figsize=(6,5))
heatmap_ax = sns.heatmap(
    best_result['confusion_matrix'],
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title(f'{best_model_name} Confusion Matrix')
plt.show()

# 7. ROC Curves and AUCs (One-vs-Rest)
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

roc_pipeline = evaluation_results[best_model_name]['pipeline']
# predict_proba returns one column per entry of the fitted classes_, in that
# order, so the one-vs-rest indicators are built against the same array
# instead of a hard-coded [0, 1, 2].
roc_classes = np.asarray(roc_pipeline.classes_)
# Column i is 1 where the true label equals roc_classes[i], else 0
y_test_bin = (np.asarray(y_test)[:, None] == roc_classes[None, :]).astype(int)
n_classes = y_test_bin.shape[1]
# Scores do not depend on the class being plotted: compute them once
y_scores = roc_pipeline.predict_proba(X_test)

plt.figure(figsize=(8,6))

for i, encoded_class in enumerate(roc_classes):
    # Map the encoded class back to its sentiment name for the legend
    class_label = label_encoder.inverse_transform([encoded_class])[0]
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_scores[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{class_label} (AUC = {roc_auc:.2f})')

# Diagonal reference line (TPR == FPR)
plt.plot([0,1], [0,1], 'k--')
plt.title(f'ROC Curves - {best_model_name}')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

# 8. Prediction Confidence on New Tweets
def predict_sentiment(text, model_pipeline, label_enc):
    """Predict the sentiment of a single raw tweet.

    Args:
        text: Raw tweet text; it is run through the module-level
            ``preprocessor`` before being scored.
        model_pipeline: Fitted estimator/pipeline exposing ``predict_proba``
            and ``classes_``.
        label_enc: Fitted label encoder used to turn the encoded class back
            into its sentiment name.

    Returns:
        A ``(label, confidence)`` tuple: the predicted sentiment name and the
        probability the model assigned to it.
    """
    processed_text = preprocessor.preprocess(text)
    proba = model_pipeline.predict_proba([processed_text])[0]
    pred_idx = int(np.argmax(proba))
    # pred_idx is a column position in predict_proba, whose columns follow
    # the fitted classes_. Translate it to the encoded label before decoding,
    # so the result stays right when classes_ is not exactly 0..n-1.
    encoded_label = model_pipeline.classes_[pred_idx]
    pred_label = label_enc.inverse_transform([encoded_label])[0]
    confidence = proba[pred_idx]
    return pred_label, confidence

# Unseen example tweets to score with the selected model
new_tweets = [
    "Can't wait for the concert tonight! So excited! #fun",
    "So disappointed by the service call, waiting forever...",
    "Nothing really special today, just chillin'.",
]

# Look the fitted pipeline up once instead of on every iteration
selected_pipeline = evaluation_results[best_model_name]['pipeline']
for tweet in new_tweets:
    sentiment, conf = predict_sentiment(tweet, selected_pipeline, label_encoder)
    print(f"Tweet: {tweet}\nPredicted Sentiment: {sentiment} (Confidence: {conf:.2f})\n")

# 9. Optional: Hyperparameter Tuning Example for Logistic Regression
# Keys use the "<step name>__<parameter>" form to address the 'clf' step
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__solver': ['lbfgs'],
    'clf__max_iter': [1000]
}

# For a classifier an integer cv means stratified K-fold, which refuses to
# run when the fold count exceeds the size of every class. The sample data
# has at most 4 tweets per class, so a fixed cv=5 raises a ValueError.
# Cap the folds at the smallest class size (but keep at least 2 folds).
min_class_count = int(df['sentiment_label'].value_counts().min())
n_cv_splits = max(2, min(5, min_class_count))

grid_search = GridSearchCV(
    Pipeline([
        ('vect', TfidfVectorizer(max_features=1000, ngram_range=(1,2))),
        ('clf', LogisticRegression())
    ]),
    param_grid, cv=n_cv_splits, scoring='f1_macro', n_jobs=-1
)

# Cross-validation does its own splitting, so the full dataset is used here
grid_search.fit(df['processed_tweet'], df['sentiment_label'])
print("Best hyperparameters:", grid_search.best_params_)
print(f"Best cross-validated F1 macro: {grid_search.best_score_:.3f}")



## Explanation
- We created a small sample dataset of tweets labeled with sentiment.

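- The preprocessing is customized for social media text: regular expressions strip URLs, mentions, the `#` symbol (the hashtag word itself is kept), punctuation, and digits; the cleaned text is then tokenized with TweetTokenizer, stop words and single-character tokens are dropped, and the remaining tokens are lemmatized.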

- We encode sentiment labels and split the data into train/test sets preserving label proportions.

- Multiple classification models (Naive Bayes, Logistic Regression, SVM, Random Forest, KNN) are trained using TF-IDF vectors with unigrams and bigrams.

- Each model is evaluated with accuracy, precision, recall, F1-score, and confusion matrix.

- We visualize the confusion matrix and the one-vs-rest ROC curves (with their AUC values) for Logistic Regression.

- A function for predicting sentiment on new tweets with confidence scores is provided.

- Lastly, hyperparameter tuning via GridSearchCV is demonstrated for Logistic Regression.