# Twitter Sentiment Analysis

**Objective:** Classify tweets as Positive, Negative, or Neutral using NLP and Transformer models.  
**Dataset:** 103,250+ training tweets, 16k+ validation tweets.  
**Model:** DistilBERT fine-tuned for sequence classification.  

**Workflow Highlights:**
- Data preprocessing & label mapping (tweets labelled `Irrelevant` are dropped)
- Tokenization with HuggingFace tokenizer
- Model fine-tuning with Trainer API
- Evaluation & metrics visualization
- WordCloud analysis per sentiment

### STEP 1 : Import Libraries & Setup

In [None]:
# Basic libraries
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# HuggingFace Transformers & Datasets
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import evaluate


### STEP 2 : Upload & Preprocess Data

In [None]:
from google.colab import files

# Upload CSVs through the Colab file picker
uploaded = files.upload()

# Define column names
cols = ['id', 'entity', 'sentiment', 'text']

# Load datasets
# NOTE(review): with names= supplied, header=0 makes pandas treat the first
# line of each file as a header row and discard it. If the CSVs have no header
# row this silently drops one tweet per file -- confirm against the data and
# switch to header=None if that is the case.
train_df = pd.read_csv('twitter_training.csv', names=cols, header=0)
val_df   = pd.read_csv('twitter_validation.csv', names=cols, header=0)

# Map labels to integers & drop 'Irrelevant'
label_map = {'Negative':0, 'Positive':1, 'Neutral':2}
# .copy() gives each filtered frame its own data, so the column assignments
# below modify a real frame instead of a slice (no SettingWithCopyWarning).
train_df = train_df[train_df['sentiment'].isin(label_map.keys())].copy()
val_df   = val_df[val_df['sentiment'].isin(label_map.keys())].copy()
train_df['label'] = train_df['sentiment'].map(label_map)
val_df['label']   = val_df['sentiment'].map(label_map)

# Ensure text is string & handle NaN
# fillna has to run first: astype(str) would turn NaN into the literal
# string 'nan', after which there is nothing left for fillna to replace.
train_df['text'] = train_df['text'].fillna('').astype(str)
val_df['text']   = val_df['text'].fillna('').astype(str)

# Quick check
train_df.head()


### STEP 3 : Convert to HuggingFace Dataset & Tokenize

In [None]:
# Convert to Dataset
# preserve_index=False: a frame with a non-default index (e.g. after row
# filtering) would otherwise carry that index along as an extra
# '__index_level_0__' column in the Dataset.
train_ds = Dataset.from_pandas(train_df[['text','label']], preserve_index=False)
val_ds   = Dataset.from_pandas(val_df[['text','label']], preserve_index=False)

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenization function
def tokenize(batch):
    """Tokenize the ``text`` entries of ``batch`` with the module-level tokenizer.

    Sequences are truncated to at most 128 tokens, and padding is enabled so
    that all sequences of one call come back with the same length.
    """
    tokenizer_options = {"padding": True, "truncation": True, "max_length": 128}
    encoded = tokenizer(batch['text'], **tokenizer_options)
    return encoded

# Tokenize both splits in batched mode
tokenized_splits = [split.map(tokenize, batched=True) for split in (train_ds, val_ds)]
train_ds, val_ds = tokenized_splits

# Expose only the model inputs and the label, as PyTorch tensors
torch_columns = ['input_ids','attention_mask','label']
for split in tokenized_splits:
    split.set_format('torch', columns=torch_columns)


### STEP 4 : Load Model

In [None]:
num_labels = len(label_map)
# Register the sentiment names in the model config so that a saved checkpoint
# reports them instead of generic LABEL_0/LABEL_1/... placeholders.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    id2label={idx: name for name, idx in label_map.items()},
    label2id=dict(label_map),
)


### STEP 5 : Training Arguments & Metrics

In [None]:
# Metrics function
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics_simple(eval_pred):
    """Compute accuracy and weighted F1 / precision / recall.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # (or never present) without emitting an UndefinedMetricWarning.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, **weighted),
        "precision": precision_score(labels, preds, **weighted),
        "recall": recall_score(labels, preds, **weighted),
    }

# Training arguments, collected in one dict and expanded into the constructor
training_config = {
    "output_dir": "./results",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 2,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "logging_steps": 100,
    "report_to": "none",
}
training_args = TrainingArguments(**training_config)


### STEP 6 : Train the Model

In [None]:
# Bundle the model, both datasets, the tokenizer and the metric callback
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_simple,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()


### STEP 7 : Evaluate Model

In [None]:
# Evaluate with the Trainer and print every returned metric to 4 decimals
eval_results = trainer.evaluate()
print("\n Evaluation Results:")
for metric_name, metric_value in eval_results.items():
    formatted_value = format(metric_value, ".4f")
    print(f"{metric_name}: {formatted_value}")


### STEP 8 : Save Model & Tokenizer

In [None]:
# Write the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./sentiment_model")
print("\n Model and tokenizer saved to ./sentiment_model")


### STEP 9 : Sample Predictions

In [None]:
# Reverse lookup: integer class id -> sentiment name
id2label = {v:k for k,v in label_map.items()}

def predict_sentiment(texts, max_length=128):
    """Predict the sentiment name for each input text.

    Args:
        texts: A single string or an iterable of strings.
        max_length: Token limit used for truncation. Defaults to 128, the
            limit the training data was tokenized with in this notebook.

    Returns:
        list of sentiment names (values of ``id2label``), one per input text.
        An empty input yields an empty list.
    """
    # A bare string must be wrapped before list(), which would split it into characters
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return []

    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(model.device)

    # Run in eval mode (dropout off) and without building an autograd graph,
    # then restore whatever mode the model was in before the call.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    preds = outputs.logits.argmax(dim=1).cpu().numpy()
    return [id2label[int(p)] for p in preds]

# Hand-written example sentences to run through the classifier
sample_texts = [
    "I absolutely love the new update, great job!",
    "Worst experience ever, I am so disappointed.",
    "It is okay, neither good nor bad.",
    "The bug in the app keeps crashing, very frustrating.",
    "Thank you for the amazing support, really appreciated!"
]

# Classify all examples in one call and print each next to its prediction
predictions = predict_sentiment(sample_texts)
for tweet, sentiment in zip(sample_texts, predictions):
    print(tweet, sentiment, sep="  -->  ")


### STEP 10 : WordCloud Visualization

In [None]:
def plot_wordcloud(df, sentiment_label, max_words=50, top_n=15):
    """Show a word cloud and print the most frequent words for one sentiment.

    Args:
        df: DataFrame with a ``label`` column (integer class id) and a
            ``text`` column.
        sentiment_label: Class id whose rows are visualized.
        max_words: Maximum number of words drawn in the cloud.
        top_n: Number of most frequent whitespace-separated tokens to print.

    Returns:
        None. The figure is shown and the frequency table is printed.
    """
    # Fall back to the raw id if the label has no registered name
    sentiment_name = id2label.get(sentiment_label, str(sentiment_label))

    tweets = df.loc[df['label'] == sentiment_label, 'text'].astype(str)
    text = " ".join(tweets)

    # WordCloud.generate raises ValueError when there is nothing to draw,
    # so skip sentiments that have no text instead of crashing.
    if not text.strip():
        print(f"No text available for {sentiment_name}; skipping word cloud.")
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color="white",
        max_words=max_words,
    ).generate(text)

    plt.figure(figsize=(12,6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud for {sentiment_name} Tweets", fontsize=20)
    plt.show()

    # Raw token frequencies (case-sensitive, no stop-word filtering)
    words = text.split()
    freq_dist = pd.Series(words).value_counts().head(top_n)
    print(f"Top words for {sentiment_name}:")
    print(freq_dist)
    print("\n" + "-"*50 + "\n")

# Plot for each sentiment
# Iterate the class ids actually stored in label_map; range(len(label_map))
# would silently assume the ids are contiguous and start at 0.
for label in sorted(label_map.values()):
    plot_wordcloud(train_df, label)
