# 03 - Supervised baseline
First load the data, explore it, decide on the preprocessing steps, and apply them.

In [62]:
# Install if needed
#!pip install matplotlib

In [63]:
# Install if needed
#!pip install seaborn

In [64]:
import kagglehub

# Download the latest version of the dataset; `path` is the local location
# kagglehub reports for the downloaded files.
dataset_handle = "jp797498e/twitter-entity-sentiment-analysis"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /Users/amira_salah/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2


In [65]:
# Module imports for supervised notebook
import pandas as pd
import re
from typing import Optional

In [None]:
# Import functions from ../scripts
import sys
sys.path.append('../scripts')
from supervised_utils import load_data, preprocess_text, preprocess_df

In [68]:
# preprocess_text is now imported from scripts/supervised_utils.py
# text cleaning for tweets.
# - Converts to lowercase
# - Removes URLs, mentions and extra punctuation
# - Removes extra whitespace
# Example usage:
# preprocess_text("my Name us # amira                          @")

In [69]:
# Quick look at the raw class balance and a few example rows per class.
df = load_data()
counts = df['Sentiment'].value_counts()
print(counts)

# Least frequent sentiment label and the number of rows carrying it.
minority_label, minority_count = counts.idxmin(), counts.min()
print(f'Minority class: {minority_label} ({minority_count} samples)')

# Show the first 20 rows of each of the two classes inspected here.
for sentiment_name, heading in (
    ('Irrelevant', "Sample Irrelevant tweets:"),
    ('Neutral', "Sample Neutral tweets:"),
):
    print(heading)
    display(df[df['Sentiment'] == sentiment_name].head(20))

Sentiment
Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64
Minority class: Irrelevant (12875 samples)
Sample Irrelevant tweets:


Unnamed: 0,ID,Entity,Sentiment,Tweet
101,2418,Borderlands,Irrelevant,Appreciate the (sonic) concepts / praxis Valen...
102,2418,Borderlands,Irrelevant,Appreciate the (sound) concepts / practices th...
103,2418,Borderlands,Irrelevant,Evaluate the (sound) concepts / concepts of Va...
104,2418,Borderlands,Irrelevant,Appreciate the (sonic) concepts / praxis Valen...
105,2418,Borderlands,Irrelevant,Appreciate by the ( sonic ) electronic concept...
106,2418,Borderlands,Irrelevant,Appreciate the (sonic) conversations / actions...
125,2422,Borderlands,Irrelevant,Loving these new @GhostLifestyle cans!! Anyone...
126,2422,Borderlands,Irrelevant,I love these new @ GhostLifestyle cans!! Every...
127,2422,Borderlands,Irrelevant,Love these new @ GhostLive cans!! Does anyone ...
128,2422,Borderlands,Irrelevant,Loving these new @GhostLifestyle cans!! Anyone...


Sample Neutral tweets:


Unnamed: 0,ID,Entity,Sentiment,Tweet
12,2403,Borderlands,Neutral,"Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM..."
13,2403,Borderlands,Neutral,"Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM..."
14,2403,Borderlands,Neutral,"Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM..."
15,2403,Borderlands,Neutral,"Rock-Hard La Vita, RARE BUT POWERFUL, HANDSOME..."
16,2403,Borderlands,Neutral,"Live Rock - Hard music La la Varlope, RARE & t..."
17,2403,Borderlands,Neutral,"I-Hard like me, RARE LONDON DE, HANDSOME 2011,..."
42,2408,Borderlands,Neutral,Check out this epic streamer!.
43,2408,Borderlands,Neutral,Check out this epic streamer!.
44,2408,Borderlands,Neutral,Watch this epic striptease!.
45,2408,Borderlands,Neutral,Check out our epic streamer!.


In [70]:
# preprocess_df is now imported from scripts/supervised_utils.py
# Apply preprocessing to DataFrame and add new columns.
# Adds:
#   - processed_text
#   - label (integer mapping for sentiment)
# Example usage:
# preprocess_df(load_data())

In [74]:
# Reload the raw data and preview it before any preprocessing is applied.
df = load_data()
# The message states the same number of rows that head() displays below
# (it previously said 5 while 10 rows were shown).
print('\nPreview before preprocessing (first 10 rows):')
display(df.head(10))


Preview before preprocessing (first 5 rows):


Unnamed: 0,ID,Entity,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
5,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
6,2402,Borderlands,Positive,So I spent a few hours making something for fu...
7,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
8,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
9,2402,Borderlands,Positive,So I spent a few hours making something for fu...


In [75]:
# Install if needed
#!pip install scikit-learn imbalanced-learn joblib

In [77]:
# Apply the shared preprocessing; the resulting frame carries the
# processed_text and label columns displayed below.
proc_df = preprocess_df(df)

# Preview the raw tweet next to its cleaned text and labels. The message states
# the same number of rows that head() displays (it previously said 5).
print('\nPreview after preprocessing (first 10 rows):')
display(proc_df.head(10)[['Tweet', 'processed_text', 'Sentiment', 'label']])


Preview after preprocessing (first 5 rows):


Unnamed: 0,Tweet,processed_text,Sentiment,label
0,im getting on borderlands and i will murder yo...,im getting on borderlands and i will murder yo...,Positive,2
1,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...,Positive,2
2,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all,Positive,2
3,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...,Positive,2
4,im getting on borderlands 2 and i will murder ...,im getting on borderlands 2 and i will murder ...,Positive,2
5,im getting into borderlands and i can murder y...,im getting into borderlands and i can murder y...,Positive,2
6,So I spent a few hours making something for fu...,so i spent a few hours making something for fu...,Positive,2
7,So I spent a couple of hours doing something f...,so i spent a couple of hours doing something f...,Positive,2
8,So I spent a few hours doing something for fun...,so i spent a few hours doing something for fun...,Positive,2
9,So I spent a few hours making something for fu...,so i spent a few hours making something for fu...,Positive,2


# Supervised ML

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
import joblib

# Directory where the fitted vectorizers and classifiers are written.
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

# TF-IDF settings shared by both base vectorizers; the second one additionally
# includes bigrams via ngram_range=(1, 2).
_base_tfidf_kwargs = dict(sublinear_tf=True, max_df=0.95, min_df=3)
base_tfidf = TfidfVectorizer(**_base_tfidf_kwargs)
base_tfidf_ngram12 = TfidfVectorizer(**_base_tfidf_kwargs, ngram_range=(1, 2))

In [79]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Keyword arguments for the TF-IDF vectorizer paired with Logistic Regression.
logreg_vectorizer_config = {'sublinear_tf': True, 'max_df': 0.95, 'min_df': 3}
# class_weight='balanced' is used instead of resampling to handle class imbalance.
logreg_clf = LogisticRegression(
    class_weight='balanced',
    max_iter=2000,
    random_state=42,
)

In [80]:
# Re-check the class balance, this time on the preprocessed frame.
# NOTE(review): compared with the raw counts printed earlier, 'Irrelevant' no
# longer appears and 'Neutral' has grown by the same amount
# (18108 + 12875 = 30983), so preprocess_df appears to fold 'Irrelevant' into
# 'Neutral' -- confirm in scripts/supervised_utils.py.
counts = proc_df['Sentiment'].value_counts()
print(counts)

minority_label, minority_count = counts.idxmin(), counts.min()
print(f'Minority class: {minority_label} ({minority_count} samples)')

# Recorded run: the minority class is Positive (20655 samples).

Sentiment
Neutral     30983
Negative    22358
Positive    20655
Name: count, dtype: int64
Minority class: Positive (20655 samples)


In [81]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler

# Keyword arguments for the RandomForest TF-IDF vectorizer: the same settings as
# the other models plus bigrams.
rf_vectorizer_config = {
    'sublinear_tf': True,
    'max_df': 0.95,
    'min_df': 3,
    'ngram_range': (1, 2),
}
rf_clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
# No class weights are passed to the forest; imbalance is handled by randomly
# duplicating samples of the under-represented classes instead.
ros = RandomOverSampler(random_state=42)

In [82]:
# Linear SVM
from sklearn.svm import LinearSVC

# Keyword arguments for the TF-IDF vectorizer paired with the linear SVM.
svm_vectorizer_config = {'sublinear_tf': True, 'max_df': 0.95, 'min_df': 3}
# Imbalance is handled through class_weight='balanced' rather than resampling.
svm_clf = LinearSVC(
    class_weight='balanced',
    max_iter=10000,
    random_state=42,
)

In [83]:
# Prepare data, scoring and cross-validation

# Rebuild the preprocessed frame only when no earlier cell has created it.
try:
    proc_df
except NameError:
    df = load_data()
    proc_df = preprocess_df(df)

# Features are the cleaned tweet texts; targets are the string Sentiment labels
# (not the integer `label` column).
X, y = proc_df['processed_text'].values, proc_df['Sentiment'].values

# Define the metrics we care about: plain accuracy plus macro-averaged
# precision / recall / F1 (zero_division=0 scores undefined cases as 0).
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

macro_metrics = {
    'precision_macro': precision_score,
    'recall_macro': recall_score,
    'f1_macro': f1_score,
}
scoring = {'accuracy': make_scorer(accuracy_score)}
scoring.update(
    (scorer_name, make_scorer(metric_fn, average='macro', zero_division=0))
    for scorer_name, metric_fn in macro_metrics.items()
)

# Shuffled, stratified 5-fold splitter with a fixed seed so that every model
# below is evaluated on the same folds.
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [84]:
# 1) Cross-validation for Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# One list of per-fold values for every metric.
logreg_scores = {name: [] for name in ('accuracy', 'precision', 'recall', 'f1')}

for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), 1):
    X_train_text, y_train = X[train_idx], y[train_idx]
    X_test_text, y_test = X[test_idx], y[test_idx]

    # Fit TF-IDF on the training fold only, then reuse it on the held-out fold.
    vec = TfidfVectorizer(**logreg_vectorizer_config)
    X_train_vec = vec.fit_transform(X_train_text)
    X_test_vec = vec.transform(X_test_text)

    clf = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
    clf.fit(X_train_vec, y_train)
    preds = clf.predict(X_test_vec)

    # Record this fold's metrics (precision / recall / F1 are macro-averaged).
    fold_metrics = {
        'accuracy': accuracy_score(y_test, preds),
        'precision': precision_score(y_test, preds, average='macro', zero_division=0),
        'recall': recall_score(y_test, preds, average='macro', zero_division=0),
        'f1': f1_score(y_test, preds, average='macro', zero_division=0),
    }
    for metric_name, value in fold_metrics.items():
        logreg_scores[metric_name].append(value)

# Report the mean of each metric across folds.
print('LogReg CV:')
for metric_name, fold_values in logreg_scores.items():
    print('-', metric_name, sum(fold_values) / len(fold_values))

# Start the shared results dict that the later CV cells add to.
cv_results = {'Logistic Regression': logreg_scores}

LogReg CV:
- accuracy 0.8030028362033228
- precision 0.7996794553020705
- recall 0.8073381213940338
- f1 0.8025866563389773


In [85]:
# 2) Cross-validation for RandomForest
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# One list of per-fold values for every metric.
rf_scores = {'accuracy':[], 'precision':[], 'recall':[], 'f1':[]}

for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), 1):

    X_train_text = X[train_idx]
    y_train = y[train_idx]

    X_test_text = X[test_idx]
    y_test = y[test_idx]

    # Fit TF-IDF on the training fold only, then reuse it on the held-out fold.
    vec = TfidfVectorizer(**rf_vectorizer_config)
    X_train_vec = vec.fit_transform(X_train_text)
    X_test_vec = vec.transform(X_test_text)

    # Oversample the training fold only; the held-out fold is left untouched.
    sampler = RandomOverSampler(random_state=42)
    try:
        X_train_os, y_train_os = sampler.fit_resample(X_train_vec, y_train)
    except (TypeError, ValueError):
        # Fallback for a sampler that rejects sparse input. Only input-validation
        # errors trigger it: retrying with a dense copy after any other failure
        # (e.g. MemoryError) would hide the real problem and allocate a huge
        # dense unigram+bigram matrix.
        X_train_os, y_train_os = sampler.fit_resample(X_train_vec.toarray(), y_train)

    # NOTE(review): max_depth=5 is used here, but the RandomForest saved in the
    # final-fit cell has no depth limit, so these CV scores do not describe the
    # saved model. Align the two before comparing RF against the other models.
    # n_jobs=-1 only parallelises tree building (as in the other RF
    # constructions in this notebook); random_state keeps results reproducible.
    clf = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=5, n_jobs=-1)
    clf.fit(X_train_os, y_train_os)

    preds = clf.predict(X_test_vec)

    # Record this fold's metrics (precision / recall / F1 are macro-averaged).
    rf_scores['accuracy'].append(accuracy_score(y_test, preds))
    rf_scores['precision'].append(precision_score(y_test, preds, average='macro', zero_division=0))
    rf_scores['recall'].append(recall_score(y_test, preds, average='macro', zero_division=0))
    rf_scores['f1'].append(f1_score(y_test, preds, average='macro', zero_division=0))

# Report the mean of each metric across folds.
print('RandomForest CV:')
for k,v in rf_scores.items():
    print('-', k, sum(v)/len(v))

cv_results['rf'] = rf_scores

RandomForest CV:
- accuracy 0.5680307334498497
- precision 0.5936903649763224
- recall 0.5949875295261942
- f1 0.567473493638756


In [None]:
# 3) Cross-validation for Linear SVM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# One list of per-fold values for every metric.
svm_scores = {name: [] for name in ('accuracy', 'precision', 'recall', 'f1')}

for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), 1):
    X_train_text, y_train = X[train_idx], y[train_idx]
    X_test_text, y_test = X[test_idx], y[test_idx]

    # Fit TF-IDF on the training fold only, then reuse it on the held-out fold.
    vec = TfidfVectorizer(**svm_vectorizer_config)
    X_train_vec = vec.fit_transform(X_train_text)
    X_test_vec = vec.transform(X_test_text)

    clf = LinearSVC(max_iter=10000, class_weight='balanced', random_state=42)
    clf.fit(X_train_vec, y_train)
    preds = clf.predict(X_test_vec)

    # Record this fold's metrics (precision / recall / F1 are macro-averaged).
    fold_metrics = {
        'accuracy': accuracy_score(y_test, preds),
        'precision': precision_score(y_test, preds, average='macro', zero_division=0),
        'recall': recall_score(y_test, preds, average='macro', zero_division=0),
        'f1': f1_score(y_test, preds, average='macro', zero_division=0),
    }
    for metric_name, value in fold_metrics.items():
        svm_scores[metric_name].append(value)

# Report the mean of each metric across folds.
print('Linear SVM CV:')
for metric_name, fold_values in svm_scores.items():
    print('-', metric_name, sum(fold_values) / len(fold_values))

cv_results['svm'] = svm_scores
# Linear SVM has the best cross-validated scores of the three models so far.

Linear SVM CV:
- accuracy 0.859857262999874
- precision 0.8572663181928558
- recall 0.8601736559962607
- f1 0.8586094162484642


In [87]:
# Fit final models on the FULL dataset and save each vectorizer/classifier pair.
# Every classifier is stored next to the vectorizer it was trained with; both
# files are needed to score new text.
from sklearn.feature_extraction.text import TfidfVectorizer

# Logistic Regression final fit
logreg_vec = TfidfVectorizer(**logreg_vectorizer_config)

X_full_vec = logreg_vec.fit_transform(X)

logreg_clf_full = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
logreg_clf_full.fit(X_full_vec, y)

joblib.dump(logreg_vec, models_dir / 'tfidf_vectorizer.joblib')
joblib.dump(logreg_clf_full, models_dir / 'logreg_clf.joblib')

# RandomForest final fit
rf_vec = TfidfVectorizer(**rf_vectorizer_config)
X_full_vec_rf = rf_vec.fit_transform(X)
# Oversample the full training set for RandomForest.
full_sampler = RandomOverSampler(random_state=42)
try:
    X_os, y_os = full_sampler.fit_resample(X_full_vec_rf, y)
except (TypeError, ValueError):
    # Fallback for a sampler that rejects sparse input. Only input-validation
    # errors trigger it: retrying with a dense copy after any other failure
    # (e.g. MemoryError) would hide the real problem and allocate a huge dense
    # unigram+bigram matrix.
    X_os, y_os = full_sampler.fit_resample(X_full_vec_rf.toarray(), y)

# NOTE(review): this forest has no depth limit, while the RandomForest
# cross-validation cell evaluates max_depth=5, so the reported CV scores do not
# describe the model saved here. Align the two hyperparameter sets.
rf_clf_full = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_clf_full.fit(X_os, y_os)

joblib.dump(rf_vec, models_dir / 'rf_tfidf_ngram12_vectorizer.joblib')
joblib.dump(rf_clf_full, models_dir / 'rf_clf.joblib')

# LinearSVC final fit
svm_vec = TfidfVectorizer(**svm_vectorizer_config)

X_full_vec_svm = svm_vec.fit_transform(X)

svm_clf_full = LinearSVC(max_iter=10000, class_weight='balanced', random_state=42)
svm_clf_full.fit(X_full_vec_svm, y)

joblib.dump(svm_vec, models_dir / 'svm_tfidf_vectorizer.joblib')
joblib.dump(svm_clf_full, models_dir / 'svm_clf.joblib')

print('models saved in:', models_dir)

models saved in: ../models


In [88]:
# list models saved in ../models/
print('models list:')
for p in sorted(models_dir.glob('*')):
    print('-', p.name)

models list:
- logreg_clf.joblib
- preprocessed_df.csv
- preprocessed_df.parquet
- rf_clf.joblib
- rf_tfidf_ngram12_vectorizer.joblib
- svm_clf.joblib
- svm_tfidf_vectorizer.joblib
- tfidf_vectorizer.joblib


In [89]:
# If needed, install pyarrow
# !pip install pyarrow

In [90]:
# Save the processed DataFrame so the Streamlit app can offer it as an export
# (requirement: "Export capabilities for social media reports").
# Paths are built from models_dir instead of repeating the '../models' literal,
# so the export stays next to the saved models if that directory changes.
# Writing parquet needs a parquet engine such as pyarrow to be installed.
proc_df.to_parquet(models_dir / 'processed_df.parquet', index=False)
proc_df.to_csv(models_dir / 'processed_df.csv', index=False)