# 03 - Supervised baseline
This notebook first loads the data, explores it, and preprocesses it; it then trains and evaluates supervised baseline models.

In [1]:
# Install if needed
#!pip install matplotlib

In [2]:
# Install if needed
#!pip install seaborn

In [None]:
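# Install if needed (the package is published as imbalanced-learn; it is imported as imblearn)
# !pip install imbalanced-learn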

In [23]:
# If needed, install pyarrow
# !pip install pyarrow

In [10]:
# Install if needed
#!pip install scikit-learn imbalanced-learn joblib

In [3]:
import kagglehub

# Fetch the latest published version of the dataset through kagglehub
# and report where the files were placed on disk.
DATASET_HANDLE = "jp797498e/twitter-entity-sentiment-analysis"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/amira_salah/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2


In [1]:
# Module imports for supervised notebook
import pandas as pd
import re
from typing import Optional

In [8]:
# Import functions from ../scripts
import sys
sys.path.append('../scripts')
from utils import load_data, preprocess_data, get_wordnet_pos, extract_features, clean_and_tokenize, lemmatize_tokens, vectorize_data

In [5]:
# Load the raw dataset with the project helper and look at it
# before any preprocessing is applied.
df = load_data()

raw_preview = df.head(n=10)
print('\nPreview before preprocessing (first 10 rows):')
display(raw_preview)


Preview before preprocessing (first 5 rows):


Unnamed: 0,ID,Entity,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
5,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
6,2402,Borderlands,Positive,So I spent a few hours making something for fu...
7,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
8,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
9,2402,Borderlands,Positive,So I spent a few hours making something for fu...


In [7]:
# Run the project preprocessing helper on the raw frame; the result is kept
# under a new name so the raw `df` stays available for comparison.
proc_df = preprocess_data(df)

# Inspect the frame after preprocessing.
processed_preview = proc_df.head(n=10)
print('\nPreview after preprocessing (first 10 rows):')
display(processed_preview)


Preview after preprocessing (first 10 rows):


Unnamed: 0,ID,Entity,Sentiment,Tweet,hashtags,mentions,tokens,lemmas,processed_text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,,,"[im, getting, on, borderlands, and, i, will, m...","[im, get, on, borderland, and, i, will, murder...",im get on borderland and i will murder you all
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,,,"[I, am, coming, to, the, borders, and, I, will...","[I, be, come, to, the, border, and, I, will, k...",I be come to the border and I will kill you all
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,,,"[im, getting, on, borderlands, and, i, will, k...","[im, get, on, borderland, and, i, will, kill, ...",im get on borderland and i will kill you all
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,,,"[im, coming, on, borderlands, and, i, will, mu...","[im, come, on, borderland, and, i, will, murde...",im come on borderland and i will murder you all
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,,,"[im, getting, on, borderlands, 2, and, i, will...","[im, get, on, borderland, 2, and, i, will, mur...",im get on borderland 2 and i will murder you m...
5,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,,,"[im, getting, into, borderlands, and, i, can, ...","[im, get, into, borderland, and, i, can, murde...",im get into borderland and i can murder you all
6,2402,Borderlands,Positive,So I spent a few hours making something for fu...,,Borderlands,"[So, I, spent, a, few, hours, making, somethin...","[So, I, spend, a, few, hour, make, something, ...",So I spend a few hour make something for fun I...
7,2402,Borderlands,Positive,So I spent a couple of hours doing something f...,,,"[So, I, spent, a, couple, of, hours, doing, so...","[So, I, spend, a, couple, of, hour, do, someth...",So I spend a couple of hour do something for f...
8,2402,Borderlands,Positive,So I spent a few hours doing something for fun...,,,"[So, I, spent, a, few, hours, doing, something...","[So, I, spend, a, few, hour, do, something, fo...",So I spend a few hour do something for fun If ...
9,2402,Borderlands,Positive,So I spent a few hours making something for fu...,,,"[So, I, spent, a, few, hours, making, somethin...","[So, I, spend, a, few, hour, make, something, ...",So I spend a few hour make something for fun I...


# Supervised ML

In [None]:
# Class distribution of the target column, plus the least frequent label.
counts = proc_df['Sentiment'].value_counts()
print(counts)

# idxmin gives the label with the fewest rows; look its count up by label.
minority_label = counts.idxmin()
minority_count = counts.loc[minority_label]
print(f'Minority class: {minority_label} ({minority_count} samples)')

# Recorded result of a previous run: Irrelevant (12875 samples)

Sentiment
Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64
Minority class: Irrelevant (12875 samples)


In [None]:
# 1. Prepare data and split
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Model input is the `processed_text` column; the target is `Sentiment`.
X = proc_df['processed_text'].values
y = proc_df['Sentiment'].values

# 80/20 hold-out split, stratified on the label so both parts keep the
# same class proportions; the seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# NOTE(review): in the data preview, rows that share an `ID` look like
# paraphrases of one tweet. If that holds for the whole dataset, a row-level
# split can put variants of the same tweet in both train and test and inflate
# the scores -- consider a group-aware split on `ID`. TODO confirm.

# Shared oversampler, applied to the training portion only by the model cells.
ros = RandomOverSampler(random_state=42)
vec = None  # placeholder; each model cell builds its own vectorizer

In [None]:
# 2. TF-IDF + Logistic Regression Baseline
#
# Two evaluations are reported:
#   * a held-out test score for a model trained on the oversampled training split;
#   * a 5-fold cross-validation score in which vectorizing, oversampling and
#     fitting are all redone inside each fold.
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

# --- Held-out evaluation ---------------------------------------------------
# The vectorizer is fitted on the training texts only and then applied to the
# test texts, so no test-set vocabulary or IDF statistics reach the model.
vec = TfidfVectorizer(sublinear_tf=True, max_df=0.95, min_df=3)
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)

# Oversample the training matrix only. The sparse matrix is passed as-is:
# the previous `except Exception` fallback retried on `.toarray()`, which
# hid the real error and densified the whole TF-IDF matrix.
X_train_os, y_train_os = ros.fit_resample(X_train_vec, y_train)

# After oversampling the classes should already be equal in size, so
# class_weight='balanced' is not expected to change this fit; it is kept
# so the estimator settings stay as before.
logreg = LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42)
logreg.fit(X_train_os, y_train_os)
y_pred_logreg = logreg.predict(X_test_vec)
print('Logistic Regression Test Accuracy:', accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

# --- Cross-validation --------------------------------------------------------
# Previously CV ran on `vec.transform(X)`, i.e. on features produced by a
# vectorizer that had already been fitted on rows that later serve as
# validation folds, and without the oversampling step used above.
# Wrapping the steps in an imblearn pipeline and passing the raw texts makes
# every fold fit its own vectorizer and resample only its own training part.
# cross_val_score clones the pipeline per fold, so the fitted objects above
# are not modified.
logreg_cv_pipeline = make_imb_pipeline(vec, ros, logreg)
cv_scores_logreg = cross_val_score(logreg_cv_pipeline, X, y, cv=5, scoring='accuracy')
print(f'Logistic Regression CV scores: {cv_scores_logreg}')
print(f'Logistic Regression Average CV Accuracy: {cv_scores_logreg.mean():.4f}')

In [None]:
# 3. Random Forest with n-gram (1,2)
#
# Same protocol as the other model cells: a held-out test score for a model
# trained on the oversampled training split, plus 5-fold cross-validation in
# which vectorizing, oversampling and fitting are redone inside each fold.
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.ensemble import RandomForestClassifier

# --- Held-out evaluation ---------------------------------------------------
# Unigram + bigram TF-IDF, fitted on the training texts only.
rf_vec = TfidfVectorizer(sublinear_tf=True, max_df=0.95, min_df=3, ngram_range=(1,2))
X_train_vec_rf = rf_vec.fit_transform(X_train)
X_test_vec_rf = rf_vec.transform(X_test)

# Oversample the training matrix only, keeping it sparse. The previous
# `except Exception` fallback retried on `.toarray()`, which hid the real
# error and densified the whole (bigram-sized) TF-IDF matrix.
X_train_os_rf, y_train_os_rf = ros.fit_resample(X_train_vec_rf, y_train)

rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train_os_rf, y_train_os_rf)
y_pred_rf = rf.predict(X_test_vec_rf)
print('Random Forest Test Accuracy:', accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# --- Cross-validation --------------------------------------------------------
# Previously CV ran on `rf_vec.transform(X)` (vectorizer already fitted on
# rows that later serve as validation folds) and without oversampling, so it
# scored a differently trained model than the one evaluated above.
# The imblearn pipeline refits the vectorizer and resamples inside each fold;
# cross_val_score clones it per fold, leaving the fitted objects untouched.
rf_cv_pipeline = make_imb_pipeline(rf_vec, ros, rf)
cv_scores_rf = cross_val_score(rf_cv_pipeline, X, y, cv=5, scoring='accuracy')
print(f'Random Forest CV scores: {cv_scores_rf}')
print(f'Random Forest Average CV Accuracy: {cv_scores_rf.mean():.4f}')

In [None]:
# 4. Linear SVM for Text
#
# Same protocol as the other model cells: a held-out test score for a model
# trained on the oversampled training split, plus 5-fold cross-validation in
# which vectorizing, oversampling and fitting are redone inside each fold.
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.svm import LinearSVC

# --- Held-out evaluation ---------------------------------------------------
# The vectorizer is fitted on the training texts only.
svm_vec = TfidfVectorizer(sublinear_tf=True, max_df=0.95, min_df=3)
X_train_vec_svm = svm_vec.fit_transform(X_train)
X_test_vec_svm = svm_vec.transform(X_test)

# Oversample the training matrix only, keeping it sparse. The previous
# `except Exception` fallback retried on `.toarray()`, which hid the real
# error and densified the whole TF-IDF matrix.
X_train_os_svm, y_train_os_svm = ros.fit_resample(X_train_vec_svm, y_train)

svm = LinearSVC(max_iter=10000, class_weight='balanced', random_state=42)
svm.fit(X_train_os_svm, y_train_os_svm)
y_pred_svm = svm.predict(X_test_vec_svm)
print('Linear SVM Test Accuracy:', accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# --- Cross-validation --------------------------------------------------------
# Previously CV ran on `svm_vec.transform(X)` (vectorizer already fitted on
# rows that later serve as validation folds) and without oversampling.
# The imblearn pipeline refits the vectorizer and resamples inside each fold;
# cross_val_score clones it per fold, leaving the fitted objects untouched.
svm_cv_pipeline = make_imb_pipeline(svm_vec, ros, svm)
cv_scores_svm = cross_val_score(svm_cv_pipeline, X, y, cv=5, scoring='accuracy')
print(f'Linear SVM CV scores: {cv_scores_svm}')
print(f'Linear SVM Average CV Accuracy: {cv_scores_svm.mean():.4f}')

In [22]:
# list models saved in ../models/
from pathlib import Path

# `models_dir` was not defined by any earlier cell in this notebook, so the
# listing failed with NameError on a fresh kernel. Define it here, using the
# same ../models location the export cell writes to.
# NOTE(review): no visible cell writes the .joblib files listed in the
# recorded output -- confirm where the models are saved.
models_dir = Path('../models')

print('models list:')
for p in sorted(models_dir.glob('*')):
    print('-', p.name)

models list:
- logreg_clf.joblib
- rf_clf.joblib
- rf_tfidf_ngram12_vectorizer.joblib
- svm_clf.joblib
- svm_tfidf_vectorizer.joblib
- tfidf_vectorizer.joblib


In [24]:
# Export the processed DataFrame as Parquet and CSV. These files back the
# Streamlit requirement "Export capabilities for social media reports".
from pathlib import Path

# Create the target directory first so the export also works on a fresh
# checkout where ../models does not exist yet.
export_dir = Path('../models')
export_dir.mkdir(parents=True, exist_ok=True)

proc_df.to_parquet(export_dir / 'processed_df.parquet', index=False)
proc_df.to_csv(export_dir / 'processed_df.csv', index=False)