In [2]:
!pip install emoji transformers torch scikit-learn imbalanced-learn xgboost optuna nltk textblob

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: emoji, colorlog, optuna
Successfully installed colorlog-6.10.1 emoji-2.15.0 optuna-4.6.0


In [3]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re
import emoji
import torch
from tqdm import tqdm

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_auc_score, precision_recall_curve,make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import pipeline

import optuna
from textblob import TextBlob

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('/content/train.En.csv', index_col=0)


In [5]:
print(f"\nClass distribution:\n{df['sarcastic'].value_counts(normalize=True)}")
df.head()


Class distribution:
sarcastic
0    0.75
1    0.25
Name: proportion, dtype: float64


Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df = df.dropna(subset=['tweet'])
df = df.reset_index(drop=True)
print(f"After dropping NaN: {df.shape}")

After dropping NaN: (3467, 9)


In [7]:
def advanced_clean_text(text):
    if pd.isna(text):
        return ""
    text=text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '[URL]', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '[USER]', text)
    text = re.sub(r'#(\w+)', r'\1', text)

    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'ve", " have", text)
    text = re.sub(r"'ll", " will", text)
    text = ' '.join(text.split())

    return text

df['tweet_clean'] = df['tweet'].apply(advanced_clean_text)
df[['tweet', 'tweet_clean']].head()

Unnamed: 0,tweet,tweet_clean
0,The only thing I got from college is a caffein...,the only thing i got from college is a caffein...
1,I love it when professors draw a big question ...,i love it when professors draw a big question ...
2,Remember the hundred emails from companies whe...,remember the hundred emails from companies whe...
3,Today my pop-pop told me I was not “forced” to...,today my pop-pop told me i was not “forced” to...
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,"[USER] [USER] [USER] i did too, and i also rep..."


In [8]:
def extract_comprehensive_features(text):
    features = {}

    features['emoji_count'] = len([c for c in text if c in emoji.EMOJI_DATA])
    features['exclamation_count'] = text.count('!')
    features['question_count'] = text.count('?')
    features['ellipsis_count'] = text.count('...')

    words = text.split()
    features['uppercase_words'] = sum(1 for w in words if w.isupper() and len(w) > 1)
    features['capitalized_ratio'] = sum(1 for c in text if c.isupper()) / max(len(text), 1)

    features['repeated_punct'] = len(re.findall(r'([!?.]){2,}', text))
    features['mixed_punct'] = len(re.findall(r'[!?]+', text))

    features['avg_word_length'] = np.mean([len(w) for w in words]) if words else 0
    features['long_words'] = sum(1 for w in words if len(w) > 10)
    features['short_words'] = sum(1 for w in words if len(w) <= 3)

    features['has_but'] = int('but' in text.lower())
    features['has_however'] = int('however' in text.lower())
    features['has_though'] = int('though' in text.lower())
    features['has_quotes'] = text.count('"') + text.count("'")

    features['has_very'] = text.lower().count('very')
    features['has_so'] = text.lower().count(' so ')
    features['has_really'] = text.lower().count('really')
    features['has_totally'] = text.lower().count('totally')
    features['has_literally'] = text.lower().count('literally')

    features['elongation'] = len(re.findall(r'(\w)\1{2,}', text))

    features['char_count'] = len(text)
    features['word_count'] = len(words)

    features['has_url'] = int('[URL]' in text)
    features['has_user'] = int('[USER]' in text)

    return features


# Apply the feature extraction to the clean tweets
features_df = df['tweet_clean'].apply(extract_comprehensive_features).apply(pd.Series)
df = pd.concat([df, features_df], axis=1)

df.head()

Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question,tweet_clean,...,has_very,has_so,has_really,has_totally,has_literally,elongation,char_count,word_count,has_url,has_user
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0,the only thing i got from college is a caffein...,...,0.0,0.0,0.0,0.0,0.0,0.0,57.0,11.0,0.0,0.0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0,i love it when professors draw a big question ...,...,0.0,0.0,0.0,0.0,0.0,0.0,133.0,26.0,0.0,0.0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0,remember the hundred emails from companies whe...,...,0.0,0.0,0.0,0.0,0.0,0.0,229.0,36.0,0.0,1.0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0,today my pop-pop told me i was not “forced” to...,...,0.0,0.0,0.0,0.0,0.0,1.0,80.0,17.0,0.0,0.0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0,"[USER] [USER] [USER] i did too, and i also rep...",...,0.0,0.0,0.0,0.0,0.0,0.0,169.0,29.0,0.0,1.0


In [9]:
X = df['tweet_clean']
y = df['sarcastic']
X_features = features_df.values

X_train, X_test, y_train, y_test, X_train_feat, X_test_feat = train_test_split(
    X, y, X_features, test_size=0.2, random_state=42, stratify=y
)

print(f"Train set: {len(X_train)}")
print(f"Test set: {len(X_test)}")
print(f"\nTrain distribution:\n{y_train.value_counts(normalize=True)}")

Train set: 2773
Test set: 694

Train distribution:
sarcastic
0    0.75009
1    0.24991
Name: proportion, dtype: float64


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')
roberta_model.eval()
roberta_model.to(device)

def get_roberta_embedding_batch(texts, batch_size=32):
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]

        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            truncation=True,
            max_length=128,
            padding=True
        ).to(device)

        with torch.no_grad():
            outputs = roberta_model(**inputs)

        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)

print("Extracting RoBERTa embeddings for training set...")
X_train_roberta = get_roberta_embedding_batch(X_train.tolist(), batch_size=32)

print("Extracting RoBERTa embeddings for test set...")
X_test_roberta = get_roberta_embedding_batch(X_test.tolist(), batch_size=32)

print(f"Train embeddings shape: {X_train_roberta.shape}")
print(f"Test embeddings shape: {X_test_roberta.shape}")

Using device: cpu


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracting RoBERTa embeddings for training set...


100%|██████████| 87/87 [15:45<00:00, 10.87s/it]


Extracting RoBERTa embeddings for test set...


100%|██████████| 22/22 [02:48<00:00,  7.67s/it]

Train embeddings shape: (2773, 768)
Test embeddings shape: (694, 768)





In [11]:
scaler = StandardScaler()
X_train_feat_scaled = scaler.fit_transform(X_train_feat)
X_test_feat_scaled = scaler.transform(X_test_feat)

X_train_combined = np.hstack([X_train_roberta, X_train_feat_scaled])
X_test_combined = np.hstack([X_test_roberta, X_test_feat_scaled])

print(f"Combined train shape: {X_train_combined.shape}")
print(f"Combined test shape: {X_test_combined.shape}")

Combined train shape: (2773, 793)
Combined test shape: (694, 793)


In [12]:
smote_tomek = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = smote_tomek.fit_resample(X_train_combined, y_train)

print(f"Original train shape: {X_train_combined.shape}")
print(f"Balanced train shape: {X_train_balanced.shape}")
print(f"\nBalanced distribution:\n{pd.Series(y_train_balanced).value_counts(normalize=True)}")

Original train shape: (2773, 793)
Balanced train shape: (4086, 793)

Balanced distribution:
sarcastic
1    0.5
0    0.5
Name: proportion, dtype: float64


In [54]:
def objective_lr(trial):
    params = {
        'C': trial.suggest_float('C', 0.001, 100, log=True),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'saga']),
        'max_iter': 1000,
        'random_state': 42
    }

    model = LogisticRegression(**params)

    scores = cross_val_score(
        model, X_train_balanced, y_train_balanced,
        cv=3, scoring= make_scorer(f1_score, pos_label=1), n_jobs=-1
    )

    return scores.mean()

print("Optimizing Logistic Regression hyperparameters...")
study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(objective_lr, n_trials=20, show_progress_bar=True)

print(f"\nBest LR F1: {study_lr.best_value:.4f}")
print(f"Best params: {study_lr.best_params}")

[I 2025-11-18 01:49:56,630] A new study created in memory with name: no-name-8dc5fd4b-eec7-4107-a99b-f2e90f1964b9


Optimizing Logistic Regression hyperparameters...


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-11-18 01:51:15,365] Trial 0 finished with value: 0.6064598015942105 and parameters: {'C': 0.014388279798556362, 'solver': 'saga'}. Best is trial 0 with value: 0.6064598015942105.
[I 2025-11-18 01:51:27,735] Trial 1 finished with value: 0.7577150509540616 and parameters: {'C': 54.345045255274314, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7577150509540616.
[I 2025-11-18 01:52:03,264] Trial 2 finished with value: 0.601168758933392 and parameters: {'C': 0.0046616013726508395, 'solver': 'saga'}. Best is trial 1 with value: 0.7577150509540616.
[I 2025-11-18 01:53:42,588] Trial 3 finished with value: 0.6387569781971082 and parameters: {'C': 0.08761917615668842, 'solver': 'saga'}. Best is trial 1 with value: 0.7577150509540616.
[I 2025-11-18 01:55:43,935] Trial 4 finished with value: 0.696910289014645 and parameters: {'C': 6.51481920933189, 'solver': 'saga'}. Best is trial 1 with value: 0.7577150509540616.
[I 2025-11-18 01:55:44,384] Trial 5 finished with value: 0.5998081336568

In [55]:
best_lr = LogisticRegression(**study_lr.best_params, max_iter=1000, random_state=42)
best_lr.fit(X_train_balanced, y_train_balanced)
y_pred = best_lr.predict(X_test_combined)
y_pred_proba = best_lr.predict_proba(X_test)[:, 1]
print("\nClassification Report Logistic Regression:")
print(classification_report(y_test, y_pred, digits=4))



Classification Report Logistic Regression:
              precision    recall  f1-score   support

           0     0.8076    0.6942    0.7466       520
           1     0.3563    0.5057    0.4181       174

    accuracy                         0.6470       694
   macro avg     0.5819    0.6000    0.5823       694
weighted avg     0.6944    0.6470    0.6643       694



In [21]:
lr= LogisticRegression(C= 90, solver='lbfgs', max_iter=1000, random_state=42)
lr.fit(X_train_balanced, y_train_balanced)
y_pred = lr.predict(X_test_combined)
y_pred_proba = lr.predict_proba(X_test_combined)[:, 1]
print("\nClassification Report Logistic Regression:")
print(classification_report(y_test, y_pred, digits=4))


Classification Report Logistic Regression:
              precision    recall  f1-score   support

           0     0.8090    0.6923    0.7461       520
           1     0.3574    0.5115    0.4208       174

    accuracy                         0.6470       694
   macro avg     0.5832    0.6019    0.5835       694
weighted avg     0.6958    0.6470    0.6646       694



In [56]:
def objective_xgb(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth' , 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)

    scores = cross_val_score(
        model, X_train_balanced, y_train_balanced,
        cv=3, scoring= make_scorer(f1_score, pos_label=1), n_jobs=-1
    )

    return scores.mean()

print("Optimizing XGBoost hyperparameters...")
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=20, show_progress_bar=True)

print(f"\nBest XGBoost F1: {study_xgb.best_value:.4f}")
print(f"Best params: {study_xgb.best_params}")

[I 2025-11-18 01:59:45,205] A new study created in memory with name: no-name-64de4a28-7499-4d01-a6af-033cdfa27aee


Optimizing XGBoost hyperparameters...


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-11-18 02:01:44,285] Trial 0 finished with value: 0.8512704858434246 and parameters: {'max_depth': 9, 'learning_rate': 0.047417760808805415, 'n_estimators': 309, 'min_child_weight': 1, 'subsample': 0.9277875204513576, 'colsample_bytree': 0.6676139121860095, 'gamma': 1.5412308656685862, 'reg_alpha': 4.57131271835514, 'reg_lambda': 2.041171063620456}. Best is trial 0 with value: 0.8512704858434246.
[I 2025-11-18 02:02:30,182] Trial 1 finished with value: 0.8286637708724224 and parameters: {'max_depth': 10, 'learning_rate': 0.11927336763982063, 'n_estimators': 208, 'min_child_weight': 8, 'subsample': 0.7187844597037767, 'colsample_bytree': 0.8343294467831982, 'gamma': 1.6774785982133973, 'reg_alpha': 4.444127456543827, 'reg_lambda': 1.5438639341430331}. Best is trial 0 with value: 0.8512704858434246.
[I 2025-11-18 02:04:53,249] Trial 2 finished with value: 0.8525663492372945 and parameters: {'max_depth': 7, 'learning_rate': 0.020991190617263924, 'n_estimators': 284, 'min_child_weig

In [57]:
best_xgb = XGBClassifier(**study_xgb.best_params)
best_xgb.fit(X_train_balanced, y_train_balanced)
y_pred = best_xgb.predict(X_test_combined)
y_pred_proba = best_xgb.predict_proba(X_test_combined)[:, 1]
print("\nClassification Report XGBoost:")
print(classification_report(y_test, y_pred, digits=4))


Classification Report XGBoost:
              precision    recall  f1-score   support

           0     0.7941    0.8827    0.8361       520
           1     0.4741    0.3161    0.3793       174

    accuracy                         0.7406       694
   macro avg     0.6341    0.5994    0.6077       694
weighted avg     0.7139    0.7406    0.7215       694



In [40]:
xgb = XGBClassifier(max_depth= 6, learning_rate=0.03, n_estimators= 400, min_child_weight= 4, subsample= 0.9, colsample_bytree= 0.9, gamma=0.003, reg_alpha= 3.5, reg_lambda= 3.6)
xgb.fit(X_train_balanced, y_train_balanced)
y_pred = xgb.predict(X_test_combined)
y_pred_proba = xgb.predict_proba(X_test_combined)[:, 1]
print("\nClassification Report XGBoost:")
print(classification_report(y_test, y_pred, digits=4))


Classification Report XGBoost:
              precision    recall  f1-score   support

           0     0.7943    0.8615    0.8266       520
           1     0.4462    0.3333    0.3816       174

    accuracy                         0.7291       694
   macro avg     0.6202    0.5974    0.6041       694
weighted avg     0.7070    0.7291    0.7150       694



In [33]:
from sklearn.svm import SVC
def objective_svc(trial):
    params = {
        'C': trial.suggest_float('C', 0.01, 100, log=True),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
        'degree': trial.suggest_int('degree', 2, 5),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'random_state': 42
    }

    model = SVC(**params)

    scores = cross_val_score(
        model, X_train_balanced, y_train_balanced,
        cv=3, scoring=make_scorer(f1_score, pos_label=1), n_jobs=-1
    )

    return scores.mean()

print("Optimizing SVC hyperparameters...")
study_svc = optuna.create_study(direction='maximize')
study_svc.optimize(objective_svc, n_trials=20, show_progress_bar=True)

print(f"\nBest SVC F1: {study_svc.best_value:.4f}")
print(f"Best params: {study_svc.best_params}")

[I 2025-11-18 05:52:51,008] A new study created in memory with name: no-name-0f009575-3e9d-4733-9a58-7c624d30d6aa


Optimizing SVC hyperparameters...


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-11-18 05:53:24,576] Trial 0 finished with value: 0.6635887386260468 and parameters: {'C': 0.029901306641247, 'kernel': 'linear', 'gamma': 'auto', 'degree': 5, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6635887386260468.
[I 2025-11-18 05:54:06,020] Trial 1 finished with value: 0.7546788727667911 and parameters: {'C': 83.64337050711964, 'kernel': 'rbf', 'gamma': 'scale', 'degree': 3, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.7546788727667911.
[I 2025-11-18 05:54:27,554] Trial 2 finished with value: 0.7663541822202257 and parameters: {'C': 89.1263946437994, 'kernel': 'poly', 'gamma': 'scale', 'degree': 3, 'class_weight': None}. Best is trial 2 with value: 0.7663541822202257.
[I 2025-11-18 05:54:54,665] Trial 3 finished with value: 0.7082982834003005 and parameters: {'C': 12.984413543233593, 'kernel': 'poly', 'gamma': 'scale', 'degree': 3, 'class_weight': 'balanced'}. Best is trial 2 with value: 0.7663541822202257.
[I 2025-11-18 05:55:18,908] Trial 

In [43]:
best_svc=SVC(**study_svc.best_params,probability=True)
best_svc.fit(X_train_balanced, y_train_balanced)
y_pred = best_svc.predict(X_test_combined)

print("\nClassification Report SVC:")
print(classification_report(y_test, y_pred, digits=4))


Classification Report SVC:
              precision    recall  f1-score   support

           0     0.8151    0.6865    0.7453       520
           1     0.3633    0.5345    0.4326       174

    accuracy                         0.6484       694
   macro avg     0.5892    0.6105    0.5889       694
weighted avg     0.7018    0.6484    0.6669       694



In [58]:
def objective_rf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'random_state': 42,
        'n_jobs': -1
    }

    model = RandomForestClassifier(**params)

    scores = cross_val_score(
        model, X_train_balanced, y_train_balanced,
        cv=3, scoring= make_scorer(f1_score, pos_label=1), n_jobs=-1
    )

    return scores.mean()

print("Optimizing Random Forest hyperparameters...")
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_rf, n_trials=20, show_progress_bar=True)

print(f"\nBest RF F1: {study_rf.best_value:.4f}")
print(f"Best params: {study_rf.best_params}")

[I 2025-11-18 02:35:54,309] A new study created in memory with name: no-name-1a29d806-2e85-4d78-8ebd-71ef548bc9f3


Optimizing Random Forest hyperparameters...


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-11-18 02:36:03,449] Trial 0 finished with value: 0.8416455936722312 and parameters: {'n_estimators': 116, 'max_depth': 28, 'min_samples_split': 10, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 0 with value: 0.8416455936722312.
[I 2025-11-18 02:36:17,337] Trial 1 finished with value: 0.8342472543998136 and parameters: {'n_estimators': 170, 'max_depth': 23, 'min_samples_split': 19, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 0 with value: 0.8416455936722312.
[I 2025-11-18 02:36:39,573] Trial 2 finished with value: 0.8625357261391747 and parameters: {'n_estimators': 288, 'max_depth': 12, 'min_samples_split': 15, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 2 with value: 0.8625357261391747.
[I 2025-11-18 02:37:05,859] Trial 3 finished with value: 0.8444185839775412 and parameters: {'n_estimators': 360, 'max_depth': 13, 'min_samples_split': 11, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 2 with value: 0.8625357261

In [59]:
best_rf = RandomForestClassifier(**study_rf.best_params, random_state=42, n_jobs=-1)
best_rf.fit(X_train_balanced, y_train_balanced)
y_pred = best_rf.predict(X_test_combined)
y_pred_proba = best_rf.predict_proba(X_test_combined)[:, 1]
print("\nClassification Report RandomForestClassifier:")
print(classification_report(y_test, y_pred, digits=4))


Classification Report RandomForestClassifier:
              precision    recall  f1-score   support

           0     0.7769    0.9173    0.8413       520
           1     0.4625    0.2126    0.2913       174

    accuracy                         0.7406       694
   macro avg     0.6197    0.5650    0.5663       694
weighted avg     0.6981    0.7406    0.7034       694



In [46]:
ensemble = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('svc', best_svc)
    ],
    voting='hard',
    weights=[1, 2]
)

print("Training ensemble model...")
ensemble.fit(X_train_balanced, y_train_balanced)
y_pred=ensemble.predict(X_test_combined)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

Training ensemble model...

Classification Report:
              precision    recall  f1-score   support

           0     0.8042    0.7423    0.7720       520
           1     0.3738    0.4598    0.4124       174

    accuracy                         0.6715       694
   macro avg     0.5890    0.6010    0.5922       694
weighted avg     0.6963    0.6715    0.6818       694



In [47]:
df_test=pd.read_csv('/content/task_A_En_test.csv')
df_test

Unnamed: 0,tweet,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0
...,...,...
1395,I’ve just seen this and felt it deserved a Ret...,0
1396,Omg how an earth is that a pen !!! 🤡,0
1397,Bringing Kanye and drake to a tl near you,0
1398,"I love it when women are referred to as ""girl ...",1


In [51]:
features_df = df['tweet'].apply(extract_comprehensive_features).apply(pd.Series)
df_test = pd.concat([df_test, features_df], axis=1)

In [None]:
X_test_text = df_test['tweet'].fillna('').astype(str).tolist()  # Convert to string and handle NaN
X_test1 = get_roberta_embedding_batch(X_test_text, batch_size=32)

# Step 2: Get other features
X_feat = df_test.drop(columns=['tweet', 'sarcastic'])

# Step 3: Stack embeddings with other features
X_test1 = np.hstack([X_test1, X_feat.values])
y_true=df_test['sarcastic']
# Step 4: Make predictions
y_pred = best_svc.predict(X_test1)
print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=4))

 18%|█▊        | 20/109 [02:39<13:01,  8.78s/it]