In [11]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertModel
import torch
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier



In [2]:

df = pd.read_csv('iSarcasmEval_EN/train.En.csv', index_col=0)
df.head()

Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0
1,I love it when professors draw a big question ...,1,I do not like when professors don’t write out ...,1.0,0.0,0.0,0.0,0.0,0.0
2,Remember the hundred emails from companies whe...,1,"I, at the bare minimum, wish companies actuall...",0.0,1.0,0.0,0.0,0.0,0.0
3,Today my pop-pop told me I was not “forced” to...,1,"Today my pop-pop told me I was not ""forced"" to...",1.0,0.0,0.0,0.0,0.0,0.0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,I would say Ted Cruz is an asshole and doesn’t...,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
print(f"Nombre de lignes : {len(df)}")
print(f"Nombre de colonnes : {len(df.columns)}")
print(f"\nColonnes : {list(df.columns)}")

Nombre de lignes : 3468
Nombre de colonnes : 9

Colonnes : ['tweet', 'sarcastic', 'rephrase', 'sarcasm', 'irony', 'satire', 'understatement', 'overstatement', 'rhetorical_question']


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3468 entries, 0 to 3467
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tweet                3467 non-null   object 
 1   sarcastic            3468 non-null   int64  
 2   rephrase             867 non-null    object 
 3   sarcasm              867 non-null    float64
 4   irony                867 non-null    float64
 5   satire               867 non-null    float64
 6   understatement       867 non-null    float64
 7   overstatement        867 non-null    float64
 8   rhetorical_question  867 non-null    float64
dtypes: float64(6), int64(1), object(2)
memory usage: 270.9+ KB


In [None]:
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Colonne': missing_values.index,
    'Valeurs manquantes': missing_values.values,
    'Pourcentage': missing_percent.values
})

missing_df[missing_df['Valeurs manquantes'] > 0]

Unnamed: 0,Colonne,Valeurs manquantes,Pourcentage
0,tweet,1,0.028835
2,rephrase,2601,75.0
3,sarcasm,2601,75.0
4,irony,2601,75.0
5,satire,2601,75.0
6,understatement,2601,75.0
7,overstatement,2601,75.0
8,rhetorical_question,2601,75.0


In [None]:
sarcastic_counts = df['sarcastic'].value_counts()
sarcastic_percent = df['sarcastic'].value_counts(normalize=True) * 100

print("Distribution des classes :")
for val, count in sarcastic_counts.items():
    label = "Sarcastique" if val == 1 else "Non-sarcastique"
    print(f"{label:20s} : {count} ({sarcastic_percent[val]}%)")

Distribution des classes :
Non-sarcastique      : 2601 (75.0%)
Sarcastique          : 867 (25.0%)


In [3]:
def clean_text(text):
    if pd.isna(text):
        return ""
    
    text = text.lower()
    
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    text = re.sub(r'@\w+', '', text)
    
    text = re.sub(r'#(\w+)', r'\1', text)
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    text = re.sub(r'\d+', '', text)
    
    text = ' '.join(text.split())
    
    return text

df['tweet_clean'] = df['tweet'].apply(clean_text)
df[['tweet', 'tweet_clean']].head()

Unnamed: 0,tweet,tweet_clean
0,The only thing I got from college is a caffein...,the only thing i got from college is a caffein...
1,I love it when professors draw a big question ...,i love it when professors draw a big question ...
2,Remember the hundred emails from companies whe...,remember the hundred emails from companies whe...
3,Today my pop-pop told me I was not “forced” to...,today my poppop told me i was not “forced” to ...
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,i did too and i also reported cancun cruz not ...


In [5]:
X = df['tweet_clean']
y = df['sarcastic']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Taille de train set : {len(X_train)}")
print(f"Taille de test set : {len(X_val)}")
print(f"\nDistribution dans train set :")
print(y_train.value_counts(normalize=True))
print(f"\nDistribution dans test set :")
print(y_val.value_counts(normalize=True))

Taille de train set : 2774
Taille de test set : 694

Distribution dans train set :
sarcastic
0    0.74982
1    0.25018
Name: proportion, dtype: float64

Distribution dans test set :
sarcastic
0    0.75072
1    0.24928
Name: proportion, dtype: float64


In [6]:
tfidf = TfidfVectorizer(
    max_features=5000,           
    min_df=2,                     
    max_df=0.8,                  
    ngram_range=(1, 2),          
    stop_words='english'      
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

print(f"Shape de X_train_tfidf : {X_train_tfidf.shape}")
print(f"Shape de X_val_tfidf : {X_val_tfidf.shape}")
print(f"\nNombre de features TF-IDF : {len(tfidf.get_feature_names_out())}")

Shape de X_train_tfidf : (2774, 3600)
Shape de X_val_tfidf : (694, 3600)

Nombre de features TF-IDF : 3600


In [None]:
feature_names = tfidf.get_feature_names_out()
print("Exemples de features :")
print(feature_names[:20])

Exemples de features :
['ability' 'able' 'able sleep' 'absolute' 'absolutely'
 'absolutely delicious' 'absolutely love' 'abt' 'abusive' 'ac' 'academic'
 'academic writing' 'accepted' 'access' 'accidentally' 'accomplished'
 'according' 'account' 'account going' 'achievement']


In [None]:
model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)

y_val_pred = model.predict(X_val_tfidf)

print("\nAccuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))


Accuracy: 0.6325648414985591

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       521
           1       0.29      0.33      0.31       173

    accuracy                           0.63       694
   macro avg       0.53      0.53      0.53       694
weighted avg       0.65      0.63      0.64       694



In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128, padding=True)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy().flatten()

X_train_bert = np.array([get_bert_embedding(text) for text in X_train])
X_val_bert = np.array([get_bert_embedding(text) for text in X_val])

print(f"X_train_bert shape: {X_train_bert.shape}")

X_train_bert shape: (2774, 768)


In [None]:
model_bert = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model_bert.fit(X_train_bert, y_train)

y_val_pred_bert = model_bert.predict(X_val_bert)

print("\nAccuracy:", accuracy_score(y_val, y_val_pred_bert))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_bert))


Accuracy: 0.6210374639769453

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.67      0.73       521
           1       0.32      0.48      0.39       173

    accuracy                           0.62       694
   macro avg       0.56      0.57      0.56       694
weighted avg       0.68      0.62      0.64       694



In [None]:
rus = RandomUnderSampler(random_state=42)
X_train_tfidf_resampled, y_train_resampled = rus.fit_resample(X_train_tfidf, y_train)

print("Before undersampling:")
print(y_train.value_counts())
print("\nAfter undersampling:")
print(pd.Series(y_train_resampled).value_counts())

model_undersample = LogisticRegression(max_iter=1000, random_state=42)
model_undersample.fit(X_train_tfidf_resampled, y_train_resampled)

y_val_pred_undersample = model_undersample.predict(X_val_tfidf)

print("\nAccuracy:", accuracy_score(y_val, y_val_pred_undersample))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_undersample))

Before undersampling:
sarcastic
0    2080
1     694
Name: count, dtype: int64

After undersampling:
sarcastic
0    694
1    694
Name: count, dtype: int64

Accuracy: 0.5389048991354467

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.55      0.64       521
           1       0.27      0.51      0.36       173

    accuracy                           0.54       694
   macro avg       0.52      0.53      0.50       694
weighted avg       0.65      0.54      0.57       694



In [7]:
smote = SMOTE(random_state=42)
X_train_tfidf_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

print("Before SMOTE:")
print(y_train.value_counts())
print("\nAfter SMOTE:")
print(pd.Series(y_train_smote).value_counts())

model_smote = LogisticRegression(max_iter=1000, random_state=42)
model_smote.fit(X_train_tfidf_smote, y_train_smote)

y_val_pred_smote = model_smote.predict(X_val_tfidf)

print("\nAccuracy:", accuracy_score(y_val, y_val_pred_smote))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_smote))

Before SMOTE:
sarcastic
0    2080
1     694
Name: count, dtype: int64

After SMOTE:
sarcastic
1    2080
0    2080
Name: count, dtype: int64

Accuracy: 0.6282420749279539

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.71      0.74       521
           1       0.30      0.38      0.34       173

    accuracy                           0.63       694
   macro avg       0.54      0.55      0.54       694
weighted avg       0.66      0.63      0.64       694



In [12]:
smote = SMOTE(random_state=42)
X_train_bert_smote, y_train_bert_smote = smote.fit_resample(X_train_bert, y_train)

xgb_model = XGBClassifier(
    scale_pos_weight=3,
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
xgb_model.fit(X_train_bert_smote, y_train_bert_smote)

y_val_pred_xgb = xgb_model.predict(X_val_bert)

print("Accuracy:", accuracy_score(y_val, y_val_pred_xgb))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_xgb))

Accuracy: 0.6368876080691642

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.68      0.74       521
           1       0.35      0.52      0.42       173

    accuracy                           0.64       694
   macro avg       0.58      0.60      0.58       694
weighted avg       0.69      0.64      0.66       694

