# 1. Setup

In [1]:
import io
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import joblib
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download it_core_news_sm
!python -m spacy download es_core_news_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12.8 MB 4.4 MB/s eta 0:00:01
[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting it-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.7.0/it_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[K     |‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13.0 MB 7.6 MB/s eta 0:00:01
[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
Collecting es-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_ne

# 2. Load data

In [2]:
train_es = pd.read_csv("train_es.csv", encoding="utf-8")
train_en = pd.read_csv("train_en.csv", encoding="utf-8")
train_it = pd.read_csv("train_it.csv", encoding="utf-8")
test_es = pd.read_csv("es_test.csv", encoding="utf-8")
test_en = pd.read_csv("en_test.csv", encoding="utf-8")
test_it = pd.read_csv("it_test.csv", encoding="utf-8")

# 3. Dataset Integration (Italian, English, Spanish)

We first merge the three monolingual datasets into a multilingual dataset (one for training and one for validation).
This allows us to train a unified model capable of handling input from different languages, increasing data size and improving model robustness.

In [3]:
#TRAINING SET

# 1. Adding missing 'bio' column to English dataset
train_en["bio"] = ""   # English dataset has no biography field

# 2. Select and reorder columns for IT and ES (they already have bio)
train_it = train_it[["id", "text", "bio", "label", "lang"]]
train_es = train_es[["id", "text", "bio", "label", "lang"]]

# 3. Select and reorder columns for EN (we just added bio)
train_en = train_en[["id", "text", "bio", "label", "lang"]]

# 4. Concatenate all datasets into a single multilingual dataset
train_multi = pd.concat([train_it, train_en, train_es], ignore_index=True)

# 5. Quick checks
print(train_multi.head())
print("\nShape:", train_multi.shape)
print("\nColumns:", train_multi.columns.tolist())
print("\nLabel distribution:")
print(train_multi["label"].value_counts(normalize=True))
train_multi.to_csv("train_Multi.csv")

#TEST SET

# 1. Adding missing 'bio' column to English dataset
test_en["bio"] = ""   # English dataset has no biography field

# 2. Select and reorder columns for IT and ES (they already have bio)
test_it = test_it[["id", "text", "bio", "lang"]]
test_es = test_es[["id", "text", "bio", "lang"]]

# 3. Select and reorder columns for EN (we just added bio)
test_en = test_en[["id", "text", "bio", "lang"]]

# 4. Concatenate all datasets into a single multilingual dataset
test_multi = pd.concat([test_it, test_en, test_es], ignore_index=True)

# 5. Quick checks
print(test_multi.head())
print("\nShape:", test_multi.shape)
print("\nColumns:", test_multi.columns.tolist())
test_multi.to_csv("test_Multi.csv")

        id                                               text  \
0  it_1231  La destra Italiana pur di non dire che loro od...   
1  it_1713  "Presupporre che tutti i bisessuali non sono m...   
2  it_1474  Se i diritti devono essere uguali, voglio che ...   
3    it_58  che poi molti uomini trans subiscono lesbofobi...   
4   it_511  Che poi √® l‚Äôetero medio come Pio e Amedeo che ...   

                                                 bio  label lang  
0  Il rispetto per il prossimo qualunque sia il s...      0   it  
1                   ùìïùì≤ùìµùìµùìÆùì≠ ùîÄùì≤ùìΩùì± ùìØùìæùìªùîÇ ùì™ùì∑ùì≠ ùìºùìΩùì™ùìªùìªùîÇ ùìÆùîÇùìÆùì≠      0   it  
2                       User Experience Designer URL      0   it  
3               no matter where i go, you're there ‚Ä¶      0   it  
4  T'appartengo ed io ci tengo \nE se prometto po...      0   it  

Shape: (2988, 5)

Columns: ['id', 'text', 'bio', 'label', 'lang']

Label distribution:
label
0    0.85676
1    0.14324
N

# 4. Concatenatenation of the text and the biography into a single field:

In [4]:
#TRAINING SET

train_multi["bio"] = train_multi["bio"].fillna("")
train_multi["text_full"] = train_multi["text"] + " " + train_multi["bio"]

#TEST SET

test_multi["bio"] = test_multi["bio"].fillna("")
test_multi["text_full"] = test_multi["text"] + " " + test_multi["bio"]

# 5. Multilingual Text Pre-processing

This section performs the full preprocessing pipeline on the multilingual datasets.  
The code loads the spaCy models for Italian, English, and Spanish, defines multilingual stopwords, and applies language-specific lemmatization. Before lemmatization, the text is cleaned by removing URLs, mentions, numbers, punctuation, emojis, and extra whitespace.  

In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords
import spacy

# 0. Loading spaCy models

nlp_en = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp_it = spacy.load("it_core_news_sm", disable=["ner", "parser"])
nlp_es = spacy.load("es_core_news_sm", disable=["ner", "parser"])


# 1. Downloading NLTK resources 

nltk.download('stopwords')


# 2. Multilingual stopwords 

stop_it = set(stopwords.words('italian'))
stop_en = set(stopwords.words('english'))
stop_es = set(stopwords.words('spanish'))
STOPWORDS = stop_it.union(stop_en).union(stop_es)


# 3. Lemmatization (stopwords removed *inside* this function)

def lemmatize_multilingual(text_full, lang, stopwords_set=STOPWORDS):
    if not isinstance(text_full, str) or text_full.strip() == "":
        return ""

    # choose spaCy model based on language
    if lang == "en":
        doc = nlp_en(text_full)
    elif lang == "it":
        doc = nlp_it(text_full)
    elif lang == "es":
        doc = nlp_es(text_full)
    else:
        # fallback to English if unknown
        doc = nlp_en(text_full)

    lemmas = []
    for token in doc:
        if token.is_space or token.is_punct:
            continue
        lemma = token.lemma_.lower()
        # filter stopwords
        if lemma and lemma not in stopwords_set:
            lemmas.append(lemma)

    return " ".join(lemmas)


# 4. Cleaning Functions

def remove_urls(text_full):
    return re.sub(r'https?://\S+|www\.\S+', '', text_full)

def remove_emojis(text_full):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"
                               u"\U0001F300-\U0001F5FF"
                               u"\U0001F680-\U0001F6FF"
                               u"\U0001F1E0-\U0001F1FF"
                               u"\U00002500-\U00002BEF"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text_full)

def remove_mention(text_full):
    return re.sub(r'@[\w\-]+', '', text_full)

def remove_numbers(text_full):
    return re.sub(r'\d+', '', text_full)

def remove_punctuation(text_full, punctuation=None):
    if punctuation is None:
        punctuation = string.punctuation
    return text_full.translate(str.maketrans(punctuation, ' '*len(punctuation)))

def remove_extra_whitespace(text_full):
    return " ".join(text_full.split())

def remove_retweet(text_full):
    return re.sub(r'^RT\s+', '', text_full)


# 5. Unified preprocessing function

def preprocessing(text_full, lang):
    text_full = str(text_full)
    text_full = remove_emojis(text_full)
    text_full = remove_urls(text_full)
    text_full = remove_mention(text_full)
    text_full = remove_retweet(text_full)
    text_full = remove_numbers(text_full)
    text_full = remove_punctuation(text_full)

    # normalization
    text_full = text_full.lower()
    text_full = remove_extra_whitespace(text_full)

    # lemmatization 
    text_clean = lemmatize_multilingual(text_full, lang)

    return text_clean


# 6. Apply preprocessing to training and test sets

train_multi["text_clean"] = train_multi.apply(
    lambda row: preprocessing(row["text_full"], row["lang"]),
    axis=1
)

test_multi["text_clean"] = test_multi.apply(
    lambda row: preprocessing(row["text_full"], row["lang"]),
    axis=1
)

# Check
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

train_multi.head()
test_multi.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/giulia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,text,bio,lang,text_full,text_clean
0,it_406,@USER @USER Oggi avr√≤ di che parlare coi colleghi.. un etero analfabeta che conquista l attenzione di una checca alfabetizzata üòé mi raccomando vai a fare la quarta dose che forse ti aiuta a dimenticarmi. Ciao,,it,@USER @USER Oggi avr√≤ di che parlare coi colleghi.. un etero analfabeta che conquista l attenzione di una checca alfabetizzata üòé mi raccomando vai a fare la quarta dose che forse ti aiuta a dimenticarmi. Ciao,oggi avere parlare con il collega etero analfabeta conquistare attenzione checca alfabetizzato raccomare vai fare quarto doso forse aiutare dimenticare mi ciao
1,it_138,"@USER Il problema √® che c‚Äô√® tantissima omofobia anche tra noi froci. Basti pensare a quanti se ne fregano dei diritti della comunit√† Lbgt+. E a quanti votano per la destra omofoba, Lega, FdI.... #DDLZan sarebbe da approvare da tutto il Parlamento.... e invece...","Slave, passive, submissive. Available for Master(s), couples. Love to service strong minded men. Great at oral/rimming. Cerco a Roma Keyholder esperto",it,"@USER Il problema √® che c‚Äô√® tantissima omofobia anche tra noi froci. Basti pensare a quanti se ne fregano dei diritti della comunit√† Lbgt+. E a quanti votano per la destra omofoba, Lega, FdI.... #DDLZan sarebbe da approvare da tutto il Parlamento.... e invece... Slave, passive, submissive. Available for Master(s), couples. Love to service strong minded men. Great at oral/rimming. Cerco a Roma Keyholder esperto",problema essere essere omofobiare frocio bastare pensare fregare di il diritto di il comunit√† lbgt votare destra omofoba legare fdi ddlzan essere approvare parlamento invece slave passivo submissive available master couples love service strong minded men great oral rimming cercare roma keyholder esperto
2,it_1622,dua lipa bella hadid kendall jenner hunter schafer zendaya mi rendono debole e ancora pi√π lella.,ÀóÀèÀãyou bring me homeÀäÀó she / her ‚Ä∫‚Ä∫‚Ä∫‚Ä∫‚Ä∫,it,dua lipa bella hadid kendall jenner hunter schafer zendaya mi rendono debole e ancora pi√π lella. ÀóÀèÀãyou bring me homeÀäÀó she / her ‚Ä∫‚Ä∫‚Ä∫‚Ä∫‚Ä∫,dua lipa bello hadid kendall jenner hunter schafer zendaya rendere debole ancora lella ÀóÀèÀãyou bring homeÀäÀó
3,it_1401,"üè≥Ô∏è‚Äçüåàüè≥Ô∏è‚Äçüåàüè≥Ô∏è‚Äçüåà\nFatelo per lui, per tutte le vittime di omofobia e in ultimo per me che ""Frocio"" me l'hanno fatto uscire dalle orecchie quando andavo a scuola #tzvip",Cit - Allora scusate ma se io mi metto una bottiglia di San Pellegrino nel culo non querelo la San Pellegrino\nAcquario Ascendente Gemelli\nHe/Him üè≥Ô∏è‚Äçüåà,it,"üè≥Ô∏è‚Äçüåàüè≥Ô∏è‚Äçüåàüè≥Ô∏è‚Äçüåà\nFatelo per lui, per tutte le vittime di omofobia e in ultimo per me che ""Frocio"" me l'hanno fatto uscire dalle orecchie quando andavo a scuola #tzvip Cit - Allora scusate ma se io mi metto una bottiglia di San Pellegrino nel culo non querelo la San Pellegrino\nAcquario Ascendente Gemelli\nHe/Him üè≥Ô∏è‚Äçüåà",fatelo vittima omofobia ultimo frocio avere fare uscire da il orecchia quando andare scuola tzvip cit allora scusare mettere bottiglia san pellegrino in il culo querelo san pellegrino acquario ascendente gemello
4,it_807,"No Pride fisico, ma potete sempre dare uno sguardo alle mie magliettine frocine ‚ú® \n#PrideMonth2020 #redbubble \n\nURL","Mi piace disegnare, mi piacciono i videogiochi. üè≥Ô∏è‚Äçüåà Lui/Suo He/His ‚ú®Tshirt, Stickers, Prints üëïRedbubble ShURL",it,"No Pride fisico, ma potete sempre dare uno sguardo alle mie magliettine frocine ‚ú® \n#PrideMonth2020 #redbubble \n\nURL Mi piace disegnare, mi piacciono i videogiochi. üè≥Ô∏è‚Äçüåà Lui/Suo He/His ‚ú®Tshirt, Stickers, Prints üëïRedbubble ShURL",prido fisico potere sempre dare sguardo a il magliettina frocino pridemonth redbubble url piacere disegnare piacere videogioco evere tshirt stickers prints redbubble shurl


# 5. Train/Validation Split 80% ‚Äì 20%

Before training and evaluating our multilingual models, we split the combined training dataset into a training set and a validation set. This separation allows us to tune hyperparameters, compare different models, and assess performance on unseen data without ever touching the official test set. Stratification ensures that the class imbalance is preserved across both splits, making the evaluation reliable and representative.

In [6]:
# Importing the function
from sklearn.model_selection import train_test_split

# Splitting the dataset
train_df, val_df = train_test_split(
    train_multi,
    test_size=0.2,               # 20% for validation
    stratify=train_multi["label"],  # <-- CORRECT
    random_state=42
)

# Printing the sizes of the splits
print("Train size:", len(train_df))
print("Val size:", len(val_df))

# Checking label distribution in each split
print("\nTrain label distribution:")
print(train_df["label"].value_counts(normalize=True))

print("\nVal label distribution:")
print(val_df["label"].value_counts(normalize=True))


Train size: 2390
Val size: 598

Train label distribution:
label
0    0.856904
1    0.143096
Name: proportion, dtype: float64

Val label distribution:
label
0    0.856187
1    0.143813
Name: proportion, dtype: float64


The training‚Äìvalidation split was successful. The dataset was divided into 2390 training samples and 598 validation samples, keeping exactly the same class imbalance in both sets (~85.7% class 0 and ~14.3% class 1).
This confirms that stratification worked correctly, ensuring that the validation set is representative of the real label distribution and suitable for reliable model evaluation.

# 6. Text Representation

## BOW

The Bag-of-Words represents each document as a vector counting how many times each word (or n-gram) appears. It ignores grammar and word order, focusing only on frequency. This makes it simple and effective for text classification.

In [7]:
text_preprocessed_train = train_df["text_clean"]
text_preprocessed_val   = val_df["text_clean"]

y_train = train_df["label"]
y_val   = val_df["label"]

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import joblib


# Creating the BOW vectorizer 

vectorizer_bow = CountVectorizer(
    ngram_range=(1,3),      # uni-grams, bi-grams, tri-grams
    max_features=10000      # the most frequent 10.000 n-grams
)


# Fitting on the train set, transformation on train and validation

X_train_bow = vectorizer_bow.fit_transform(text_preprocessed_train).toarray()
X_val_bow   = vectorizer_bow.transform(text_preprocessed_val).toarray()


# dimentions' verification

print("Train BoW shape:", X_train_bow.shape)
print("Val BoW shape:",   X_val_bow.shape)

# Saving the BoW matrices 
joblib.dump(X_train_bow, 'processed_train_bow.save')
joblib.dump(X_val_bow,   'processed_val_bow.save')

Train BoW shape: (2390, 10000)
Val BoW shape: (598, 10000)


['processed_val_bow.save']

The BoW transformation ran successfully.
- Training matrix: 2390 samples √ó 10,000 features
- Validation matrix: 598 samples √ó 10,000 features

This confirms that the vectorizer extracted the 10,000 most frequent multilingual n-grams and applied them consistently to both splits.
Your data is now correctly vectorized and ready for training classification models.

## TF-IDF

TF‚ÄìIDF (Term Frequency‚ÄìInverse Document Frequency) extends BoW by weighting each word according to how important it is in the document relative to the entire corpus. Common words receive lower weights, while distinctive words receive higher weights. This helps highlight more informative terms and often improves performance in text classification tasks.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# 1. Preprocessed texts and labels from the split datasets
text_preprocessed_train = train_df["text_clean"]
text_preprocessed_val   = val_df["text_clean"]

y_train = train_df["label"]
y_val   = val_df["label"]

# 2. Create the TF-IDF vectorizer (multilingual n-grams)
vectorizer_tfidf = TfidfVectorizer(
    ngram_range=(1, 3),    # unigrams, bigrams, trigrams
    max_features=10000,    # 10k most informative n-grams
    lowercase=True
)

# 3. Fit on the training set, transform both train and validation
X_train_tfidf = vectorizer_tfidf.fit_transform(text_preprocessed_train).toarray()
X_val_tfidf   = vectorizer_tfidf.transform(text_preprocessed_val).toarray()

# 4. Check dimensions
print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Val TF-IDF shape:",   X_val_tfidf.shape)

# 5. Save TF-IDF matrices and vectorizer
joblib.dump(X_train_tfidf, "processed_train_tfidf.save")
joblib.dump(X_val_tfidf,   "processed_val_tfidf.save")
joblib.dump(vectorizer_tfidf, "tfidf_vectorizer.save")


Train TF-IDF shape: (2390, 10000)
Val TF-IDF shape: (598, 10000)


['tfidf_vectorizer.save']

The TF-IDF vectorizer successfully transformed the multilingual training and validation sets into numerical feature matrices of size (2390, 10000) and (598, 10000).
This means the model extracted the 10,000 most informative n-grams (unigrams, bigrams, trigrams) from the training corpus and applied the same vocabulary to the validation set.

# 7. Text Classification

In [9]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

# 1. Encode labels (even if they are already 0/1, this keeps the pipeline consistent)

labels_train = train_df["label"]
labels_val   = val_df["label"]

# Initialize LabelEncoder and fit ONLY on the training labels
encoder = LabelEncoder()
encoder.fit(labels_train)

# Transform labels to numeric format
y_train = encoder.transform(labels_train)
y_val   = encoder.transform(labels_val)

# Save the encoder for later use (e.g., on the test predictions)
joblib.dump(encoder, "label_encoder.save")

print("Examples of original labels:", labels_train.unique()[:5])
print("Examples of encoded labels:", y_train[:5])

# 2. Load BoW / TF-IDF matrices previously saved

# --- BoW ---
X_train_bow = joblib.load("processed_train_bow.save")
X_val_bow   = joblib.load("processed_val_bow.save")

print("X_train_bow shape:", X_train_bow.shape)
print("X_val_bow shape:  ", X_val_bow.shape)

# --- TF-IDF ---
X_train_tfidf = joblib.load("processed_train_tfidf.save")
X_val_tfidf   = joblib.load("processed_val_tfidf.save")

print("X_train_tfidf shape:", X_train_tfidf.shape)
print("X_val_tfidf shape:  ", X_val_tfidf.shape)

Examples of original labels: [0 1]
Examples of encoded labels: [0 0 0 0 1]
X_train_bow shape: (2390, 10000)
X_val_bow shape:   (598, 10000)
X_train_tfidf shape: (2390, 10000)
X_val_tfidf shape:   (598, 10000)


BoW and TF-IDF matrices loaded successfully the shapes (2390, 10000) for training and (598, 10000) for validation
show that:

- You have 2390 training samples and 598 validation samples
- Each sample is represented by a 10,000-dimensional feature vector
- Both vectorizers applied 1‚Äì3-grams correctly



## Support Vector Machines (SVM)

This cell trains two LinearSVC models using the Bag-of-Words and TF‚ÄìIDF representations without applying oversampling. The parameter `class_weight='balanced'` compensates for the strong class imbalance by increasing the contribution of minority-class errors during training.

In [10]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score


# 1. Linear SVM on Bag-of-Words

clf_bow = LinearSVC(C=0.001, class_weight='balanced', random_state=42)
clf_bow.fit(X_train_bow, y_train)

# Predictions
preds_bow = clf_bow.predict(X_val_bow)

# Macro F1
macro_f1_bow = f1_score(y_val, preds_bow, average='macro')

print("RESULTS: Bag-of-Words + LinearSVC")
print(classification_report(y_val, preds_bow, digits=3))
print("Accuracy:", accuracy_score(y_val, preds_bow))
print("Macro F1:", round(macro_f1_bow, 4))
print("Confusion matrix:\n", confusion_matrix(y_val, preds_bow))



# 2. Linear SVM on TF-IDF

clf_tfidf = LinearSVC(C=0.001, class_weight='balanced', random_state=42)
clf_tfidf.fit(X_train_tfidf, y_train)

# Predictions
preds_tfidf = clf_tfidf.predict(X_val_tfidf)

# Macro F1
macro_f1_tfidf = f1_score(y_val, preds_tfidf, average='macro')

print("RESULTS: TF-IDF + LinearSVC")
print(classification_report(y_val, preds_tfidf, digits=3))
print("Accuracy:", accuracy_score(y_val, preds_tfidf))
print("Macro F1:", round(macro_f1_tfidf, 4))
print("Confusion matrix:\n", confusion_matrix(y_val, preds_tfidf))


RESULTS: Bag-of-Words + LinearSVC
              precision    recall  f1-score   support

           0      0.966     0.787     0.868       512
           1      0.398     0.837     0.539        86

    accuracy                          0.794       598
   macro avg      0.682     0.812     0.703       598
weighted avg      0.885     0.794     0.820       598

Accuracy: 0.794314381270903
Macro F1: 0.7035
Confusion matrix:
 [[403 109]
 [ 14  72]]
RESULTS: TF-IDF + LinearSVC
              precision    recall  f1-score   support

           0      0.958     0.809     0.877       512
           1      0.410     0.791     0.540        86

    accuracy                          0.806       598
   macro avg      0.684     0.800     0.708       598
weighted avg      0.879     0.806     0.829       598

Accuracy: 0.8060200668896321
Macro F1: 0.7084
Confusion matrix:
 [[414  98]
 [ 18  68]]




Both vectorizations perform similarly with LinearSVC. TF-IDF achieves a slightly higher macro F1 (0.7084) than Bag-of-Words (0.7035). 

The consideration of the biography brings better results compared to those achived with the same models but using only the tweet.

## Logistic Distribution

This cell trains two Logistic Regression models‚Äîone using Bag-of-Words and the other using TF‚ÄìIDF. The parameter `class_weight='balanced'` is used to mitigate the severe label imbalance by assigning a higher weight to the minority class during training.

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score


# 1. LOGISTIC REGRESSION on Bag-of-Words

lr_bow = LogisticRegression(
    penalty='l2',
    max_iter=500,
    C=1.0,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

lr_bow.fit(X_train_bow, y_train)

# Predictions
preds_bow = lr_bow.predict(X_val_bow)

# Metrics
print("RESULTS: Logistic Regression (Bag-of-Words)")
print(classification_report(y_val, preds_bow))
print("Accuracy:", accuracy_score(y_val, preds_bow))
print("Macro F1:", f1_score(y_val, preds_bow, average='macro'))


# 2. LOGISTIC REGRESSION on TF-IDF

lr_tfidf = LogisticRegression(
    penalty='l2',
    max_iter=500,
    C=1.0,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

lr_tfidf.fit(X_train_tfidf, y_train)

# Predictions
preds_tfidf = lr_tfidf.predict(X_val_tfidf)

# Metrics
print("\nRESULTS: Logistic Regression (TF-IDF)")
print(classification_report(y_val, preds_tfidf))
print("Accuracy:", accuracy_score(y_val, preds_tfidf))
print("Macro F1:", f1_score(y_val, preds_tfidf, average='macro'))


RESULTS: Logistic Regression (Bag-of-Words)
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       512
           1       0.62      0.66      0.64        86

    accuracy                           0.89       598
   macro avg       0.78      0.80      0.79       598
weighted avg       0.90      0.89      0.89       598

Accuracy: 0.8929765886287625
Macro F1: 0.7887905344252886

RESULTS: Logistic Regression (TF-IDF)
              precision    recall  f1-score   support

           0       0.94      0.90      0.92       512
           1       0.53      0.65      0.59        86

    accuracy                           0.87       598
   macro avg       0.74      0.78      0.75       598
weighted avg       0.88      0.87      0.87       598

Accuracy: 0.8678929765886287
Macro F1: 0.7538902346904222


Both Bag-of-Words and TF-IDF representations produce strong performance with Logistic Regression in the multilingual setting. However, Bag-of-Words clearly performs better overall:

- Higher precision, recall, and F1-score for the minority class (label 1): BoW F1 for class 1 ‚âà 0.64 vs. TF-IDF ‚âà 0.59
- Higher Macro F1: BoW Macro F1 = 0.789; TF-IDF Macro F1 = 0.754
- Higher overall accuracy: BoW accuracy ‚âà 0.893; TF-IDF accuracy ‚âà 0.868

This indicates that BoW better captures the multilingual discriminatory patterns in this dataset. It is especially more effective at recalling and classifying the minority (hate speech) class, which is the key challenge of the task.

Best model so far: Logistic Regression + Bag-of-Words, which achieves the strongest balance between accuracy and fairness across classes.

# 8. Evaluation of the best performing model: BoW + Logistic Regression

In [12]:
# 1. Re‚Äìtrain BoW + Logistic Regression on all labeled data

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import joblib

# Full text and labels
text_full = train_multi["text_clean"]
y_full    = train_multi["label"].values

# Final BoW vectorizer (same config as validation stage)
vectorizer_bow_final = CountVectorizer(
    ngram_range=(1,3),
    max_features=10000
)

# Fit on all labeled data
X_full_bow = vectorizer_bow_final.fit_transform(text_full)

print("Full train BoW shape:", X_full_bow.shape)

# Save vectorizer
joblib.dump(vectorizer_bow_final, "vectorizer_bow_final.save")


Full train BoW shape: (2988, 10000)


['vectorizer_bow_final.save']

In [13]:
# 2. Train of the best model on all the data

lr_bow_final = LogisticRegression(
    penalty='l2',
    max_iter=500,
    C=1,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

lr_bow_final.fit(X_full_bow, y_full)

# Save model
joblib.dump(lr_bow_final, "lr_bow_final.save")


['lr_bow_final.save']

In [14]:
# 3. Trasformation of the test set and predictions

# Preprocessed test text
X_test_text = test_multi["text_clean"]

# Load vectorizer
vectorizer = joblib.load("vectorizer_bow_final.save")

# Transform test
X_test_bow = vectorizer.transform(X_test_text)

print("Test BoW shape:", X_test_bow.shape)

# Load model
best_model = joblib.load("lr_bow_final.save")

# Predict labels
preds_test = best_model.predict(X_test_bow)


Test BoW shape: (1995, 10000)


In [17]:
submissionTaskB = pd.DataFrame({
    "id": test_multi["id"],
    "label": preds_test,
    "lang": test_multi["lang"]
})

submissionTaskB.to_csv("submission_taskB.tsv", sep="\t", index=False)
print("Submission file created: submission_taskB.tsv")
submissionTaskB


Submission file created: submission_taskB.tsv


Unnamed: 0,id,label,lang
0,it_406,1,it
1,it_138,0,it
2,it_1622,1,it
3,it_1401,0,it
4,it_807,1,it
5,it_566,1,it
6,it_1344,0,it
7,it_732,0,it
8,it_347,0,it
9,it_56,0,it
