<h1 align="center">Lab 2:  Sexism Identification in Twitter</h1>
<h2 align="center">Session 1. Machine Learning and Feature Engineering</h2>

<h3 style="display:block; margin-top:5px;" align="center">Natural Language and Information Retrieval</h3>
<h3 style="display:block; margin-top:5px;" align="center">Degree in Data Science</h3>
<h3 style="display:block; margin-top:5px;" align="center">2024-2025</h3>    
<h3 style="display:block; margin-top:5px;" align="center">ETSInf. Universitat Politècnica de València</h3>
<br>

### Put your names here

- Kacper Multan

In [2]:
# Reading the entire dataset for both languages and considering only the hard labels. In this lab we do not address the sexism identification task from a Learning with Disagreement (LwD) perspective.

from readerEXIST2025 import EXISTReader
import re
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import gensim.downloader as api
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.keyedvectors import KeyedVectors
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


reader_train = EXISTReader("EXIST_2025_Dataset_V0.2/EXIST2025_training.json")
reader_dev = EXISTReader("EXIST_2025_Dataset_V0.2/EXIST2025_dev.json")

EnTrainTask1, EnDevTask1 = reader_train.get(lang="EN", subtask="1"), reader_dev.get(lang="EN", subtask="1")
EnTrainTask2, EnDevTask2 = reader_train.get(lang="EN", subtask="2"), reader_dev.get(lang="EN", subtask="2")

SpTrainTask1, SpDevTask1 = reader_train.get(lang="ES", subtask="1"), reader_dev.get(lang="ES", subtask="1")
SpTrainTask2, SpDevTask2 = reader_train.get(lang="ES", subtask="2"), reader_dev.get(lang="ES", subtask="2")

In [3]:
web_re = re.compile(r"https?:\/\/[^\s]+", re.U)
user_re = re.compile(r"(@\w+\-?(?:\w+)?)", re.U)
hashtag_re = re.compile(r"(#\w+\-?(?:\w+)?)", re.U)

stopw = {
    "english": nltk.corpus.stopwords.words("english"),
    "spanish": nltk.corpus.stopwords.words("spanish")
}

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/zzzdream/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Preprocessing - tokenization, removal of stopwords, lowercasing

In [4]:
def preprocess_text(text, lang):
    def preprocessing(text):
        text = text.lower()
        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)
        tokens = text.split()
        tokens = [word for word in tokens if word not in nltk.corpus.stopwords.words(lang)]
        return " ".join(tokens)

    return text.apply(preprocessing)

In [5]:
def tokenize(text_list, lang="english"):
    def preprocess(text):
        text = web_re.sub("", text)
        text = user_re.sub("", text)
        text = hashtag_re.sub("", text)
        text = text.lower()
        return text

    token_list = []
    for text in text_list:
        text = preprocess(text)
        tokens = word_tokenize(text, language=lang)
        tokens = [word for word in tokens if word.isalnum() and word not in stopw[lang]]
        token_list.append(tokens)
    return token_list

### Text representation - static embeddings

In [6]:
def get_sentence_vector(tokens, model, dim=300):
    zero_vec = np.zeros(dim)
    avg_vec = np.zeros(dim)
    total_words = 0
    for word in tokens:
        if word in model:
            avg_vec += model[word]
            total_words += 1

    if total_words == 0:
        return zero_vec
    return avg_vec / total_words

def gensim_sentence_rep(model, tokens_list):
    embeddings = []
    for tokens in tokens_list:
        embeddings.append(get_sentence_vector(tokens, model))
    return embeddings

# ENGLISH

In [7]:
binary_mapping = {"NO": 0, "YES": 1}
y_train_en_task1 = EnTrainTask1[2].map(binary_mapping)
y_test_en_task1 = EnDevTask1[2].map(binary_mapping)

multi_mapping = {"DIRECT": 0, "REPORTED": 1, "JUDGEMENTAL": 2}
y_train_en_task2 = EnTrainTask2[2].map(multi_mapping)
y_test_en_task2 = EnDevTask2[2].map(multi_mapping)

## Preprocessing

### Preprocessing - tokenization, removal of stopwords, special characters and lowercasing

In [8]:
en_training_processed_text_task1 = preprocess_text(EnTrainTask1[1], "english")
en_test_processed_text_task1 = preprocess_text(EnDevTask1[1], "english")

en_training_processed_text_task2 = preprocess_text(EnTrainTask2[1], "english")
en_test_processed_text_task2 = preprocess_text(EnDevTask2[1], "english")

In [9]:
tokenized_text_train_en_task1 = tokenize(EnTrainTask1[1], "english")
tokenized_text_test_en_task1 = tokenize(EnDevTask1[1], "english")

tokenized_text_train_en_task2 = tokenize(EnTrainTask2[1], "english")
tokenized_text_test_en_task2 = tokenize(EnDevTask2[1], "english")

### Text representation - traditional

In [10]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_en_tfidf_task1 = tfidf_vectorizer.fit_transform(en_training_processed_text_task1)
X_test_en_tfidf_task1 = tfidf_vectorizer.transform(en_test_processed_text_task1)

X_train_en_tfidf_task2 = tfidf_vectorizer.fit_transform(en_training_processed_text_task2)
X_test_en_tfidf_task2 = tfidf_vectorizer.transform(en_test_processed_text_task2)

### Text representation - static word embeddings

In [11]:
glove_model = api.load("glove-wiki-gigaword-300")

In [12]:
X_train_embeddings_en_task1 = gensim_sentence_rep(glove_model, tokenized_text_train_en_task1)
X_test_embeddings_en_task1 = gensim_sentence_rep(glove_model, tokenized_text_test_en_task1)

X_train_embeddings_en_task2 = gensim_sentence_rep(glove_model, tokenized_text_train_en_task2)
X_test_embeddings_en_task2 = gensim_sentence_rep(glove_model, tokenized_text_test_en_task2)

## Learning Models

### Decision tree classifier

In [13]:
dt_tfidf = DecisionTreeClassifier()
dt_tfidf.fit(X_train_en_tfidf_task1, y_train_en_task1)
en_task1_tree_y_predicted_traditional = dt_tfidf.predict(X_test_en_tfidf_task1)

dt_tfidf = DecisionTreeClassifier()
dt_tfidf.fit(X_train_en_tfidf_task2, y_train_en_task2)
en_task2_tree_y_predicted_traditional = dt_tfidf.predict(X_test_en_tfidf_task2)

In [14]:
dt_embeddings = DecisionTreeClassifier()
dt_embeddings.fit(X_train_embeddings_en_task1, y_train_en_task1)
en_task1_tree_y_predicted_embeddings = dt_embeddings.predict(X_test_embeddings_en_task1)

dt_embeddings = DecisionTreeClassifier()
dt_embeddings.fit(X_train_embeddings_en_task2, y_train_en_task2)
en_task2_tree_y_predicted_embeddings = dt_embeddings.predict(X_test_embeddings_en_task2)

### Multilayer Perceptron

In [44]:
from tensorflow import keras

def build_mlp_model(input_dim, num_classes=2):
    model = keras.models.Sequential([
        keras.layers.Dense(128, activation='relu', input_shape=(input_dim,)),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(num_classes if num_classes > 2 else 1, activation='softmax' if num_classes > 2 else 'sigmoid')
    ])

    loss = 'categorical_crossentropy' if num_classes > 2 else 'binary_crossentropy'

    model.compile(optimizer='adam',
                  loss=loss,
                  metrics=['accuracy'])

    return model

def train_classifier(train_embeddings, y_train, test_embeddings, y_test, multiclass=False):
    num_classes = len(set(y_train))

    if multiclass:
        y_train = keras.utils.to_categorical(y_train, num_classes)
        y_test = keras.utils.to_categorical(y_test, num_classes)

    model = build_mlp_model(input_dim=len(train_embeddings[0]), num_classes=num_classes)
    model.fit(train_embeddings, y_train, epochs=20, batch_size=8, validation_data=(test_embeddings, y_test))

    y_pred_probs = model.predict(test_embeddings)

    return y_pred_probs

In [45]:
en_task1_mlp_y_predicted = train_classifier(np.array(X_train_embeddings_en_task1), np.array(y_train_en_task1), np.array(X_test_embeddings_en_task1), np.array(y_test_en_task1))
en_task2_mlp_y_predicted = train_classifier(np.array(X_train_embeddings_en_task2), np.array(y_train_en_task2), np.array(X_test_embeddings_en_task2), np.array(y_test_en_task2), True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Random Forest - Ensemble of Decision Trees - Voting

In [17]:
rf_tfidf = RandomForestClassifier(n_estimators=50, random_state=42)
rf_tfidf.fit(X_train_en_tfidf_task1, y_train_en_task1)
en_task1_forest_y_predicted_traditional = rf_tfidf.predict(X_test_en_tfidf_task1)

rf_tfidf = RandomForestClassifier(n_estimators=50, random_state=42)
rf_tfidf.fit(X_train_en_tfidf_task2, y_train_en_task2)
en_task2_forest_y_predicted_traditional = rf_tfidf.predict(X_test_en_tfidf_task2)

### Stacking - Multilayer Perceptron + Decision Tree + SVM

In [18]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=15, random_state=42)
svm_model = SVC(kernel='rbf', probability=True, random_state=42)

stacked_model = StackingClassifier(
    estimators=[('dt', dt_model), ('mlp', mlp_model), ('svm', svm_model)],
    final_estimator=LogisticRegression()
)

stacked_model.fit(X_train_en_tfidf_task1, y_train_en_task1)
en_task1_stacking_y_predicted_traditional = stacked_model.predict(X_test_en_tfidf_task1)




In [19]:
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=15, random_state=42)
svm_model = SVC(kernel='rbf', probability=True, random_state=42)

stacked_model = StackingClassifier(
    estimators=[('dt', dt_model), ('mlp', mlp_model), ('svm', svm_model)],
    final_estimator=LogisticRegression()
)

stacked_model.fit(X_train_en_tfidf_task2, y_train_en_task2)
en_task2_stacking_y_predicted_traditional = stacked_model.predict(X_test_en_tfidf_task2)



## Show Results

In [20]:
def show_results(y_predicted, y_test, model_type, multiclass=False, probabilities=False):
    if multiclass:
        if probabilities:
            y_predicted = y_predicted.argmax(axis=1)
        report = f1_score(y_test, y_predicted, average='macro', zero_division=0)
    else:
        y_predicted = (y_predicted > 0.5).astype(int)
        report = f1_score(y_test, y_predicted, pos_label=1, zero_division=0)

    print(model_type)
    print("------------------------\n")
    print("Multiclass Classification Report (DIRECT, REPORTED, JUDGEMENTAL)" if multiclass else "Binary Classification Report (Sexist vs. Non-Sexist)")
    print('F1 score: ', report, "\n")

In [48]:
show_results(en_task1_mlp_y_predicted, y_test_en_task1, "Multilayer Perceptron - Static Embeddings")
show_results(en_task2_mlp_y_predicted, y_test_en_task2, "Multilayer Perceptron - Static Embeddings", True, True)

show_results(en_task1_tree_y_predicted_traditional, y_test_en_task1, "Decision Tree Classifier - Traditional")
show_results(en_task2_tree_y_predicted_traditional, y_test_en_task2, "Decision Tree Classifier - Traditional", True)

show_results(en_task1_tree_y_predicted_embeddings, y_test_en_task1, "Decision Tree Classifier - Static Embeddings")
show_results(en_task2_tree_y_predicted_embeddings, y_test_en_task2, "Decision Tree Classifier - Static Embeddings", True)

show_results(en_task1_forest_y_predicted_traditional, y_test_en_task1, "Random Forest Classifier - Traditional")
show_results(en_task2_forest_y_predicted_traditional, y_test_en_task2, "Random Forest Classifier - Traditional", True)

show_results(en_task1_stacking_y_predicted_traditional, y_test_en_task1, "Stacking - MLP + Decision Tree + SVM - Traditional")
show_results(en_task2_stacking_y_predicted_traditional, y_test_en_task2, "Stacking - MLP + Decision Tree - Traditional", True)

Multilayer Perceptron - Static Embeddings
------------------------

Binary Classification Report (Sexist vs. Non-Sexist)
F1 score:  0.7087912087912088 

Multilayer Perceptron - Static Embeddings
------------------------

Multiclass Classification Report (DIRECT, REPORTED, JUDGEMENTAL)
F1 score:  0.4610975317893448 

Decision Tree Classifier - Traditional
------------------------

Binary Classification Report (Sexist vs. Non-Sexist)
F1 score:  0.6997389033942558 

Decision Tree Classifier - Traditional
------------------------

Multiclass Classification Report (DIRECT, REPORTED, JUDGEMENTAL)
F1 score:  0.38909284384144716 

Decision Tree Classifier - Static Embeddings
------------------------

Binary Classification Report (Sexist vs. Non-Sexist)
F1 score:  0.5570291777188329 

Decision Tree Classifier - Static Embeddings
------------------------

Multiclass Classification Report (DIRECT, REPORTED, JUDGEMENTAL)
F1 score:  0.3665978829913256 

Random Forest Classifier - Traditional
------

# SPANISH

In [22]:
binary_mapping = {"NO": 0, "YES": 1}
y_train_sp_task1 = SpTrainTask1[2].map(binary_mapping)
y_test_sp_task1 = SpDevTask1[2].map(binary_mapping)

multi_mapping = {"DIRECT": 0, "REPORTED": 1, "JUDGEMENTAL": 2}
y_train_sp_task2 = SpTrainTask2[2].map(multi_mapping)
y_test_sp_task2 = SpDevTask2[2].map(multi_mapping)

## Preprocessing

In [23]:
sp_training_processed_text_task1 = preprocess_text(SpTrainTask1[1], "spanish")
sp_test_processed_text_task1 = preprocess_text(SpDevTask1[1], "spanish")

sp_training_processed_text_task2 = preprocess_text(SpTrainTask2[1], "spanish")
sp_test_processed_text_task2 = preprocess_text(SpDevTask2[1], "spanish")

In [24]:
tokenized_text_train_sp_task1 = tokenize(SpTrainTask1[1], "spanish")
tokenized_text_test_sp_task1 = tokenize(SpDevTask1[1], "spanish")

tokenized_text_train_sp_task2 = tokenize(SpTrainTask2[1], "spanish")
tokenized_text_test_sp_task2 = tokenize(SpDevTask2[1], "spanish")

## Tweet representations (Feature extraction)

In [25]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_sp_tfidf_task1 = tfidf_vectorizer.fit_transform(sp_training_processed_text_task1)
X_test_sp_tfidf_task1 = tfidf_vectorizer.transform(sp_test_processed_text_task1)

X_train_sp_tfidf_task2 = tfidf_vectorizer.fit_transform(sp_training_processed_text_task2)
X_test_sp_tfidf_task2 = tfidf_vectorizer.transform(sp_test_processed_text_task2)

In [26]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz

--2025-03-25 15:46:25--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.154.41.96, 18.154.41.57, 18.154.41.8, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.154.41.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz.2’


2025-03-25 15:47:01 (36.0 MB/s) - ‘cc.en.300.vec.gz.2’ saved [1325960915/1325960915]



In [27]:
fasttext_sp_model = KeyedVectors.load_word2vec_format("cc.en.300.vec.gz", binary=False)

In [28]:
X_train_embeddings_sp_task1 = gensim_sentence_rep(fasttext_sp_model, tokenized_text_train_sp_task1)
X_test_embeddings_sp_task1 = gensim_sentence_rep(fasttext_sp_model, tokenized_text_test_sp_task1)

X_train_embeddings_sp_task2 = gensim_sentence_rep(fasttext_sp_model, tokenized_text_train_sp_task2)
X_test_embeddings_sp_task2 = gensim_sentence_rep(fasttext_sp_model, tokenized_text_test_sp_task2)

## Learning Models

In [29]:
dt_tfidf = DecisionTreeClassifier()
dt_tfidf.fit(X_train_sp_tfidf_task1, y_train_sp_task1)
sp_task1_tree_y_predicted_traditional = dt_tfidf.predict(X_test_sp_tfidf_task1)

dt_tfidf = DecisionTreeClassifier()
dt_tfidf.fit(X_train_sp_tfidf_task2, y_train_sp_task2)
sp_task2_tree_y_predicted_traditional = dt_tfidf.predict(X_test_sp_tfidf_task2)

dt_embeddings = DecisionTreeClassifier()
dt_embeddings.fit(X_train_embeddings_sp_task1, y_train_sp_task1)
sp_task1_tree_y_predicted_embeddings = dt_embeddings.predict(X_test_embeddings_sp_task1)

dt_embeddings = DecisionTreeClassifier()
dt_embeddings.fit(X_train_embeddings_sp_task2, y_train_sp_task2)
sp_task2_tree_y_predicted_embeddings = dt_embeddings.predict(X_test_embeddings_sp_task2)

In [30]:
sp_task1_mlp_y_predicted = train_classifier(np.array(X_train_embeddings_sp_task1), np.array(y_train_sp_task1), np.array(X_test_embeddings_sp_task1), np.array(y_test_sp_task1))
sp_task2_mlp_y_predicted = train_classifier(np.array(X_train_embeddings_sp_task2), np.array(y_train_sp_task2), np.array(X_test_embeddings_sp_task2), np.array(y_test_sp_task2), True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [31]:
rf_tfidf = RandomForestClassifier(n_estimators=50, random_state=42)
rf_tfidf.fit(X_train_sp_tfidf_task1, y_train_sp_task1)
sp_task1_forest_y_predicted_traditional = rf_tfidf.predict(X_test_sp_tfidf_task1)

rf_tfidf = RandomForestClassifier(n_estimators=50, random_state=42)
rf_tfidf.fit(X_train_sp_tfidf_task2, y_train_sp_task2)
sp_task2_forest_y_predicted_traditional = rf_tfidf.predict(X_test_sp_tfidf_task2)

In [32]:
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=15, random_state=42)
svm_model = SVC(kernel='rbf', probability=True, random_state=42)

stacked_model1 = StackingClassifier(
    estimators=[('dt', dt_model), ('mlp', mlp_model), ('svm', svm_model)],
    final_estimator=LogisticRegression()
)

stacked_model1.fit(X_train_embeddings_sp_task1, y_train_sp_task1)
sp_task1_stacking_y_predicted_traditional = stacked_model1.predict(X_test_embeddings_sp_task1)

stacked_model2 = StackingClassifier(
    estimators=[('dt', dt_model), ('mlp', mlp_model), ('svm', svm_model)],
    final_estimator=LogisticRegression()
)

stacked_model2.fit(X_train_embeddings_sp_task2, y_train_sp_task2)
sp_task2_stacking_y_predicted_traditional = stacked_model1.predict(X_test_embeddings_sp_task2)



## Show Results

In [33]:
show_results(sp_task1_mlp_y_predicted, y_test_sp_task1, "Multilayer Perceptron - Static Embeddings")
show_results(sp_task2_mlp_y_predicted, y_test_sp_task2, "Multilayer Perceptron - Static Embeddings", True, True)

show_results(sp_task1_tree_y_predicted_traditional, y_test_sp_task1, "Decision Tree Classifier - Traditional")
show_results(sp_task2_tree_y_predicted_traditional, y_test_sp_task2, "Decision Tree Classifier - Traditional", True)

show_results(sp_task1_tree_y_predicted_embeddings, y_test_sp_task1, "Decision Tree Classifier - Static Embeddings")
show_results(sp_task2_tree_y_predicted_embeddings, y_test_sp_task2, "Decision Tree Classifier - Static Embeddings", True)

show_results(sp_task1_forest_y_predicted_traditional, y_test_sp_task1, "Random Forest Classifier - Traditional")
show_results(sp_task2_forest_y_predicted_traditional, y_test_sp_task2, "Random Forest Classifier - Traditional", True)

show_results(sp_task1_stacking_y_predicted_traditional, y_test_sp_task1, "Stacking - Decision Tree + MLP + SVM")
show_results(sp_task2_stacking_y_predicted_traditional, y_test_sp_task2, "Stacking - Decision Tree + MLP + SVM", True)

Multilayer Perceptron - Static Embeddings
------------------------

Binary Classification Report (Sexist vs. Non-Sexist)
F1 score:  0.7145557655954632 

Multilayer Perceptron - Static Embeddings
------------------------

Multiclass Classification Report (DIRECT, REPORTED, JUDGEMENTAL)
F1 score:  0.4241071428571428 

Decision Tree Classifier - Traditional
------------------------

Binary Classification Report (Sexist vs. Non-Sexist)
F1 score:  0.706959706959707 

Decision Tree Classifier - Traditional
------------------------

Multiclass Classification Report (DIRECT, REPORTED, JUDGEMENTAL)
F1 score:  0.4303346408609566 

Decision Tree Classifier - Static Embeddings
------------------------

Binary Classification Report (Sexist vs. Non-Sexist)
F1 score:  0.5797665369649806 

Decision Tree Classifier - Static Embeddings
------------------------

Multiclass Classification Report (DIRECT, REPORTED, JUDGEMENTAL)
F1 score:  0.368358991108051 

Random Forest Classifier - Traditional
---------