In [None]:
#!pip install lazypredict
#!pip install -U sentence-transformers

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the dataset cell reads from the relative path '../dataset/',
# not from /content/drive — confirm this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from lazypredict.Supervised import LazyClassifier
import json

def write_json(data, path):
    """Dump ``data.to_dict()`` to a JSON file under a single 'predictions' key.

    Parameters
    ----------
    data : object exposing ``to_dict()`` (the notebook passes pandas DataFrames)
        Content to serialise.
    path : str
        Destination file; it is opened in write mode with UTF-8 encoding,
        so an existing file at that location is overwritten.
    """
    payload = {'predictions': data.to_dict()}
    with open(path, "w", encoding="utf-8") as handle:
        # indent=4 keeps the file human-readable
        json.dump(payload, handle, indent=4)


In [None]:
# Folder that holds the labelled CSV (a path relative to the working directory)
root_drive_dir = '../dataset/'

# Read the aggregated file, then keep only the text, label and split columns
labelled_csv = os.path.join(root_drive_dir, "edos_labelled_aggregated.csv")
wanted_columns = ["text", "label_sexist", "split"]
dataset = pd.read_csv(labelled_csv)[wanted_columns]

#### train, dev, test split

In [None]:
# Partition the rows by the value of the 'split' column; every part gets a
# fresh 0-based index so positions line up with the rows kept.
train = dataset.loc[dataset['split'] == 'train'].reset_index(drop=True)
print('Train set shape:', train.shape)

dev = dataset.loc[dataset['split'] == 'dev'].reset_index(drop=True)
print('Validation set shape:', dev.shape)

test = dataset.loc[dataset['split'] == 'test'].reset_index(drop=True)
print('Test set shape:', test.shape)

Train set shape: (14000, 3)
Validation set shape: (2000, 3)
Test set shape: (4000, 3)


#### pick 100 random samples per class

In [None]:
# Number of rows drawn from each class in every split. The section heading,
# the `_200` variable names used downstream and the recorded LightGBM log
# ("Number of positive: 100, number of negative: 100") all describe a
# 100-per-class subset, so the sampling is active by default.
# Set this to None to keep every row of each class instead.
N_SAMPLES_PER_CLASS = 100
SAMPLING_SEED = 42


def _sample_class(frame, label, n_per_class=N_SAMPLES_PER_CLASS, seed=SAMPLING_SEED):
    """Return the rows of `frame` whose 'label_sexist' column equals `label`.

    When `n_per_class` is not None, a random subset of exactly that many rows
    is drawn with `random_state=seed`; pandas raises ValueError if the class
    has fewer rows than requested. When it is None, all matching rows are kept.
    """
    rows = frame[frame['label_sexist'] == label]
    if n_per_class is None:
        return rows
    return rows.sample(n=n_per_class, random_state=seed)


def _combine_and_shuffle(*parts, seed=SAMPLING_SEED):
    """Concatenate the per-class frames, shuffle the rows and renumber the index from 0."""
    return pd.concat(list(parts)).sample(frac=1, random_state=seed).reset_index(drop=True)


# Train
train_sexist_samples = _sample_class(train, 'sexist')
train_not_sexist_samples = _sample_class(train, 'not sexist')
train_selected_samples = _combine_and_shuffle(train_sexist_samples, train_not_sexist_samples)

# Validation
dev_sexist_samples = _sample_class(dev, 'sexist')
dev_not_sexist_samples = _sample_class(dev, 'not sexist')
dev_selected_samples = _combine_and_shuffle(dev_sexist_samples, dev_not_sexist_samples)

# Test
test_sexist_samples = _sample_class(test, 'sexist')
test_not_sexist_samples = _sample_class(test, 'not sexist')
test_selected_samples = _combine_and_shuffle(test_sexist_samples, test_not_sexist_samples)

### load Sentence Transformer

In [None]:
# Checkpoint identifier handed to SentenceTransformer; `model` is the
# notebook-level encoder that generate_embeddings() reads.
# NOTE(review): the identifier sits under the 'cross-encoder/' namespace —
# confirm it is meant to be loaded through SentenceTransformer.
ENCODER_CHECKPOINT = 'cross-encoder/nli-deberta-base'
model = SentenceTransformer(ENCODER_CHECKPOINT)



In [None]:
# Function to generate sentence embeddings
def generate_embeddings(dataframe):
    """Encode every entry of ``dataframe['text']`` with the notebook-level ``model``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a 'text' column holding the sentences to embed.

    Returns
    -------
    list
        One embedding per row, in row order, so ``pd.DataFrame(result)``
        yields one row per sentence exactly as before.
    """
    sentences = dataframe['text'].tolist()
    # A single encode() call hands the whole list to the model, which can
    # batch it internally instead of running one call per sentence.
    return list(model.encode(sentences))

# Build embeddings, a feature frame and the label series for each split
# (train, dev and test). Features are one row per sentence; labels are the
# 'label_sexist' column of the same selected frame, so rows stay aligned.

# -- train --
train_embeddings = generate_embeddings(train_selected_samples)
train_features_200 = pd.DataFrame(train_embeddings)
train_labels_200 = train_selected_samples['label_sexist']

# -- dev --
dev_embeddings = generate_embeddings(dev_selected_samples)
dev_features_200 = pd.DataFrame(dev_embeddings)
dev_labels_200 = dev_selected_samples['label_sexist']

# -- test --
test_embeddings = generate_embeddings(test_selected_samples)
test_features_200 = pd.DataFrame(test_embeddings)
test_labels_200 = test_selected_samples['label_sexist']

#### model on dev set

200 samples

In [None]:
# LazyClassifier
# predictions=True makes fit() return the per-model predictions for the
# evaluation set as its second value. With the default (False) the second
# value is just the score table again, so the JSON written from
# `dev_200samples_predictions` would not contain any predictions.
# `models` still receives the score table.
clf = LazyClassifier(predictions=True)
models, dev_200samples_predictions = clf.fit(train_features_200, dev_features_200, train_labels_200, dev_labels_200)

 90%|████████▉ | 26/29 [00:04<00:00,  7.31it/s]

[LightGBM] [Info] Number of positive: 100, number of negative: 100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002492 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 52490
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


100%|██████████| 29/29 [00:05<00:00,  4.99it/s]


In [None]:
# Write the dev-set LazyClassifier output to a JSON file in the current working directory
json_file_path = "TaskA_deberta_lazyclassifier_dev.json"
write_json(data=dev_200samples_predictions, path=json_file_path)

#### model on test set

200 samples

In [None]:
# LazyClassifier
# predictions=True makes fit() return the per-model predictions for the
# evaluation set as its second value. With the default (False) the second
# value is just the score table again, so the JSON written from
# `test_200samples_predictions` would not contain any predictions.
# `models` still receives the score table.
clf = LazyClassifier(predictions=True)
models, test_200samples_predictions = clf.fit(train_features_200, test_features_200, train_labels_200, test_labels_200)

 90%|████████▉ | 26/29 [00:04<00:00,  7.25it/s]

[LightGBM] [Info] Number of positive: 100, number of negative: 100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002342 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 52490
[LightGBM] [Info] Number of data points in the train set: 200, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


100%|██████████| 29/29 [00:05<00:00,  5.42it/s]






In [None]:
# Write the test-set LazyClassifier output to a JSON file in the current working directory
json_file_path = "TaskA_deberta_lazyclassifier_test.json"
write_json(data=test_200samples_predictions, path=json_file_path)