In [14]:
# Dependencies: uncomment and run once on a fresh runtime.
#!pip install lazypredict
#!pip install -U sentence-transformers

In [13]:
# Mount Google Drive at /content/drive so that the Drive paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from lazypredict.Supervised import LazyClassifier
import json

def write_json(data, path):
    """Write ``data`` to a JSON file under a top-level 'predictions' key.

    Args:
        data: Object exposing ``to_dict()`` (a pandas DataFrame in this notebook).
        path: Destination file path. Missing parent directories are created.

    The file is written as UTF-8 with 4-space indentation.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # Without this, a missing output folder raises FileNotFoundError at
        # write time, i.e. only after the expensive model run has finished.
        os.makedirs(parent_dir, exist_ok=True)

    output_dict = {
        'predictions': data.to_dict()
    }
    with open(path, "w", encoding="utf-8") as outfile:
        json.dump(output_dict, outfile, indent=4)


In [2]:
# Folder on the mounted Drive that holds the EDOS data files.
root_drive_dir = '/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/data/'

# Read the aggregated labels, then keep only the rows labelled 'sexist'
# together with the text, label and split columns.
labelled_csv = os.path.join(root_drive_dir, "edos_labelled_aggregated.csv")
dataset = pd.read_csv(labelled_csv)
is_sexist = dataset['label_sexist'] == 'sexist'
dataset = dataset.loc[is_sexist, ["text", "label_category", "label_vector", "split"]]

In [3]:
# Display the (rows, columns) shape of the filtered frame.
dataset.shape

(4854, 4)

#### Train, dev, and test split

In [4]:
def _rows_for_split(frame, split_name):
    """Return the rows of `frame` whose 'split' value equals `split_name`, re-indexed from 0."""
    return frame[frame['split'] == split_name].reset_index(drop=True)


train = _rows_for_split(dataset, 'train')
print('Train set shape:', train.shape)

dev = _rows_for_split(dataset, 'dev')
print('Validation set shape:', dev.shape)

test = _rows_for_split(dataset, 'test')
print('Test set shape:', test.shape)

Train set shape: (3398, 4)
Validation set shape: (486, 4)
Test set shape: (970, 4)


### Load the Sentence Transformer model

In [5]:
# Load the encoder once at notebook level; generate_embeddings() reads this global.
# NOTE(review): 'cross-encoder/nli-deberta-base' is published as a cross-encoder
# (sentence-pair classifier) checkpoint. Loading it through SentenceTransformer
# presumably wraps the bare transformer with a default pooling layer rather than
# using a model trained for sentence embeddings — confirm this is intended.
model = SentenceTransformer('cross-encoder/nli-deberta-base')

# Function to generate sentence embeddings
def generate_embeddings(dataframe):
    """Encode every value of the 'text' column with the notebook-level `model`.

    Args:
        dataframe: Frame with a 'text' column holding the sentences to encode.

    Returns:
        A list with one embedding per row, in row order. An empty frame
        yields an empty list without calling the model.

    All sentences are passed to ``model.encode`` in a single call so the
    encoder can batch them, instead of running one forward pass per sentence.
    Values may differ from per-sentence encoding by floating-point noise
    caused by padding within a batch.
    """
    sentences = dataframe['text'].tolist()
    if not sentences:
        return []
    # encode() on a list is expected to return one row per sentence; list()
    # splits that back into per-sentence vectors, matching the previous
    # return shape (a list of 1-D embeddings).
    return list(model.encode(sentences))



In [6]:
# Embed the text of each split (train, dev, test), in that order.
train_embeddings, dev_embeddings, test_embeddings = (
    generate_embeddings(part) for part in (train, dev, test)
)

# Wrap each list of embeddings in a DataFrame (one row per sentence).
train_features, dev_features, test_features = (
    pd.DataFrame(vectors)
    for vectors in (train_embeddings, dev_embeddings, test_embeddings)
)

# Targets: the 'label_vector' column of each split.
train_labels, dev_labels, test_labels = (
    part['label_vector'] for part in (train, dev, test)
)

#### Fit and evaluate models on the dev set

In [7]:
# Fit LazyClassifier on the training embeddings and evaluate on the dev split.
# NOTE(review): with the default constructor (predictions=False) lazypredict
# appears to return the scores table in both positions, so `dev_predictions`
# may hold per-model metrics rather than per-sample predictions — confirm, and
# pass predictions=True if actual predictions are wanted.
clf = LazyClassifier()
models, dev_predictions = clf.fit(
    X_train=train_features,
    X_test=dev_features,
    y_train=train_labels,
    y_test=dev_labels,
)

 90%|████████▉ | 26/29 [03:32<00:15,  5.11s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 3398, number of used features: 768
[LightGBM] [Info] Start training from score -4.105591
[LightGBM] [Info] Start training from score -2.593608
[LightGBM] [Info] Start training from score -1.555866
[LightGBM] [Info] Start training from score -1.619197
[LightGBM] [Info] Start training from score -2.832625
[LightGBM] [Info] Start training from score -1.674173
[LightGBM] [Info] Start training from score -2.097856
[LightGBM] [Info] Start training from score -3.972059
[LightGBM] [Info] Start training from score -4.280795
[LightGBM] [Info] Start training from score -3.813454
[LightGBM] [Info] Start training from score -2.577983


100%|██████████| 29/29 [05:28<00:00, 11.33s/it]


In [8]:
# Write the dev-set LazyClassifier output to a JSON file on the mounted Drive.
json_file_path = "/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/03_ST(deberta)_lazyclassifier/TaskC_deberta_lazyclassifier_dev.json"

write_json(dev_predictions, json_file_path)

#### Fit and evaluate models on the test set

In [9]:
# Fit a fresh LazyClassifier on the training embeddings and evaluate on the test split.
# NOTE(review): with the default constructor (predictions=False) lazypredict
# appears to return the scores table in both positions, so `test_predictions`
# may hold per-model metrics rather than per-sample predictions — confirm, and
# pass predictions=True if actual predictions are wanted.
clf = LazyClassifier()
models, test_predictions = clf.fit(
    X_train=train_features,
    X_test=test_features,
    y_train=train_labels,
    y_test=test_labels,
)

 90%|████████▉ | 26/29 [03:29<00:16,  5.41s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 3398, number of used features: 768
[LightGBM] [Info] Start training from score -4.105591
[LightGBM] [Info] Start training from score -2.593608
[LightGBM] [Info] Start training from score -1.555866
[LightGBM] [Info] Start training from score -1.619197
[LightGBM] [Info] Start training from score -2.832625
[LightGBM] [Info] Start training from score -1.674173
[LightGBM] [Info] Start training from score -2.097856
[LightGBM] [Info] Start training from score -3.972059
[LightGBM] [Info] Start training from score -4.280795
[LightGBM] [Info] Start training from score -3.813454
[LightGBM] [Info] Start training from score -2.577983


100%|██████████| 29/29 [05:23<00:00, 11.14s/it]


In [10]:
# Write the test-set LazyClassifier output to a JSON file on the mounted Drive.
json_file_path = "/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/03_ST(deberta)_lazyclassifier/TaskC_deberta_lazyclassifier_test.json"

write_json(test_predictions, json_file_path)