In [None]:
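# Third-party dependencies: uncomment and run once on a fresh runtime where they are not yet installed.
#!pip install lazypredict
#!pip install -U sentence-transformers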

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# directory used when loading the CSV is located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from lazypredict.Supervised import LazyClassifier
import json

def write_json(data, path):
    """Write ``data`` to ``path`` as an indented JSON document.

    Args:
        data: Object exposing ``to_dict()`` (e.g. a pandas DataFrame); the
            resulting dictionary is stored under the top-level
            ``"predictions"`` key.
        path: Destination file path. The file is created or overwritten and
            written as UTF-8.
    """
    payload = {'predictions': data.to_dict()}
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=4)


In [None]:
root_drive_dir = '/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/data/'

# Read the aggregated annotations, then keep only the rows labelled 'sexist'
# and only the three columns used later: the text, its category label, and
# the split marker.
dataset = (
    pd.read_csv(os.path.join(root_drive_dir, "edos_labelled_aggregated.csv"))
    .loc[
        lambda frame: frame['label_sexist'] == 'sexist',
        ["text", "label_category", "split"],
    ]
)

#### train, dev, test split

In [None]:
# Partition the rows by the value of the `split` column. Each subset gets a
# fresh 0-based index so its rows line up positionally with the feature
# tables that are built from it later.
train = dataset.loc[dataset['split'] == 'train'].reset_index(drop=True)
print('Train set shape:', train.shape)

dev = dataset.loc[dataset['split'] == 'dev'].reset_index(drop=True)
print('Validation set shape:', dev.shape)

test = dataset.loc[dataset['split'] == 'test'].reset_index(drop=True)
print('Test set shape:', test.shape)

Train set shape: (3398, 3)
Validation set shape: (486, 3)
Test set shape: (970, 3)


### load Sentence Transformer

In [None]:
# Load the sentence encoder by checkpoint name; `generate_embeddings` reads
# this module-level `model` to turn each text into an embedding vector.
model = SentenceTransformer("all-roberta-large-v1")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
def generate_embeddings(dataframe):
    """Encode the ``text`` column of ``dataframe`` into sentence embeddings.

    Uses the module-level ``model`` (a SentenceTransformer).

    Args:
        dataframe: Frame with a ``text`` column holding the sentences.

    Returns:
        A list with one embedding vector per row, in row order. An empty
        frame yields an empty list.
    """
    sentences = dataframe['text'].tolist()
    if not sentences:
        return []
    # Hand the whole column to the encoder in one call so it can batch the
    # sentences internally instead of running one forward pass per sentence.
    # NOTE(review): batched inputs are padded together, so values may differ
    # from per-sentence encoding at floating-point rounding level.
    return list(model.encode(sentences))

# Embed every split (train, dev and test), wrap the vectors in a DataFrame
# (one row per text, one column per embedding dimension), and pull out the
# matching category labels.

# Train split
train_embeddings = generate_embeddings(train)
train_features = pd.DataFrame(train_embeddings)
train_labels = train['label_category']

# Dev (validation) split
dev_embeddings = generate_embeddings(dev)
dev_features = pd.DataFrame(dev_embeddings)
dev_labels = dev['label_category']

# Test split
test_embeddings = generate_embeddings(test)
test_features = pd.DataFrame(test_embeddings)
test_labels = test['label_category']

#### model on dev set

In [None]:
# Fit LazyClassifier's suite of models on the train embeddings and evaluate
# them on the dev split.
# `predictions=True` is required to get per-sample predictions back: with the
# default (`predictions=False`) `fit` returns the score table in both
# positions, so `dev_predictions` would just be a second copy of `models`.
clf = LazyClassifier(predictions=True)
models, dev_predictions = clf.fit(train_features, dev_features, train_labels, dev_labels)

 97%|█████████▋| 28/29 [03:27<00:02,  2.59s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.095699 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 3398, number of used features: 1024
[LightGBM] [Info] Start training from score -2.394370
[LightGBM] [Info] Start training from score -0.759453
[LightGBM] [Info] Start training from score -1.070466
[LightGBM] [Info] Start training from score -2.322800


100%|██████████| 29/29 [04:31<00:00,  9.36s/it]


In [None]:
# Persist the second value returned by the dev-split LazyClassifier run as
# JSON, using a path relative to the notebook's working directory.
json_file_path = "TaskB_roberta_lazyclassifier_dev.json"
write_json(data=dev_predictions, path=json_file_path)

#### model on test set

In [None]:
# Fit LazyClassifier's suite of models on the train embeddings and evaluate
# them on the test split.
# `predictions=True` is required to get per-sample predictions back: with the
# default (`predictions=False`) `fit` returns the score table in both
# positions, so `test_predictions` would just be a second copy of `models`.
clf = LazyClassifier(predictions=True)
models, test_predictions = clf.fit(train_features, test_features, train_labels, test_labels)

 90%|████████▉ | 26/29 [03:24<00:15,  5.07s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044177 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 3398, number of used features: 1024
[LightGBM] [Info] Start training from score -2.394370
[LightGBM] [Info] Start training from score -0.759453
[LightGBM] [Info] Start training from score -1.070466
[LightGBM] [Info] Start training from score -2.322800


100%|██████████| 29/29 [04:28<00:00,  9.27s/it]


In [None]:
# Persist the second value returned by the test-split LazyClassifier run as
# JSON, using a path relative to the notebook's working directory.
json_file_path = "TaskB_roberta_lazyclassifier_test.json"
write_json(data=test_predictions, path=json_file_path)