In [None]:
# Optional setup: uncomment on a fresh runtime to install the third-party
# packages this notebook imports (lazypredict, sentence-transformers).
#!pip install lazypredict
#!pip install -U sentence-transformers

In [None]:
# Mount Google Drive at /content/drive; the dataset path used by this notebook
# lives under that mount point. Requires the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from lazypredict.Supervised import LazyClassifier
import json

def write_json(data, path):
    """
    Dump ``data`` to a JSON file under a top-level ``"predictions"`` key.

    Args:
        data: object exposing ``to_dict()`` (the notebook passes a pandas
            DataFrame); the mapping it returns must be JSON-serialisable.
        path: destination file path; the file is opened in write mode, so an
            existing file is overwritten.
    Return:
        None. The file at ``path`` is written as a side effect.
    """
    payload = {'predictions': data.to_dict()}
    with open(path, mode="w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=4)


In [None]:
# Folder on the mounted Drive that holds the dataset csv.
root_drive_dir = '/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/data/'

# Read the aggregated annotations and keep only the three columns used below.
dataset = pd.read_csv(
    os.path.join(root_drive_dir, "edos_labelled_aggregated.csv")
).loc[:, ["text", "label_sexist", "split"]]

### having a look at data

In [None]:
# Peek at the first two rows to confirm the selected columns loaded as expected.
dataset.head(2)

Unnamed: 0,text,label_sexist,split
0,"In Nigeria, if you rape a woman, the men rape ...",not sexist,dev
1,"Then, she's a keeper. 😉",not sexist,train


In [None]:
# (rows, columns) of the full dataset, all splits combined.
print(dataset.shape)

(20000, 3)


#### train, dev, test split

In [None]:
# Number of rows assigned to each value of the 'split' column.
dataset['split'].value_counts()

train    14000
test      4000
dev       2000
Name: split, dtype: int64

In [None]:
# Partition the frame by its 'split' column; every subset gets a fresh 0..n-1 index.
train = dataset.loc[dataset['split'].eq('train')].reset_index(drop=True)
print('Train set shape:', train.shape)

dev = dataset.loc[dataset['split'].eq('dev')].reset_index(drop=True)
print('Validation set shape:', dev.shape)

test = dataset.loc[dataset['split'].eq('test')].reset_index(drop=True)
print('Test set shape:', test.shape)


Train set shape: (14000, 3)
Validation set shape: (2000, 3)
Test set shape: (4000, 3)


#### detect missing values

In [None]:
# Count missing values per column in the training split.
train.isnull().sum()

text            0
label_sexist    0
split           0
dtype: int64

In [None]:
# Count missing values per column in the dev (validation) split.
dev.isnull().sum()

text            0
label_sexist    0
split           0
dtype: int64

In [None]:
# Count missing values per column in the test split.
test.isnull().sum()

text            0
label_sexist    0
split           0
dtype: int64

#### detect duplicate rows

In [None]:
# Rows that exactly repeat an earlier row of the same split (all columns compared).
train_duplicated = train.loc[train.duplicated()]
dev_duplicated = dev.loc[dev.duplicated()]
test_duplicated = test.loc[test.duplicated()]

In [None]:
# Duplicate-row counts for train, dev and test, one per line.
print(
    len(train_duplicated),
    len(dev_duplicated),
    len(test_duplicated),
    sep="\n",
)

0
0
0


#### class distribution of the training labels

In [None]:
# Class distribution of the training labels.
train['label_sexist'].value_counts()

not sexist    10602
sexist         3398
Name: label_sexist, dtype: int64

### load Sentence Transformer

In [None]:
# Load the pretrained "all-roberta-large-v1" sentence-embedding model; its files
# are downloaded when not already available (see the progress bars in the output).
model = SentenceTransformer("all-roberta-large-v1")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
# Function to generate sentence embeddings
def generate_embeddings(dataframe, encoder=None, batch_size=32):
    """
    Encode every entry of ``dataframe['text']`` into a sentence embedding.

    All sentences are handed to the encoder in a single batched ``encode``
    call instead of one call per sentence, so the encoder can process
    ``batch_size`` texts per forward pass.
    NOTE(review): batched encoding may differ from per-sentence encoding at
    the level of floating-point rounding — confirm this is acceptable.

    Args:
        dataframe: DataFrame (or mapping) whose 'text' entry is an iterable
            of strings.
        encoder: object exposing ``encode(sentences, batch_size=...)``.
            Defaults to the notebook-level ``model``.
        batch_size: number of sentences passed to the encoder per batch.
    Returns:
        list with one embedding per input text, in input order; an empty
        list when there are no texts.
    """
    if encoder is None:
        # Fall back to the globally loaded model, as the original code did.
        encoder = model

    sentences = list(dataframe['text'])
    if not sentences:
        # Nothing to encode; skip the encoder call entirely.
        return []

    # list(...) turns the encoder's batched result into one entry per sentence,
    # which keeps the "list of embeddings" return shape callers rely on.
    return list(encoder.encode(sentences, batch_size=batch_size))

# Embed the train, dev and test texts (one call per split, in that order)
train_embeddings, dev_embeddings, test_embeddings = [
    generate_embeddings(split_frame) for split_frame in (train, dev, test)
]


# Wrap each list of embeddings in a DataFrame
train_features, dev_features, test_features = [
    pd.DataFrame(vectors)
    for vectors in (train_embeddings, dev_embeddings, test_embeddings)
]

In [None]:
# Target column ('label_sexist') of each split
train_labels, dev_labels, test_labels = [
    split_frame['label_sexist'] for split_frame in (train, dev, test)
]


#### model on validation set

In [None]:
# Use LazyClassifier
# predictions=True makes fit() return the per-model predictions as its second
# value. With the default (predictions=False) fit() returns the metrics table
# twice, so `dev_predictions` would only be a copy of `models`.
clf = LazyClassifier(predictions=True)
models, dev_predictions = clf.fit(train_features, dev_features, train_labels, dev_labels)

# Display the results
print(models)

 97%|█████████▋| 28/29 [15:05<00:22, 22.28s/it]

[LightGBM] [Info] Number of positive: 3398, number of negative: 10602
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.244699 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 14000, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242714 -> initscore=-1.137856
[LightGBM] [Info] Start training from score -1.137856


100%|██████████| 29/29 [15:33<00:00, 32.20s/it]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
Perceptron                         0.79               0.72    None      0.79   
LogisticRegression                 0.82               0.72    None      0.81   
SVC                                0.84               0.71    None      0.82   
LinearDiscriminantAnalysis         0.82               0.71    None      0.81   
SGDClassifier                      0.79               0.71    None      0.79   
RidgeClassifier                    0.83               0.71    None      0.81   
RidgeClassifierCV                  0.83               0.70    None      0.81   
NearestCentroid                    0.71               0.69    None      0.72   
GaussianNB                         0.71               0.69    None      0.73   
PassiveAggressiveClassifier        0.78               0.69    None      0.78   
BernoulliNB                        0.71 




In [None]:
# Write the second value returned by clf.fit for the dev split to a JSON file
json_file_path = "TaskA_roberta_lazyclassifier_dev.json"

write_json(data=dev_predictions, path=json_file_path)

#### model on test set

In [None]:
# Use LazyClassifier
# predictions=True makes fit() return the per-model predictions as its second
# value. With the default (predictions=False) fit() returns the metrics table
# twice, so `test_predictions` would only be a copy of `models`.
clf = LazyClassifier(predictions=True)
models, test_predictions = clf.fit(train_features, test_features, train_labels, test_labels)

# Display the results
print(models)

 97%|█████████▋| 28/29 [15:13<00:24, 24.03s/it]

[LightGBM] [Info] Number of positive: 3398, number of negative: 10602
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.243058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 14000, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242714 -> initscore=-1.137856
[LightGBM] [Info] Start training from score -1.137856


100%|██████████| 29/29 [15:40<00:00, 32.43s/it]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
LinearDiscriminantAnalysis         0.83               0.73    None      0.82   
LogisticRegression                 0.82               0.73    None      0.82   
SGDClassifier                      0.79               0.72    None      0.79   
Perceptron                         0.78               0.72    None      0.78   
RidgeClassifier                    0.83               0.71    None      0.82   
RidgeClassifierCV                  0.83               0.71    None      0.82   
GaussianNB                         0.72               0.71    None      0.74   
SVC                                0.84               0.71    None      0.82   
NearestCentroid                    0.71               0.71    None      0.73   
LinearSVC                          0.79               0.70    None      0.79   
BernoulliNB                        0.72 




In [None]:
# Write the second value returned by clf.fit for the test split to a JSON file
json_file_path = "TaskA_roberta_lazyclassifier_test.json"

write_json(data=test_predictions, path=json_file_path)