In [1]:
#!pip install lazypredict

In [2]:
# Attach Google Drive to the Colab runtime at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from lazypredict.Supervised import LazyClassifier
import json

def write_json(data, path):
    """Write ``data.to_dict()`` to a JSON file under a top-level "predictions" key.

    Parameters
    ----------
    data : object exposing ``to_dict()`` (e.g. a pandas DataFrame)
        The result of ``data.to_dict()`` becomes the value stored under
        the "predictions" key.
    path : str or os.PathLike
        Destination file. It is overwritten if it already exists; missing
        parent directories are created first.

    Returns
    -------
    None
    """
    # Create the output folder on demand so a fresh Drive/runtime does not
    # fail with FileNotFoundError. dirname is '' for a bare filename, in
    # which case there is nothing to create.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    output_dict = {'predictions': data.to_dict()}
    with open(path, "w", encoding="utf-8") as outfile:
        json.dump(output_dict, outfile, indent=4)


In [4]:
root_drive_dir = '/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/data/'

# Read the aggregated labelled CSV, then keep only the rows whose
# label_sexist is 'sexist' and only the four columns used downstream.
dataset = (
    pd.read_csv(os.path.join(root_drive_dir, "edos_labelled_aggregated.csv"))
    .loc[
        lambda frame: frame['label_sexist'] == 'sexist',
        ["text", "label_category", "label_vector", "split"],
    ]
)

In [5]:
# Show the (rows, columns) of the filtered dataset.
dataset.shape

(4854, 4)

In [7]:
# Count how many rows carry each label_vector value (most frequent first).
dataset['label_vector'].value_counts()

2.1 descriptive attacks                                            1024
2.2 aggressive and emotive attacks                                  961
3.1 casual use of gendered slurs, profanities, and insults          910
3.2 immutable gender differences and gender stereotypes             596
4.2 supporting systemic discrimination against women as a group     368
1.2 incitement and encouragement of harm                            363
2.3 dehumanising attacks & overt sexual objectification             286
4.1 supporting mistreatment of individual women                     107
3.3 backhanded gendered compliments                                  91
1.1 threats of harm                                                  80
3.4 condescending explanations or unwelcome advice                   68
Name: label_vector, dtype: int64

#### Train / dev / test split

In [8]:
# Partition the data using the pre-assigned values in the 'split' column.
# Each subset gets a fresh 0-based index.
train = dataset.loc[dataset['split'].eq('train')].reset_index(drop=True)
dev = dataset.loc[dataset['split'].eq('dev')].reset_index(drop=True)
test = dataset.loc[dataset['split'].eq('test')].reset_index(drop=True)

# Report the size of every partition.
print('Train set shape:', train.shape)
print('Validation set shape:', dev.shape)
print('Test set shape:', test.shape)

Train set shape: (3398, 4)
Validation set shape: (486, 4)
Test set shape: (970, 4)


### TF-IDF features

In [9]:
# Turn the raw text into TF-IDF features. The vectorizer is fitted on the
# training texts only and then reused unchanged for dev and test.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(train['text'])
x_dev = vectorizer.transform(dev['text'])
x_test = vectorizer.transform(test['text'])

# Targets: the label_vector column of each split.
y_train, y_dev, y_test = train['label_vector'], dev['label_vector'], test['label_vector']

#### Models evaluated on the dev set

In [10]:
# Fit lazypredict's classifiers on the TF-IDF training features and
# evaluate them on the dev split.
#
# predictions=True is needed to get the models' dev-set predictions back.
# With the default (predictions=False) `fit` returns the score table as
# BOTH tuple elements, so `dev_predictions` would hold metrics rather than
# predictions.
#
# `models` is the score table; `dev_predictions` is a DataFrame of predictions.
# .toarray() converts the sparse TF-IDF matrices to dense arrays.
clf = LazyClassifier(predictions=True)
models, dev_predictions = clf.fit(x_train.toarray(), x_dev.toarray(), y_train, y_dev)

 97%|█████████▋| 28/29 [15:18<00:42, 42.40s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16713
[LightGBM] [Info] Number of data points in the train set: 3398, number of used features: 462
[LightGBM] [Info] Start training from score -4.105591
[LightGBM] [Info] Start training from score -2.593608
[LightGBM] [Info] Start training from score -1.555866
[LightGBM] [Info] Start training from score -1.619197
[LightGBM] [Info] Start training from score -2.832625
[LightGBM] [Info] Start training from score -1.674173
[LightGBM] [Info] Start training from score -2.097856
[LightGBM] [Info] Start training from score -3.972059
[LightGBM] [Info] Start training from score -4.280795
[LightGBM] [Info] Start training from score -3.813454
[LightGBM] [Info] Start training from score -2.577983


100%|██████████| 29/29 [15:31<00:00, 32.12s/it]


In [14]:
# Persist the dev-set LazyClassifier output as a JSON file on Drive.
json_file_path = "/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/01_TFIDF_lazyclassifier/TaskC_TFIDF_lazyclassifier_dev.json"

write_json(data=dev_predictions, path=json_file_path)

#### Models evaluated on the test set

In [12]:
# Fit lazypredict's classifiers on the TF-IDF training features and
# evaluate them on the test split.
#
# predictions=True is needed to get the models' test-set predictions back.
# With the default (predictions=False) `fit` returns the score table as
# BOTH tuple elements, so `test_predictions` would hold metrics rather than
# predictions.
#
# `models` is the score table; `test_predictions` is a DataFrame of predictions.
# .toarray() converts the sparse TF-IDF matrices to dense arrays.
clf = LazyClassifier(predictions=True)
models, test_predictions = clf.fit(x_train.toarray(), x_test.toarray(), y_train, y_test)

 97%|█████████▋| 28/29 [16:03<00:45, 45.50s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16713
[LightGBM] [Info] Number of data points in the train set: 3398, number of used features: 462
[LightGBM] [Info] Start training from score -4.105591
[LightGBM] [Info] Start training from score -2.593608
[LightGBM] [Info] Start training from score -1.555866
[LightGBM] [Info] Start training from score -1.619197
[LightGBM] [Info] Start training from score -2.832625
[LightGBM] [Info] Start training from score -1.674173
[LightGBM] [Info] Start training from score -2.097856
[LightGBM] [Info] Start training from score -3.972059
[LightGBM] [Info] Start training from score -4.280795
[LightGBM] [Info] Start training from score -3.813454
[LightGBM] [Info] Start training from score -2.577983


100%|██████████| 29/29 [16:15<00:00, 33.65s/it]


In [15]:
# Persist the test-set LazyClassifier output as a JSON file on Drive.
json_file_path = "/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/01_TFIDF_lazyclassifier/TaskC_TFIDF_lazyclassifier_test.json"

write_json(data=test_predictions, path=json_file_path)