In [1]:
# lazypredict is imported by the imports cell further down. Install it into
# this kernel's environment only when it is missing, so that a fresh runtime
# can run the notebook top to bottom without a manual, commented-out step.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec("lazypredict") is None:
    # Plain-Python equivalent of `%pip install -q lazypredict`:
    # `sys.executable -m pip` targets the interpreter the kernel is running on.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "lazypredict"])

In [None]:
# Attach Google Drive to the Colab runtime; the dataset path used later in the
# notebook lives under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from lazypredict.Supervised import LazyClassifier
import json

def write_json(data, path):
    """Serialise ``data`` to a JSON file under a single ``"predictions"`` key.

    Args:
        data: Any object exposing ``to_dict()`` (e.g. a pandas DataFrame or
            Series); the result of that call becomes the stored value.
        path: Destination file path. The file is created or overwritten and
            written as UTF-8.
    """
    payload = dict(predictions=data.to_dict())
    with open(path, mode="w", encoding="utf-8") as sink:
        # indent=4 keeps the file human-readable.
        json.dump(payload, fp=sink, indent=4)


In [None]:
# Folder on the mounted Drive that holds the EDOS data files.
root_drive_dir = '/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/data/'

# Read the aggregated, labelled CSV and keep only the rows labelled "sexist",
# restricted to the three columns the rest of the notebook uses.
labelled_csv_path = os.path.join(root_drive_dir, "edos_labelled_aggregated.csv")
kept_columns = ["text", "label_category", "split"]
dataset = (
    pd.read_csv(labelled_csv_path)
    .loc[lambda frame: frame['label_sexist'] == 'sexist', kept_columns]
)

#### Train / dev / test split

In [None]:
def _rows_for_split(frame, split_name):
    """Return the rows of ``frame`` whose ``split`` value is ``split_name``, re-indexed from 0."""
    return frame[frame['split'] == split_name].reset_index(drop=True)


# Carve the filtered dataset into the partitions named in its ``split`` column.
train = _rows_for_split(dataset, 'train')
print('Train set shape:', train.shape)

dev = _rows_for_split(dataset, 'dev')
print('Validation set shape:', dev.shape)

test = _rows_for_split(dataset, 'test')
print('Test set shape:', test.shape)

Train set shape: (3398, 3)
Validation set shape: (486, 3)
Test set shape: (970, 3)


In [None]:
# Show the first two training rows as a quick sanity check of the columns.
train.head(2)

Unnamed: 0,text,label_category,split
0,[USER] Leg day is easy. Hot girls who wear min...,3. animosity,train
1,I get a new pussy every other week or whenever...,2. derogation,train


### TF-IDF features

In [None]:
# Learn the TF-IDF vocabulary and weights from the training texts only, then
# project the dev and test texts into that same feature space.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(train['text'])
x_dev, x_test = (vectorizer.transform(part['text']) for part in (dev, test))

# Targets: the category label of every example, per split.
y_train, y_dev, y_test = (part['label_category'] for part in (train, dev, test))

#### Fit on train, evaluate on the dev set

In [None]:
# Fit LazyClassifier's default suite of models (29 in the recorded run) on the
# training features and score them on the dev split. The sparse TF-IDF
# matrices are densified with .toarray() before being handed over.
# NOTE(review): LazyClassifier is built with its defaults here; with
# `predictions` left unset, the second return value may be the same score
# table as the first rather than per-sample predictions — confirm against the
# lazypredict documentation before relying on the name `dev_predictions`.
clf = LazyClassifier()
models, dev_predictions = clf.fit(
    X_train=x_train.toarray(),
    X_test=x_dev.toarray(),
    y_train=y_train,
    y_test=y_dev,
)

 97%|█████████▋| 28/29 [11:24<00:33, 33.54s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017725 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16713
[LightGBM] [Info] Number of data points in the train set: 3398, number of used features: 462
[LightGBM] [Info] Start training from score -2.394370
[LightGBM] [Info] Start training from score -0.759453
[LightGBM] [Info] Start training from score -1.070466
[LightGBM] [Info] Start training from score -2.322800


100%|██████████| 29/29 [11:31<00:00, 23.85s/it]


In [None]:
# Persist the dev-set output of LazyClassifier as JSON. The path is relative,
# so the file is written to the current working directory.
json_file_path = "TaskB_TFIDF_lazyclassifier_dev.json"
write_json(data=dev_predictions, path=json_file_path)

#### Fit on train, evaluate on the test set

In [None]:
# Repeat the LazyClassifier run with a fresh instance, this time scoring the
# models on the held-out test split. The sparse TF-IDF matrices are densified
# with .toarray() before being handed over.
# NOTE(review): LazyClassifier is built with its defaults here; with
# `predictions` left unset, the second return value may be the same score
# table as the first rather than per-sample predictions — confirm against the
# lazypredict documentation before relying on the name `test_predictions`.
clf = LazyClassifier()
models, test_predictions = clf.fit(
    X_train=x_train.toarray(),
    X_test=x_test.toarray(),
    y_train=y_train,
    y_test=y_test,
)

 97%|█████████▋| 28/29 [12:13<00:39, 39.43s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016981 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16713
[LightGBM] [Info] Number of data points in the train set: 3398, number of used features: 462
[LightGBM] [Info] Start training from score -2.394370
[LightGBM] [Info] Start training from score -0.759453
[LightGBM] [Info] Start training from score -1.070466
[LightGBM] [Info] Start training from score -2.322800


100%|██████████| 29/29 [12:21<00:00, 25.58s/it]


In [None]:
# Persist the test-set output of LazyClassifier as JSON. The path is relative,
# so the file is written to the current working directory.
json_file_path = "TaskB_TFIDF_lazyclassifier_test.json"
write_json(data=test_predictions, path=json_file_path)