In [2]:
!pip install lazypredict

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from lazypredict.Supervised import LazyClassifier
import json

def write_json(data, path):
    output_dict = {
    'predictions': data.to_dict()
    }
    with open(path, "w", encoding="utf-8") as outfile:
        json.dump(output_dict, outfile, indent=4)


In [None]:
root_drive_dir = '/content/drive/My Drive/Colab Notebooks/DataHub Projects/CrispyWork/edos/data/'

dataset = pd.read_csv(os.path.join(root_drive_dir, "edos_labelled_aggregated.csv"))[["text", "label_sexist", "split"]]

#### train, dev, test split

In [None]:
train = dataset[dataset['split'] == 'train']
train = train.reset_index(drop=True)
print('Train set shape:', train.shape)

dev = dataset[dataset['split'] == 'dev']
dev = dev.reset_index(drop=True)
print('Validation set shape:', dev.shape)

test = dataset[dataset['split'] == 'test']
test = test.reset_index(drop=True)
print('Test set shape:', test.shape)

Train set shape: (14000, 3)
Validation set shape: (2000, 3)
Test set shape: (4000, 3)


#### pick 100 random samples per class

In [None]:
# Train
# Randomly select 100 samples from the sexist class
train_sexist_samples = train[train['label_sexist'] == 'sexist']#.sample(n=100, random_state=42)
# Randomly select 100 samples from the not sexist class
train_not_sexist_samples = train[train['label_sexist'] == 'not sexist']#.sample(n=100, random_state=42)
# Combine the selected samples into a single DataFrame
train_selected_samples = pd.concat([train_sexist_samples, train_not_sexist_samples])
# If you want to shuffle the combined DataFrame
train_selected_samples = train_selected_samples.sample(frac=1, random_state=42).reset_index(drop=True)

# Validation
# Randomly select 100 samples from the sexist class
dev_sexist_samples = dev[dev['label_sexist'] == 'sexist']#.sample(n=100, random_state=42)
# Randomly select 100 samples from the not sexist class
dev_not_sexist_samples = dev[dev['label_sexist'] == 'not sexist']#.sample(n=100, random_state=42)
dev_selected_samples = pd.concat([dev_sexist_samples, dev_not_sexist_samples])
dev_selected_samples = dev_selected_samples.sample(frac=1, random_state=42).reset_index(drop=True)

# Test
# Randomly select 100 samples from the sexist class
test_sexist_samples = test[test['label_sexist'] == 'sexist']#.sample(n=100, random_state=42)
# Randomly select 100 samples from the not sexist class
test_not_sexist_samples = test[test['label_sexist'] == 'not sexist']#.sample(n=100, random_state=42)
test_selected_samples = pd.concat([test_sexist_samples, test_not_sexist_samples])
test_selected_samples = test_selected_samples.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
train.head(2)

Unnamed: 0,text,label_sexist,split
0,"Then, she's a keeper. 😉",not sexist,train
1,This is like the Metallica video where the poo...,not sexist,train


### TFIDF

In [None]:
# Step 3: Vectorize text data using TF-IDF
vectorizer = TfidfVectorizer()
vectorizer.fit(train_selected_samples['text'])
x_train_200 = vectorizer.transform(train_selected_samples['text'])
x_dev_200 = vectorizer.transform(dev_selected_samples['text'])
x_test_200 = vectorizer.transform(test_selected_samples['text'])

y_train_200 = train_selected_samples['label_sexist']
y_dev_200 = dev_selected_samples['label_sexist']
y_test_200 = test_selected_samples['label_sexist']

#### model on dev set

In [None]:
from sklearn.utils import all_estimators
from sklearn.base import ClassifierMixin

removed_classifiers = [
"ClassifierChain",
"ComplementNB",
"GradientBoostingClassifier",
"GaussianProcessClassifier",
"HistGradientBoostingClassifier",
"MLPClassifier",
"LogisticRegressionCV",
"MultiOutputClassifier",
"MultinomialNB",
"OneVsOneClassifier",
"OneVsRestClassifier",
"OutputCodeClassifier",
"RadiusNeighborsClassifier",
"VotingClassifier",
'SVC','LabelPropagation','LabelSpreading','NuSV']

classifiers_list = [est for est in all_estimators() if (issubclass(est[1], ClassifierMixin) and (est[0] not in removed_classifiers))]



In [None]:
# LazyClassifier
clf = LazyClassifier(verbose =1, ignore_warnings=False, classifiers=classifiers_list)

models, dev_200samples_predictions = clf.fit(x_train_200.toarray(), x_dev_200.toarray(), y_train_200, y_dev_200)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


  4%|▍         | 1/24 [07:25<2:50:39, 445.20s/it]

ROC AUC couldn't be calculated for AdaBoostClassifier
could not convert string to float: 'not sexist'
{'Model': 'AdaBoostClassifier', 'Accuracy': 0.8315, 'Balanced Accuracy': 0.6784401824398889, 'ROC AUC': None, 'F1 Score': 0.8067010576204034, 'Time taken': 445.2003753185272}


In [None]:
# save results in json file
json_file_path = "TaskA_TFIDF_lazyclassifier_dev.json"

write_json(dev_200samples_predictions, json_file_path)

#### model on test set

In [None]:
# LazyClassifier
clf = LazyClassifier(classifiers=classifiers_list)

models, test_200samples_predictions = clf.fit(x_train_200.toarray(), x_test_200.toarray(), y_train_200, y_test_200)

'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 24/24 [2:37:41<00:00, 394.24s/it]  


In [None]:
# save results in json file
json_file_path = "TaskA_TFIDF_lazyclassifier_test.json"

write_json(test_200samples_predictions, json_file_path)