In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from numpy import mean
from numpy import std
import pandas as pd
import numpy as np
import pickle

In [37]:
# load data
dataset = pd.read_csv('datasets/BUG/balanced_BUG.csv')

# Randomly select 10,000 samples from your dataset
subset_dataset = dataset.sample(n=15000, random_state=42)

train, test = train_test_split(subset_dataset, test_size=0.2, random_state=42)

train = train[train['stereotype'].isin([0, 1, -1])]
test = test[test['stereotype'].isin([0, 1, -1])]

countV = CountVectorizer()
trainTexts = countV.fit_transform(train['sentence_text'])
testTexts = countV.transform(test['sentence_text'])

X_train = pd.DataFrame(trainTexts.toarray(), columns=countV.get_feature_names_out())
X_test = pd.DataFrame(testTexts.toarray(), columns=countV.get_feature_names_out())
y_train = train['predicted gender']
y_test = test['predicted gender']

In [38]:
# Function that utilizes cross validation to test accuracy of model
def evaluate_model(model):
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
    scores = cross_val_score(model, X_test, y_test, cv=cv, scoring='accuracy', n_jobs=-1, error_score='raise')
    return scores

Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
scores = evaluate_model(rf)
print('Score: {:.4f}'.format(scores.mean()))

Score: 0.9713


In [47]:
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix
# load data
dataset = pd.read_csv('datasets/BUG/balanced_BUG.csv')

# Randomly select 10,000 samples from your dataset
subset_dataset = dataset.sample(n=15000, random_state=42)

train, test = train_test_split(subset_dataset, test_size=0.2, random_state=42)

train = train[train['stereotype'].isin([0, 1, -1])]
test = test[test['stereotype'].isin([0, 1, -1])]

countV = CountVectorizer()
trainTexts = countV.fit_transform(train['sentence_text'])
testTexts = countV.transform(test['sentence_text'])

X_train = pd.DataFrame(trainTexts.toarray(), columns=countV.get_feature_names_out())
X_test = pd.DataFrame(testTexts.toarray(), columns=countV.get_feature_names_out())
y_train = train['predicted gender']
y_test = test['predicted gender']

# train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

with open('savedModels/anti_stereotypeRFM.pkl', 'wb') as model_file:
    pickle.dump((model, countV), model_file)

In [48]:
# accuracy
pred = model.predict(X_test)

accuracy = accuracy_score(y_test, pred)
print("Accuracy:", accuracy)

# confusion matrix
cm = confusion_matrix(y_test, pred)
print("Confusion matrix:\n", cm)

Accuracy: 0.976
Confusion matrix:
 [[1492   29]
 [  43 1436]]
