In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from numpy import mean
from numpy import std
import pandas as pd
import numpy as np
import pickle
from flask import Flask, request, jsonify
from lime.lime_text import LimeTextExplainer
from nltk.stem import PorterStemmer
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [3]:
# preprocessing functions
# # Preprocessing functions
# add space before punctuations

app = Flask(__name__)
explainer = LimeTextExplainer(class_names=['female', 'male'])
stemmer = PorterStemmer()
nlp = spacy.load("en_core_web_sm")

# remove gendered pronounds, names, stop words, and apply stemming
def removeUnnecessaryWords(text):
    doc = nlp(text)

    result = " ".join([
        "" if (
            token.pos_ == "PRON" and token.lemma_ not in ["I", "you"]
        ) or (
            token.ent_type_ == "PERSON" or token.text.lower() in ["woman", "women", "man", "men", "he", "she", "him", "her"]
        ) or (
            token.text.lower() in STOP_WORDS
        ) else stemmer.stem(token.lemma_) for token in doc])

    return result.strip()

In [11]:
# load data
dataset = pd.read_csv('../datasets/BUG/balanced_BUG.csv')
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
# get samples with only neutral or stereotype sentence
train = train[train['stereotype'].isin([0, 1])]
test = test[test['stereotype'].isin([0, 1])]

# apply preprocessing
train['sentence_text'] = train['sentence_text'].apply(removeUnnecessaryWords)
test['sentence_text'] = test['sentence_text'].apply(removeUnnecessaryWords)

countV = TfidfVectorizer()
trainTexts = countV.fit_transform(train['sentence_text'])
testTexts = countV.transform(test['sentence_text'])

X_train = pd.DataFrame(trainTexts.toarray(), columns=countV.get_feature_names_out())
X_test = pd.DataFrame(testTexts.toarray(), columns=countV.get_feature_names_out())
y_train = train['predicted gender']
y_test = test['predicted gender']

In [12]:
# Function that utilizes cross validation to test accuracy of model
def evaluate_model(model):
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
    scores = cross_val_score(model, X_test, y_test, cv=cv, scoring='accuracy', n_jobs=-1, error_score='raise')
    return scores

# Function that utilizes cross validation to test accuracy of model
def evaluate_model_f1(model):
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
    scores = cross_val_score(model, X_test, y_test, cv=cv, scoring='f1_macro', n_jobs=-1, error_score='raise')
    return scores

Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
scores = evaluate_model(rf)
scoresf1 = evaluate_model_f1(rf)
print('Accuracy: {:.4f}'.format(scores.mean()))
print('F1-score: {:.4f}'.format(scoresf1.mean()))

Accuracy: 0.9282
F1-score: 0.9281


Ensemble of Ensembles (EoE)

In [15]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score, accuracy_score

clf1 = DecisionTreeClassifier(max_features=1, random_state=0)
clf2 = BaggingClassifier(max_features=4, random_state=0)
clf3 = RandomForestClassifier(max_features=1, random_state=0)
clf4 = AdaBoostClassifier(n_estimators=50, random_state=0)
eclf1 = VotingClassifier(estimators=[('dt', clf1), ('bdt', clf2), ('rf', clf3), ('ab', clf4)], voting='hard')
eclf1 = eclf1.fit(X_train, y_train)
pred = eclf1.predict(X_test)
print('Accuracy: {:.4f}'.format(accuracy_score(y_test, pred)))
print('F1-score: {:.4f}'.format(f1_score(y_test, pred, average='weighted')))

with open('../savedModels/ensembleOfEnsembles.pkl', 'wb') as model_file:
    pickle.dump((eclf1, countV), model_file)

Accuracy: 0.9052
F1-score: 0.9052


Pipeline PCA

In [16]:
# define the pipeline
steps = [('pca', PCA(n_components=10)), ('m', LogisticRegression())]
model1 = Pipeline(steps=steps)

# evaluate model
n_scores1 = evaluate_model(model1)
n_scores1f1 = evaluate_model_f1(model1)

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores1), std(n_scores1)))
print('F1-score: %.3f (%.3f)' % (mean(n_scores1f1), std(n_scores1f1)))

Accuracy: 0.810 (0.028)
F1-score: 0.802 (0.030)


K-Nearest Neighbor (KNN)

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# Applying k = 3, default Minkowski distance metrics
modelknn = KNeighborsClassifier(n_neighbors=4)

# evaluate model
knn_scores1 = evaluate_model(modelknn)
knn_scores1f1 = evaluate_model_f1(modelknn)

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(knn_scores1), std(knn_scores1)))
print('F1-score: %.3f (%.3f)' % (mean(knn_scores1f1), std(knn_scores1f1)))

AttributeError: 'Flags' object has no attribute 'c_contiguous'

ChatGPT examples

In [6]:
X_samples= ['''In the bustling halls of his college, a young man navigated the complexities of academia, 
            friendships, and love, discovering his passion for astronomy while forging lifelong connections 
            under the starlit campus nights.''',
            '''At the forefront of environmental activism, a passionate college girl spearheaded a movement 
            for sustainability, rallying her peers to embrace eco-friendly practices and leaving an enduring 
            green legacy on the university campus''',
            '''Amid the chaos of exams and late-night study sessions, a college guy found unexpected 
            inspiration in a quirky poetry club, where he unearthed his hidden talent for weaving words 
            and discovered the transformative power of self-expression.''',
            '''Juggling lectures, part-time work, and a secret flair for dance, a college guy discovered 
            the joy of breaking societal expectations and embracing his love for rhythm in the unlikeliest places''',
            '''Fueled by caffeine and dreams, a college male embarked on a coding marathon, racing against deadlines 
            and debugging errors, only to realize that the true beauty lay not in perfection but in the process of 
            creation''',
            '''Navigating the complexities of relationships and self-discovery, a young woman in college learned 
            the art of balancing vulnerability and strength, discovering that love was not a distraction but an 
            integral part of personal growth''',
            '''In the heart of campus activism, a socially conscious college guy led a passionate movement, 
            challenging the status quo and igniting conversations that echoed beyond lecture halls, leaving 
            an indelible mark on the institution''',
            '''Navigating the whirlwind of college relationships, a young man learned the delicate dance of 
            vulnerability and trust, discovering that love's lessons often unfolded in unexpected moments 
            of connection and understanding''',
            '''Battling imposter syndrome and academic pressures, a college male found solace and empowerment in 
            a supportive mentorship program, where guidance and camaraderie transformed his doubts into 
            unwavering self-confidence''',
            '''From quiet study sessions to the loud cheers of the basketball court, a college guy embraced 
            the duality of his passions, discovering that both the pursuit of knowledge and the thrill of 
            competition were essential components of his identity''',
            '''Battling imposter syndrome and academic challenges, a college female sought guidance in a mentorship 
            program, where the wisdom of experienced women empowered her to overcome obstacles and embrace her own 
            capabilities''',
            '''In the realm of campus journalism, a determined college male delved into investigative reporting, 
            unearthing hidden truths and exposing corruption, challenging the notion that the pen was not, indeed, 
            mightier than the sword''',
            '''Faced with the crossroads of post-graduation uncertainty, a college guy embarked on a solo 
            backpacking journey, traversing landscapes both external and internal, finding unexpected clarity 
            and purpose in the uncharted territories of self-discovery''',
            '''Balancing lectures and a part-time job, a college girl discovered her love for urban gardening, 
            cultivating not just plants but also a sense of tranquility amidst the bustling campus''',
            '''Fueled by curiosity and countless cups of tea, a college female delved into the world of 
            ancient history, unraveling forgotten tales and finding parallels that connected her to the roots of 
            civilizations''',
            '''A tenacious college woman, armed with a camera and a passion for storytelling, joined the ranks of 
            campus journalism, unearthing hidden narratives and giving voice to the marginalized, proving that 
            the pursuit of truth could be a powerful force for change''',
            '''Confronting post-graduation uncertainties with resilience, a college female embarked on a solo 
            backpacking adventure, navigating both physical landscapes and the landscapes of her own ambitions, 
            discovering that the journey of self-discovery knows no gender''',
            '''From late-night coding sessions to the spotlight on the theater stage, a college girl embraced the 
            diversity of her interests, realizing that the fusion of logic and creativity was the key to 
            unlocking her full potential''']
y_cgpt = [1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0] # 1 is male; 0 is female

cv_cgpt = countV.transform(X_samples)
pd_cgpt = pd.DataFrame(data = cv_cgpt.toarray())
X_cgpt = pd_cgpt.iloc[:,:].values

steps_cgpt = [('pca', PCA(n_components=5)), ('m', LogisticRegression())]
model2 = Pipeline(steps=steps_cgpt)

cv2 = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)
scores2 = cross_val_score(model2, X_cgpt, y_cgpt, cv=cv2, scoring='accuracy', n_jobs=-1, error_score='raise')
scores2f1 = cross_val_score(model2, X_cgpt, y_cgpt, cv=cv2, scoring='f1_macro', n_jobs=-1, error_score='raise')

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores2), std(scores2)))
print('F1-score: %.3f (%.3f)' % (mean(scores2f1), std(scores2f1)))

Accuracy: 0.611 (0.056)


In [7]:
test_sample = ['''Determined to bridge the gap in STEM fields, a college woman immersed herself in robotics 
             and artificial intelligence, breaking barriers and inspiring the next generation of female 
             engineers with her innovative projects and unwavering passion''']

cv_test_cgpt = countV.transform(X_samples)
# cv_test_cgpt = countV.transform(test_sample)
pd_test_cgpt = pd.DataFrame(data = cv_test_cgpt.toarray())
test_cgpt = pd_test_cgpt.iloc[:,:].values

pd_tr = pd.DataFrame(data = trainTexts.toarray())

# add filler columns so that test_cgpt will have the same number of columns as X_train
test_cgpt = pd.DataFrame(test_cgpt).reindex(labels=pd_tr.columns,axis=1,fill_value=0)
test_cgpt = test_cgpt.iloc[:,:].values

In [8]:
# Training the classifier
model2 = model1.fit(X_train, y_train)

In [9]:
model3 = rf.fit(X_train, y_train)

In [10]:
# Testing the classifier
pca_cgpt_pred = model2.predict(test_cgpt)
rf_cgpt_pred = model3.predict(test_cgpt)

expected = y_cgpt   # for X_samples
# expected = [0]    # for test_sample

print('Predicted PCA:', pca_cgpt_pred)
print('Predicted Random Forest:', rf_cgpt_pred)
print('Expected:', expected)

Predicted PCA: ['male' 'female' 'male' 'male' 'female' 'female' 'male' 'female' 'male'
 'male' 'female' 'male' 'female' 'female' 'female' 'female' 'female'
 'female']
Predicted Random Forest: ['male' 'female' 'male' 'male' 'female' 'female' 'male' 'female' 'male'
 'male' 'female' 'male' 'female' 'female' 'female' 'male' 'female'
 'female']
Expected: [1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0]




In [12]:
eoe_model_cgpt = eclf1.predict(test_cgpt)

print('Predicted Random Forest:', eoe_model_cgpt)
print('Expected:', expected)

Predicted Random Forest: ['male' 'female' 'female' 'female' 'male' 'female' 'female' 'female'
 'male' 'male' 'female' 'male' 'male' 'female' 'female' 'female' 'female'
 'female']
Expected: [1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0]


