In [1]:
%pip install datasets
from datasets import load_dataset


[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
import pandas as pd
import numpy as np

In [3]:
# load dataset
dataset = load_dataset('md_gender_bias', 'convai2_inferred', split='train')

Split train & test sets

In [4]:
# Splitting train : test to 90 : 10 ratio
ds = dataset.train_test_split(test_size=0.1)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'binary_label', 'binary_score', 'ternary_label', 'ternary_score'],
        num_rows: 118294
    })
    test: Dataset({
        features: ['text', 'binary_label', 'binary_score', 'ternary_label', 'ternary_score'],
        num_rows: 13144
    })
})

In [5]:
countV = CountVectorizer(token_pattern=r"(?u)\b\w+\b")

cv_X_train = countV.fit_transform(ds['train']['text'])
y_train = np.ravel(ds['train']['binary_label'])
cv_X_test = countV.fit_transform(ds['test']['text'])
y_test = np.ravel(ds['test']['binary_label'])
# y_test = pd.DataFrame(ds['test']['binary_label']).iloc[:,:].values

pd_tr = pd.DataFrame(data = cv_X_train.toarray())
pd_tst = pd.DataFrame(data = cv_X_test.toarray())

In [6]:
X_train = pd_tr.iloc[:,:].values
X_test = pd_tst.iloc[:,:].values
# cv_X_train.shape

Evaluate classifiers

In [7]:
# Function that utilizes cross validation to test accuracy of model
def evaluate_model(model):
    # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)
    scores = cross_val_score(model, X_test, y_test, cv=cv, scoring='accuracy', n_jobs=-1, error_score='raise')
    return scores

In [8]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
scores = evaluate_model(rf)
print('Score: {:.4f}'.format(scores.mean()))

Score: 0.7721


In [8]:
# define the pipeline
steps = [('pca', PCA(n_components=10)), ('m', LogisticRegression())]
model1 = Pipeline(steps=steps)

# evaluate model
n_scores1 = evaluate_model(model1)

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores1), std(n_scores1)))

Accuracy: 0.613 (0.006)


Chat-GPT prompts

In [9]:
X_samples= ['''In the bustling halls of his college, a young man navigated the complexities of academia, 
            friendships, and love, discovering his passion for astronomy while forging lifelong connections 
            under the starlit campus nights.''',
            '''At the forefront of environmental activism, a passionate college girl spearheaded a movement 
            for sustainability, rallying her peers to embrace eco-friendly practices and leaving an enduring 
            green legacy on the university campus''',
            '''Amid the chaos of exams and late-night study sessions, a college guy found unexpected 
            inspiration in a quirky poetry club, where he unearthed his hidden talent for weaving words 
            and discovered the transformative power of self-expression.''',
            '''Juggling lectures, part-time work, and a secret flair for dance, a college guy discovered 
            the joy of breaking societal expectations and embracing his love for rhythm in the unlikeliest places''',
            '''Fueled by caffeine and dreams, a college male embarked on a coding marathon, racing against deadlines 
            and debugging errors, only to realize that the true beauty lay not in perfection but in the process of 
            creation''',
            '''Navigating the complexities of relationships and self-discovery, a young woman in college learned 
            the art of balancing vulnerability and strength, discovering that love was not a distraction but an 
            integral part of personal growth''',
            '''In the heart of campus activism, a socially conscious college guy led a passionate movement, 
            challenging the status quo and igniting conversations that echoed beyond lecture halls, leaving 
            an indelible mark on the institution''',
            '''Navigating the whirlwind of college relationships, a young man learned the delicate dance of 
            vulnerability and trust, discovering that love's lessons often unfolded in unexpected moments 
            of connection and understanding''',
            '''Battling imposter syndrome and academic pressures, a college male found solace and empowerment in 
            a supportive mentorship program, where guidance and camaraderie transformed his doubts into 
            unwavering self-confidence''',
            '''From quiet study sessions to the loud cheers of the basketball court, a college guy embraced 
            the duality of his passions, discovering that both the pursuit of knowledge and the thrill of 
            competition were essential components of his identity''',
            '''Battling imposter syndrome and academic challenges, a college female sought guidance in a mentorship 
            program, where the wisdom of experienced women empowered her to overcome obstacles and embrace her own 
            capabilities''',
            '''In the realm of campus journalism, a determined college male delved into investigative reporting, 
            unearthing hidden truths and exposing corruption, challenging the notion that the pen was not, indeed, 
            mightier than the sword''',
            '''Faced with the crossroads of post-graduation uncertainty, a college guy embarked on a solo 
            backpacking journey, traversing landscapes both external and internal, finding unexpected clarity 
            and purpose in the uncharted territories of self-discovery''',
            '''Balancing lectures and a part-time job, a college girl discovered her love for urban gardening, 
            cultivating not just plants but also a sense of tranquility amidst the bustling campus''',
            '''Fueled by curiosity and countless cups of tea, a college female delved into the world of 
            ancient history, unraveling forgotten tales and finding parallels that connected her to the roots of 
            civilizations''',
            '''A tenacious college woman, armed with a camera and a passion for storytelling, joined the ranks of 
            campus journalism, unearthing hidden narratives and giving voice to the marginalized, proving that 
            the pursuit of truth could be a powerful force for change''',
            '''Confronting post-graduation uncertainties with resilience, a college female embarked on a solo 
            backpacking adventure, navigating both physical landscapes and the landscapes of her own ambitions, 
            discovering that the journey of self-discovery knows no gender''',
            '''From late-night coding sessions to the spotlight on the theater stage, a college girl embraced the 
            diversity of her interests, realizing that the fusion of logic and creativity was the key to 
            unlocking her full potential''']
y_cgpt = [1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0] # 1 is male; 0 is female

cv_cgpt = countV.transform(X_samples)
pd_cgpt = pd.DataFrame(data = cv_cgpt.toarray())
X_cgpt = pd_cgpt.iloc[:,:].values

steps_cgpt = [('pca', PCA(n_components=5)), ('m', LogisticRegression())]
model2 = Pipeline(steps=steps_cgpt)

cv2 = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)
scores2 = cross_val_score(model2, X_cgpt, y_cgpt, cv=cv2, scoring='accuracy', n_jobs=-1, error_score='raise')

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores2), std(scores2)))

Accuracy: 0.778 (0.111)


In [10]:
# features: ['text', 'binary_label', 'binary_score', 'ternary_label', 'ternary_score']
# test_samples: ['text', 0, 0.85, 0, 0.66] -> scores and labels are approximate values
test_sample = ['''Determined to bridge the gap in STEM fields, a college woman immersed herself in robotics 
             and artificial intelligence, breaking barriers and inspiring the next generation of female 
             engineers with her innovative projects and unwavering passion''']

cv_test_cgpt = countV.transform(X_samples)
# cv_test_cgpt = countV.transform(test_sample)
pd_test_cgpt = pd.DataFrame(data = cv_test_cgpt.toarray())
test_cgpt = pd_test_cgpt.iloc[:,:].values

# add filler columns so that test_cgpt will have the same number of columns as X_train
test_cgpt = pd.DataFrame(test_cgpt).reindex(labels=pd_tr.columns,axis=1,fill_value=0)
test_cgpt = test_cgpt.iloc[:,:].values

# pd.get_dummies
test_cgpt.shape
test_cgpt

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
X_train.shape

(118294, 17315)

Training & testing the classifier using Chat-GPT prompts

In [12]:
# Training the classifier
model3 = model2.fit(X_train, y_train)

In [13]:
# model4 = rf.fit(X_train, y_train)

In [14]:
from sklearn.metrics import f1_score

# Testing the classifier
pca_cgpt_pred = model3.predict(test_cgpt)
# rf_cgpt_pred = model4.predict(test_cgpt)

expected = y_cgpt   # for X_samples
# expected = [0]    # for test_sample

print('Predicted PCA:', pca_cgpt_pred)
# print('Predicted Random Forest:', rf_cgpt_pred)
print('Expected:', expected)
# print('F1-score: {:.4f}'.format(f1_score(expected, pca_cgpt_pred, average='macro')))

Predicted PCA: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Expected: [1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0]
