In [35]:
from collections import Counter
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import torch.nn as nn

In [65]:
# Load the preprocessed posts; the path is relative to the notebook's working directory.
data = pd.read_csv("aita_processed.csv")

# A row is flagged "y" only when BOTH `age` and `gender` are present, otherwise "n".
# The flag is computed as a single vectorized boolean mask rather than a Python-level
# iterrows() loop with per-cell .at lookups, which scales poorly with the row count.
has_age_and_gender = data["age"].notna() & data["gender"].notna()
data["gendered"] = "n"
data.loc[has_age_and_gender, "gendered"] = "y"

# Partition on the mask. drop() returns new frames without the helper flag column,
# so nothing is deleted in place and `data` itself keeps its "gendered" column.
gendered = data.loc[has_age_and_gender].drop(columns="gendered")
not_gendered = data.loc[~has_age_and_gender].drop(columns="gendered")

In [66]:
# Separate the label column ("verdict") from the features of each subset.
# NOTE(review): pop()/del remove columns from `gendered` / `not_gendered` in place, and
# X_* are the very same objects, so re-running this cell without re-running the cell
# that builds the two frames raises KeyError.

# Gendered subset: keep every remaining column (including age and gender) as features.
y_gendered = gendered.pop("verdict")
X_gendered = gendered

# Non-gendered subset: additionally discard the age and gender columns.
y_not_gendered = not_gendered.pop("verdict")
for _unused_column in ("gender", "age"):
    del not_gendered[_unused_column]
X_not_gendered = not_gendered

0                asshole
1                asshole
2        not the asshole
3                asshole
4        not the asshole
              ...       
97620    not the asshole
97622    not the asshole
97625    not the asshole
97626    not the asshole
97627    not the asshole
Name: verdict, Length: 88288, dtype: object

In [73]:
# Swap every NaN in the non-gendered features for an empty string, then display the
# total number of missing values still present as the cell's result.
X_not_gendered = X_not_gendered.replace(to_replace=np.nan, value="")
X_not_gendered.isna().sum().sum()

0


## Over-sampling

In [60]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import SMOTEN

In [63]:
# Over-sample the gendered subset with SMOTEN; the seed is pinned to 224 so the
# resampled output is repeatable.
sampler_gendered = SMOTEN(random_state=224)
X_res_gendered, y_res_gendered = sampler_gendered.fit_resample(
    X=X_gendered,
    y=y_gendered,
)

In [74]:
# Over-sample the non-gendered subset with its own SMOTEN instance, using the same
# pinned seed (224).
sampler_not_gendered = SMOTEN(random_state=224)
X_res_not_gendered, y_res_not_gendered = sampler_not_gendered.fit_resample(
    X=X_not_gendered,
    y=y_not_gendered,
)

In [76]:
# Two-stage split of the over-sampled data into train / dev / test.
# Stage 1 holds out 10% as test; stage 2 takes 0.1111 of the remaining 90% (about 10%
# of the total) as dev, leaving roughly 80/10/10.
# NOTE(review): the inputs are the SMOTEN-resampled frames, so the dev and test
# partitions are drawn from resampled data as well. Confirm this is intended; the
# usual alternative is to resample only the training portion after splitting.

# --- gendered subset ---
(
    X_traindev_gendered,
    X_test_gendered,
    y_traindev_gendered,
    y_test_gendered,
) = train_test_split(X_res_gendered, y_res_gendered, test_size=0.10, random_state=224)
(
    X_train_gendered,
    X_dev_gendered,
    y_train_gendered,
    y_dev_gendered,
) = train_test_split(X_traindev_gendered, y_traindev_gendered, test_size=0.1111, random_state=224)

# --- non-gendered subset ---
(
    X_traindev_not_gendered,
    X_test_not_gendered,
    y_traindev_not_gendered,
    y_test_not_gendered,
) = train_test_split(X_res_not_gendered, y_res_not_gendered, test_size=0.10, random_state=224)
(
    X_train_not_gendered,
    X_dev_not_gendered,
    y_train_not_gendered,
    y_dev_not_gendered,
) = train_test_split(X_traindev_not_gendered, y_traindev_not_gendered, test_size=0.1111, random_state=224)

In [80]:
# Write every partition to a CSV named after its variable, in the working directory.
# The order matches the original cell: gendered features, gendered labels, then the
# non-gendered features and labels.
for _partition, _csv_path in [
    (X_train_gendered, "X_train_gendered.csv"),
    (X_dev_gendered, "X_dev_gendered.csv"),
    (X_test_gendered, "X_test_gendered.csv"),
    (y_train_gendered, "y_train_gendered.csv"),
    (y_dev_gendered, "y_dev_gendered.csv"),
    (y_test_gendered, "y_test_gendered.csv"),
    (X_train_not_gendered, "X_train_not_gendered.csv"),
    (X_dev_not_gendered, "X_dev_not_gendered.csv"),
    (X_test_not_gendered, "X_test_not_gendered.csv"),
    (y_train_not_gendered, "y_train_not_gendered.csv"),
    (y_dev_not_gendered, "y_dev_not_gendered.csv"),
    (y_test_not_gendered, "y_test_not_gendered.csv"),
]:
    _partition.to_csv(_csv_path)

## Splitting Data (Without Over-sampling)

In [14]:
# Two-stage train / dev / test split of the original (not over-sampled) data.
# Stage 1 holds out 10% as test; stage 2 takes 0.1111 of the remaining 90% (about 10%
# of the total) as dev, leaving roughly 80/10/10.
# NOTE(review): this rebinds the same X_*/y_* names that the over-sampled split
# assigns, so which data those names hold depends on which cell ran last.

# --- gendered subset ---
(
    X_traindev_gendered,
    X_test_gendered,
    y_traindev_gendered,
    y_test_gendered,
) = train_test_split(X_gendered, y_gendered, test_size=0.10, random_state=224)
(
    X_train_gendered,
    X_dev_gendered,
    y_train_gendered,
    y_dev_gendered,
) = train_test_split(X_traindev_gendered, y_traindev_gendered, test_size=0.1111, random_state=224)

# --- non-gendered subset ---
(
    X_traindev_not_gendered,
    X_test_not_gendered,
    y_traindev_not_gendered,
    y_test_not_gendered,
) = train_test_split(X_not_gendered, y_not_gendered, test_size=0.10, random_state=224)
(
    X_train_not_gendered,
    X_dev_not_gendered,
    y_train_not_gendered,
    y_dev_not_gendered,
) = train_test_split(X_traindev_not_gendered, y_traindev_not_gendered, test_size=0.1111, random_state=224)


## Generate Samples for Human Baseline

In [15]:
# Human-baseline batch 1: positions 0-49 of the non-gendered test split, with the
# features and the matching verdicts cut by the same positional slice.
baseline_not_gendered_1, baseline_not_gendered_1_verdict = (
    part.iloc[:50] for part in (X_test_not_gendered, y_test_not_gendered)
)

# Features and verdicts go to separate files.
baseline_not_gendered_1.to_csv("baseline_not_gendered_1.csv")
baseline_not_gendered_1_verdict.to_csv("baseline_not_gendered_1_verdict.csv")

In [16]:
# Human-baseline batch 2: positions 50-99 of the non-gendered test split, with the
# features and the matching verdicts cut by the same positional slice.
baseline_not_gendered_2, baseline_not_gendered_2_verdict = (
    part.iloc[50:100] for part in (X_test_not_gendered, y_test_not_gendered)
)

# Features and verdicts go to separate files.
baseline_not_gendered_2.to_csv("baseline_not_gendered_2.csv")
baseline_not_gendered_2_verdict.to_csv("baseline_not_gendered_2_verdict.csv")

In [17]:
# Human-baseline batch 3: positions 100-149 of the non-gendered test split, with the
# features and the matching verdicts cut by the same positional slice.
baseline_not_gendered_3, baseline_not_gendered_3_verdict = (
    part.iloc[100:150] for part in (X_test_not_gendered, y_test_not_gendered)
)

# Features and verdicts go to separate files.
baseline_not_gendered_3.to_csv("baseline_not_gendered_3.csv")
baseline_not_gendered_3_verdict.to_csv("baseline_not_gendered_3_verdict.csv")

In [18]:
# Human-baseline batch 4: positions 150-199 of the non-gendered test split, with the
# features and the matching verdicts cut by the same positional slice.
baseline_not_gendered_4, baseline_not_gendered_4_verdict = (
    part.iloc[150:200] for part in (X_test_not_gendered, y_test_not_gendered)
)

# Features and verdicts go to separate files.
baseline_not_gendered_4.to_csv("baseline_not_gendered_4.csv")
baseline_not_gendered_4_verdict.to_csv("baseline_not_gendered_4_verdict.csv")