Ethics of AI Final Project

In [9]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from datasets import load_dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import  DecisionTreeClassifier ,plot_tree, export_text# Decision tree algorithm and plotting functions for the Decision tree
from matplotlib import pyplot as plt # plotting/graphing


In [12]:
# Fetch the speed-dating dataset (train split) through the `datasets` loader.
# NOTE(review): trust_remote_code=True permits the dataset repository's own
# loading/processing script to run locally -- confirm the source is trusted.
dataset = load_dataset("mstz/speeddating", trust_remote_code=True)["train"]

# Materialise the dataset as a pandas DataFrame
df = pd.DataFrame(dataset)

# `is_match` is the target. It is determined by the two *_wants_to_date columns,
# so all three are removed from the feature matrix to avoid leaking the label.
target_related_columns = ['is_match', 'dater_wants_to_date', 'dated_wants_to_date']
X = pd.get_dummies(df.drop(columns=target_related_columns))  # dummy-encode the remaining features
y = df['is_match']

X.columns


Index(['is_dater_male', 'dater_age', 'dated_age', 'age_difference',
       'are_same_race', 'same_race_importance_for_dater',
       'same_religion_importance_for_dater',
       'attractiveness_importance_for_dated', 'sincerity_importance_for_dated',
       'intelligence_importance_for_dated', 'humor_importance_for_dated',
       'ambition_importance_for_dated',
       'shared_interests_importance_for_dated',
       'attractiveness_score_of_dater_from_dated',
       'sincerity_score_of_dater_from_dated',
       'intelligence_score_of_dater_from_dated',
       'humor_score_of_dater_from_dated', 'ambition_score_of_dater_from_dated',
       'shared_interests_score_of_dater_from_dated',
       'attractiveness_importance_for_dater', 'sincerity_importance_for_dater',
       'intelligence_importance_for_dater', 'humor_importance_for_dater',
       'ambition_importance_for_dater',
       'shared_interests_importance_for_dater',
       'self_reported_attractiveness_of_dater',
       'self_repor

In [5]:
# Hold out 33% of the rows as a test set; the seed is pinned to 0 so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [8]:
# Baseline model: random forest
# 125 trees, seed fixed at 0, and at least 2 samples required in every leaf
clf = RandomForestClassifier(
    n_estimators=125,
    min_samples_leaf=2,
    random_state=0,
)
clf.fit(x_train, y_train)  # keep a fitted copy of the model in `clf`

# 10-fold cross-validated ROC AUC on the training split, reported as a percentage
fold_roc_auc_scores = cross_val_score(clf, x_train, y_train, scoring='roc_auc', cv=10)
cross_val_accuracy_roc_auc = fold_roc_auc_scores.mean() * 100

cross_val_accuracy_roc_auc

83.31332309553726

# 1. Preprocessing
Our dataset has a wide variety of features. To keep the analysis focused, we reduce it to a few key features:
- Age of each participant and the age difference within the pairing (sensitive)
- Race of each participant and whether the pairing is of the same race (sensitive)
- `expected_number_of_likes_of_dater_from_20_people` (non-sensitive)
- Already met before (non-sensitive)


In [27]:
# Reduced feature set: the sensitive attributes (age, race) plus two non-sensitive ones
features = [
    'dater_age', 'dated_age', 'age_difference',
    'dater_race', 'dated_race', 'are_same_race',
    'expected_number_of_likes_of_dater_from_20_people',
    'already_met_before',
]

# Work on copies so later edits to X / y never touch df
X = df[features].copy()
y = df['is_match'].copy()

# Size of each group: same-race pairings vs. different-race pairings.
# `~` is the element-wise NOT for a boolean mask; unary `-` is not a supported
# negation for NumPy booleans and only works on a Series by accident.
# NOTE: these are counts of ALL pairings in each group (the labels say
# "Matches", but `y` is not used here).
same_race = X['are_same_race']

print('Matches where pairings are the same race: ', len(X[same_race]))
print('Matches where pairings are not the same race: ', len(X[~same_race]))

Matches where pairings are the same race:  437
Matches where pairings are not the same race:  611


In [30]:
# Count the pairings that ended in a match, split by same-race / different-race.
# The comparison is parenthesised on purpose: `&` binds tighter than `==`, so
# `mask & y==1` would be evaluated as `(mask & y) == 1`, which is only correct
# when y happens to hold 0/1 (or boolean) values.
same_race = X['are_same_race']
is_match = (y == 1)

print('Matches where pairings are the same race and match: ', len(X[same_race & is_match]))
print('Matches where pairings are not the same race and match: ', len(X[~same_race & is_match]))

Matches where pairings are the same race and match:  84
Matches where pairings are not the same race and match:  102


# Split data and Examine Match Rates by Group

In [39]:
# 70/30 train/test split on the reduced feature set (seed pinned for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=372)

# Boolean group masks for each split; `~mask` selects the different-race pairings
# (`~` is the element-wise NOT for boolean masks -- unary `-` is not).
train_same_race = X_train['are_same_race']
test_same_race = X_test['are_same_race']

# Group composition of each split.
# NOTE: the printed values are fractions in [0, 1], even though the labels say '%'.
print('% Same Race in train: ', len(X_train[train_same_race]) / len(X_train))
print('% Not Same Race in train: ', len(X_train[~train_same_race]) / len(X_train))

print('%  Same Race in test: ', len(X_test[test_same_race]) / len(X_test))
print('% Not Same Race test: ', len(X_test[~test_same_race]) / len(X_test))

# Training rows where the couple is a match (features joined with the is_match label)
train_is_match = X_train.join(y_train)
train_is_match = train_is_match[y_train == 1]

# Match rate per group = matched pairings in the group / all pairings in the group
same_race_match_rate = (
    len(train_is_match[train_is_match['are_same_race']]) / len(X_train[train_same_race])
)
diff_race_match_rate = (
    len(train_is_match[~train_is_match['are_same_race']]) / len(X_train[~train_same_race])
)

# Positive value => same-race pairings match more often than different-race pairings
print('Difference in Match Rate between Groups in Training Data: ',
      same_race_match_rate - diff_race_match_rate)

% Same Race in train:  0.4092769440654843
% Not Same Race in train:  0.5907230559345157
%  Same Race in test:  0.43492063492063493
% Not Same Race test:  0.5650793650793651
Difference in Match Rate between Groups in Training Data:  0.029622786759045422
