In [1]:
import numpy
import pandas
import Cleaner
import sklearn
import sklearn.model_selection #train_test_split, GridSearchCV
import sklearn.ensemble #RandomForestClassifier
import sklearn.metrics #accuracy_score, precision_score, recall_score

In [2]:
Cleaner.Clean("b765dc3d8076-trainset.csv")
Cleaner.Clean("b765dc3d8076-testset_for_participants.csv")

In [3]:
df = pandas.read_csv('b765dc3d8076-trainset_cleaned.csv')

Train, temp = sklearn.model_selection.train_test_split(df, test_size=0.4, random_state=2026)

Validation, Testing = sklearn.model_selection.train_test_split(temp, test_size=0.5, random_state=2026)

# Spilt into Train = 60%, Validation = 20%, Testing = 20%

In [4]:
train_features = Train.drop(columns = ['FRAUD_NONFRAUD'])
train_labels = Train['FRAUD_NONFRAUD']

val_features = Validation.drop(columns = ['FRAUD_NONFRAUD'])
val_labels = Validation['FRAUD_NONFRAUD']

test_features = Testing.drop(columns = ['FRAUD_NONFRAUD'])
test_labels = Testing['FRAUD_NONFRAUD']

In [5]:
# function to print out results the GridSearchCV returns
def print_results(results):
    means_lst = results.cv_results_['mean_test_score']
    parameters_lst = results.cv_results_['params' ]
    sorted_parameters = sorted(enumerate(parameters_lst), key = lambda x: means_lst[x[0]] , reverse=True )
    sorted_parameters = map(lambda x: (x[1],means_lst[x[0]]) , sorted_parameters)
    print("sorted")
    for x in list(sorted_parameters):
        print(x[0], round(x[1],4) )

In [6]:
rf = sklearn.ensemble.RandomForestClassifier()
parameters = { 'n_estimators' : [10,20,30,40,50,60,70,80,90,100], 'max_depth' : [8,10,12,15,20,25,None] }

# for breaking a set into 5 subsets (cv=5), and all the combination of parameters
cv = sklearn.model_selection.GridSearchCV(rf, parameters, cv=5)

# try each combination of parameters on the data, trianing on 4 or the 5 subsets, and testing on one of the subsets
cv.fit(train_features , train_labels)

# This takes awhile to run with the amount of parameters choosen, I probally could lower the number of differnt parameters 
# if I was working with a larger data set

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [8, 10, 12, 15, 20, 25, None],
                         'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100]})

In [7]:
print_results(cv)

sorted
{'max_depth': None, 'n_estimators': 80} 0.9526
{'max_depth': 12, 'n_estimators': 50} 0.952
{'max_depth': 15, 'n_estimators': 80} 0.952
{'max_depth': 20, 'n_estimators': 100} 0.952
{'max_depth': 12, 'n_estimators': 100} 0.9519
{'max_depth': None, 'n_estimators': 70} 0.9518
{'max_depth': 15, 'n_estimators': 90} 0.9517
{'max_depth': 25, 'n_estimators': 100} 0.9517
{'max_depth': None, 'n_estimators': 90} 0.9517
{'max_depth': 12, 'n_estimators': 70} 0.9515
{'max_depth': 20, 'n_estimators': 90} 0.9514
{'max_depth': 25, 'n_estimators': 90} 0.9514
{'max_depth': None, 'n_estimators': 50} 0.9514
{'max_depth': 10, 'n_estimators': 90} 0.9513
{'max_depth': 12, 'n_estimators': 80} 0.9513
{'max_depth': 10, 'n_estimators': 50} 0.9512
{'max_depth': 15, 'n_estimators': 100} 0.9512
{'max_depth': 25, 'n_estimators': 50} 0.9512
{'max_depth': 10, 'n_estimators': 70} 0.9511
{'max_depth': 12, 'n_estimators': 90} 0.951
{'max_depth': 15, 'n_estimators': 50} 0.951
{'max_depth': 25, 'n_estimators': 80} 0.9

In [8]:
# refit on best three hyperpramenter paramtization

best_three = [(None,80),(12,50),(15,80)]

rf1 = sklearn.ensemble.RandomForestClassifier(max_depth=best_three[0][0] ,n_estimators=best_three[0][1] )
rf1.fit(train_features, train_labels)

rf2 = sklearn.ensemble.RandomForestClassifier(max_depth=best_three[1][0] ,n_estimators=best_three[1][1])
rf2.fit(train_features, train_labels)

rf3 = sklearn.ensemble.RandomForestClassifier(max_depth=best_three[2][0] ,n_estimators=best_three[2][1])
rf3.fit(train_features, train_labels)

RandomForestClassifier(max_depth=15, n_estimators=80)

In [18]:
print("Validation set")
for a_model in [rf1,rf2,rf3]:
    y_pred = a_model.predict(val_features)
    accuracy = sklearn.metrics.accuracy_score(val_labels, y_pred)
    precision = sklearn.metrics.precision_score(val_labels, y_pred)
    recall = sklearn.metrics.recall_score(val_labels, y_pred)
    print("max depth:", a_model.max_depth, "n-estimators:",a_model.n_estimators )
    print('   F1 Score:', (2*precision*recall)/(precision+recall) ,"\n")


print("\nTest set")
for a_model in [rf1,rf2,rf3]:
    y_pred = a_model.predict(test_features)
    accuracy = sklearn.metrics.accuracy_score(test_labels, y_pred)
    precision = sklearn.metrics.precision_score(test_labels, y_pred)
    recall = sklearn.metrics.recall_score(test_labels, y_pred)
    print("max depth:", a_model.max_depth, "n-estimators:",a_model.n_estimators)
    print('   F1 Score:', (2*precision*recall)/(precision+recall),"\n" )

Validation set
max depth: None n-estimators: 80
   F1 Score: 0.965989847715736 

max depth: 12 n-estimators: 50
   F1 Score: 0.9650278763304613 

max depth: 15 n-estimators: 80
   F1 Score: 0.9662864385297846 


Test set
max depth: None n-estimators: 80
   F1 Score: 0.9670936950514947 

max depth: 12 n-estimators: 50
   F1 Score: 0.9699549323985979 

max depth: 15 n-estimators: 80
   F1 Score: 0.9693928750627195 



In [None]:
# Will choose max_depth=15 n_estimators =80