In [10]:
## 6.a
import boto3, botocore
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score
from itertools import product

## connect to the s3 bucket that stores the project files
s3 = boto3.resource('s3')
bucket = s3.Bucket('danhtran358-data-445-bucket')

## fetch the cleaned data file and parse its csv body into a data-frame
bucket_object = bucket.Object('project_cleaned_data.csv')
cleaned_file_object = bucket_object.get()
diabetes_cleaned = pd.read_csv(cleaned_file_object.get('Body'))
diabetes_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,1
388,1,128,88,39,110,36.5,1.057,37,1
389,2,88,58,26,16,28.4,0.766,22,0
390,10,101,76,48,180,32.9,0.171,63,0


In [11]:
## fetch the extended data file (features kept after LASSO) and parse its csv body into a data-frame
bucket_object = bucket.Object('project_cleaned_data_extended_after_LASSO.csv')
extended_file_object = bucket_object.get()
diabetes_extended = pd.read_csv(extended_file_object.get('Body'))
diabetes_extended

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,35,33.6,0.627,50,1
1,1,85,29,26.6,0.351,31,0
2,1,89,23,28.1,0.167,21,0
3,0,137,35,43.1,2.288,33,1
4,3,78,32,31.0,0.248,26,1
...,...,...,...,...,...,...,...
529,9,170,31,44.0,0.403,43,1
530,10,101,48,32.9,0.171,63,0
531,2,122,27,36.8,0.340,27,0
532,5,121,23,26.2,0.245,30,0


In [12]:
## function to write a data_frame as a csv file object in the S3 bucket
def write_data_to_s3(file_name, data_frame):
    """Upload ``data_frame`` as csv text under the key ``file_name``.

    The data-frame index is not written (``index=False``). The upload goes
    to the module-level ``bucket``. Returns None.
    """
    ## serialize first, then hand the text to the s3 object in a single put
    csv_text = data_frame.to_csv(index=False)
    bucket.Object(file_name).put(Body = csv_text)
    

## function to read Random Forest data stored in s3 csv to dataframe
def read_rf_data(file_name):
    """Return the Random Forest results table stored as ``file_name``, creating it if missing.

    When the object does not exist (error code "404"), a fresh table is built
    from ``rf_dictionary`` with one zero-initialised column per
    cut-off/score pair, written to the bucket, and then read back.

    Raises:
        botocore.exceptions.ClientError: for any error other than "404".
            Previously such errors were swallowed and the function returned None.
    """
    ## file object in s3 bucket
    rf_data_file = bucket.Object(file_name)

    try:
        ## load() is used as an existence check for the object
        rf_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            ## not a "file missing" situation: surface the problem to the caller
            raise

        ## file does not exist yet, create new file
        results = expand_grid(rf_dictionary)

        ## will not work on extended data with 8 feature columns
        results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)

        ## create columns for all types of cut-off values and scores
        for cut_off_value in cut_off:
            for score_name in score_to_evaluate:
                results[str(cut_off_value) + '_' + score_name] = 0.0

        write_data_to_s3(file_name, results)

    ## return the dataframe stored in s3 (already there, or just created above)
    return pd.read_csv(rf_data_file.get().get('Body'))

    
## function to read AdaBoosting/Gradient Boosting data stored in s3 csv to dataframe
def read_boosting_data(file_name):
    """Return the boosting results table stored as ``file_name``, creating it if missing.

    When the object does not exist (error code "404"), a fresh table is built
    from ``boosting_dictionary`` with one zero-initialised column per
    cut-off/score pair, written to the bucket, and then read back.

    Raises:
        botocore.exceptions.ClientError: for any error other than "404".
            Previously such errors were swallowed and the function returned None.
    """
    ## file object in s3 bucket
    boosting_data_file = bucket.Object(file_name)

    try:
        ## load() is used as an existence check for the object
        boosting_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            ## not a "file missing" situation: surface the problem to the caller
            raise

        ## file does not exist yet, create new file
        results = expand_grid(boosting_dictionary)

        ## will not work on extended data with 8 feature columns
        results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)

        ## create columns for all types of cut-off values and scores
        for cut_off_value in cut_off:
            for score_name in score_to_evaluate:
                results[str(cut_off_value) + '_' + score_name] = 0.0

        write_data_to_s3(file_name, results)

    ## return the dataframe stored in s3 (already there, or just created above)
    return pd.read_csv(boosting_data_file.get().get('Body'))
    

## function to read a results csv stored in s3 to dataframe, creating the file when missing
def read_data_from_s3(file_name, X = None, param_grid = None):
    """Return the data-frame stored as ``file_name``, creating the file first if it is missing.

    Parameters:
        file_name: key of the csv object in the module-level ``bucket``.
        X: optional data-frame. When given and the file is missing, the new
            file gets the columns of ``X`` plus a ``total_loops`` column whose
            first row is 0.
        param_grid: optional dictionary of parameter lists, used to build the
            score table when the file is missing and ``X`` is None.

    Raises:
        botocore.exceptions.ClientError: for any error other than "404".
            Previously such errors were swallowed and the function returned None.
    """
    ## file object in s3 bucket
    data_file = bucket.Object(file_name)

    try:
        ## load() is used as an existence check for the object
        data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            ## not a "file missing" situation: surface the problem to the caller
            raise

        ## file does not exist yet, create new file
        if X is None:
            ## NOTE(review): the original code always used a global named `dictionary`,
            ## which is not defined in the visible notebook; pass `param_grid` to avoid
            ## depending on it (the global is still the fallback for compatibility)
            results = expand_grid(dictionary if param_grid is None else param_grid)

            ## will not work on extended data with 8 feature columns
            results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)

            ## create columns for all types of cut-off values and scores
            for cut_off_value in cut_off:
                for score_name in score_to_evaluate:
                    results[str(cut_off_value) + '_' + score_name] = 0.0

        else:
            ## empty dataframe with first row has 0 for total loops
            results = pd.DataFrame(list(), columns = X.columns)
            results.at[0, 'total_loops'] = 0

        ## write brand new and empty file to s3
        write_data_to_s3(file_name, results)

    ## return the dataframe stored in s3 (already there, or just created above)
    return pd.read_csv(data_file.get().get('Body'))

In [13]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']

## read ensemble feature importances data stored in s3 file
data_file_name = 'project_ensemble_feature_importances.csv'
results = read_data_from_s3(data_file_name, X)

## List to store feature importances.
## It is seeded with the rows already stored in s3: the loop below resumes from the stored
## total_loops, so starting from an empty list would overwrite the earlier importances
## with only the ones computed in this run. The placeholder row of a brand new file has
## no importances (all missing) and is dropped.
all_feature_importances = list(results[X.columns].dropna().to_numpy())

for loop_number in range(int(results.at[0, 'total_loops']), 1000):
    ## Split data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## Build models and store feature importances
    md_rf = RandomForestClassifier(max_depth = 3, n_estimators = 500).fit(X_train, Y_train)
    all_feature_importances.append(md_rf.feature_importances_)
    
    md_ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    all_feature_importances.append(md_ada.feature_importances_)
    
    md_grad = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    all_feature_importances.append(md_grad.feature_importances_)
    
    ## checkpoint after every split: all importances so far plus the number of loops done
    all_feature_importances_df = pd.DataFrame(all_feature_importances, columns = X.columns)
    all_feature_importances_df['total_loops'] = loop_number + 1
    
    write_data_to_s3(data_file_name, all_feature_importances_df)
    
## Calculate the average importances of variables across 1000 splits and 3 models
## (the second argument must be the data-frame itself, read_data_from_s3 reads X.columns)
results = read_data_from_s3(data_file_name, X)
print(np.mean(results, axis = 0))

Pregnancies                    0.067908
Glucose                        0.327742
BloodPressure                  0.051193
SkinThickness                  0.054225
Insulin                        0.136200
BMI                            0.110167
DiabetesPedigreeFunction       0.105079
Age                            0.147487
total_loops                 1000.000000
dtype: float64


In [14]:
## The clean data from project_cleaned_data.csv has missing value observations deleted

## Here the raw data is cleaned again, checking only columns that are kept
## after the less important columns are dropped
bucket_object = bucket.Object('diabetes.csv')
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## read file content to data-frame
diabetes_not_cleaned = pd.read_csv(file_content_stream)

## dropping columns less important
diabetes_important = diabetes_not_cleaned.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])

## Preprocessing - Clean missing data values
## a value of 0 in Glucose, Insulin or BMI marks a missing value; keep rows without any
has_no_missing = (diabetes_important[['Glucose', 'Insulin', 'BMI']] != 0).all(axis = 1)
diabetes_important = diabetes_important.loc[has_no_missing].reset_index(drop = True)

diabetes_important

Unnamed: 0,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,89,94,28.1,0.167,21,0
1,137,168,43.1,2.288,33,1
2,78,88,31.0,0.248,26,1
3,197,543,30.5,0.158,53,1
4,189,846,30.1,0.398,59,1
...,...,...,...,...,...,...
387,181,510,43.3,0.222,26,1
388,128,110,36.5,1.057,37,1
389,88,16,28.4,0.766,22,0
390,101,180,32.9,0.171,63,0


In [15]:
## upload the re-cleaned data-frame (important features only) to the s3 bucket as csv
write_data_to_s3(file_name = 'project_cleaned_data_extended_after_feature_importances.csv',
                 data_frame = diabetes_important)

In [16]:
## Use dataframes to store parameters to build models and store total scores
def expand_grid(dictionary):
    """Return a data-frame with one row per combination of the value lists in ``dictionary``.

    The columns are the dictionary keys. Rows follow the order produced by
    ``itertools.product`` over the value lists.
    """
    combinations = list(product(*dictionary.values()))
    return pd.DataFrame(combinations, columns = dictionary.keys())

## parameter lists for the Random Forest models; total_loops starts at 0 for every combination
rf_dictionary = {
    'extended_data': ['Y', 'N'],
    'input_layer': [5, 6, 8],
    'total_loops': [0],
    'n_tree': [100, 500, 1000, 1500, 2000],
    'depth': [3, 5, 7],
}

## boosting models use the same parameter lists (as separate copies) plus the learning rates
boosting_dictionary = {name: list(values) for name, values in rf_dictionary.items()}
boosting_dictionary['learning_rate'] = [0.1, 0.01, 0.001]

## lists of cut-off values and types of score to evaluate models
cut_off = [
    0.1, 0.15, 0.2, 0.25,
    0.3, 0.35, 0.4, 0.45,
    0.5, 0.55, 0.6, 0.65,
]
score_to_evaluate = ['precision', 'recall', 'f1']


In [17]:
## update the scores in result dataset after each model is built
def update_result_scores(pred, Y_test, results, combo_number):
    """Add one model's scores to row ``combo_number`` of ``results`` (modified in place).

    For every value in ``cut_off`` the predictions ``pred`` are turned into
    labels (0 below the cut-off, otherwise 1). Each score named in
    ``score_to_evaluate`` is then added to the column '<cut-off>_<score>'.
    Score names other than 'precision', 'recall' and 'f1' are skipped.
    """
    ## one scorer per supported score name; only precision is computed with zero_division = 0
    scorers = {
        'precision': lambda labels: precision_score(Y_test, labels, zero_division = 0),
        'recall': lambda labels: recall_score(Y_test, labels),
        'f1': lambda labels: f1_score(Y_test, labels),
    }

    for current_cut_off in cut_off:

        ## classify labels
        pred_labels = np.where(pred < current_cut_off, 0, 1)

        for current_score in score_to_evaluate:
            scorer = scorers.get(current_score)
            if scorer is None:
                continue

            ## accumulate the score in the matching column
            score_column = str(current_cut_off) + '_' + current_score
            results.at[combo_number, score_column] = results.at[combo_number, score_column] + scorer(pred_labels)

In [32]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_important = X.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']
X_important_extended = diabetes_important.drop(columns = ['Outcome'])
Y_important_extended = diabetes_important['Outcome']

## (uses extended data?, number of input features) -> (inputs, target) to split
data_sets = {
    (False, 5): (X_important, Y),                                  ## cleaned data, important features
    (False, 6): (X_lasso, Y),                                      ## cleaned data, LASSO features
    (False, 8): (X, Y),                                            ## cleaned data, all features
    (True, 5): (X_important_extended, Y_important_extended),       ## extended data, important features
    (True, 6): (X_extended, Y_extended),                           ## extended data, LASSO features
}

## read Random Forest data stored in s3 file
data_file_name = 'project_rf_data.csv'
results = read_rf_data(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
loops_done = results.at[0, 'total_loops']
for loop_number in range(loops_done, 100):

    ## Build Random Forest models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]

        ## every value other than 'N' counts as extended data; a combination without
        ## a matching data set keeps the split of the previous combination
        chosen = data_sets.get((parameters['extended_data'] != 'N', parameters['input_layer']))
        if chosen is not None:
            inputs, target = chosen
            X_train, X_test, Y_train, Y_test = train_test_split(inputs, target, test_size = 0.2, stratify = target)

        ## Building model
        md_rf = RandomForestClassifier(max_depth = parameters['depth'], n_estimators = int(parameters['n_tree']))
        md_rf.fit(X_train, Y_train)

        ## Predicting
        pred = md_rf.predict_proba(X_test)[:, 1]

        update_result_scores(pred, Y_test, results, combo_number)

    results['total_loops'] = loop_number + 1

    ## Writing data to s3
    write_data_to_s3(data_file_name, results)
    

In [19]:
## average f1 score of every parameter combination, ranked separately for each cut-off value
loops_run = results.at[0, 'total_loops']
score_to_check = 'f1'

## one (combination index, average score) frame per cut-off, best score first
ranked_scores = [
    pd.DataFrame(results[str(cut_off_value) + '_' + score_to_check].sort_values(ascending = [False]) / loops_run).reset_index()
    for cut_off_value in cut_off
]

## place the ranked frames side by side
all_f1_scores = pd.concat(ranked_scores, axis = 1)

all_f1_scores

Unnamed: 0,index,0.1_f1,index.1,0.15_f1,index.2,0.2_f1,index.3,0.25_f1,index.4,0.3_f1,...,index.5,0.45_f1,index.6,0.5_f1,index.7,0.55_f1,index.8,0.6_f1,index.9,0.65_f1
0,14,0.618638,5,0.662890,40,0.693587,34,0.712829,9,0.717435,...,41,0.685717,40,0.665861,11,0.648308,43,0.618995,43,0.569343
1,32,0.617936,14,0.662693,14,0.692045,40,0.709865,33,0.716882,...,8,0.684180,11,0.665586,41,0.643648,11,0.616428,5,0.568850
2,35,0.615984,38,0.662306,34,0.690986,10,0.709686,30,0.716565,...,43,0.683426,43,0.665274,43,0.642135,41,0.615657,40,0.566069
3,8,0.615832,41,0.661814,35,0.690911,4,0.708205,6,0.716149,...,11,0.682434,41,0.665210,44,0.640492,8,0.612064,11,0.562108
4,38,0.615765,35,0.660997,5,0.690291,43,0.706970,34,0.715982,...,1,0.681671,8,0.664792,1,0.639712,40,0.611674,41,0.561166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,51,0.546040,69,0.590930,54,0.637729,51,0.666353,48,0.675689,...,15,0.643942,66,0.615132,45,0.568015,60,0.488070,45,0.394826
71,66,0.546038,66,0.590632,51,0.636424,54,0.665004,47,0.674073,...,27,0.643756,60,0.614223,60,0.568008,63,0.485459,63,0.374775
72,63,0.545409,63,0.589754,63,0.635688,60,0.660776,45,0.673322,...,45,0.640844,45,0.613370,72,0.565159,69,0.482761,69,0.374487
73,60,0.544471,60,0.586559,60,0.634519,72,0.658719,51,0.673239,...,48,0.640204,69,0.613312,69,0.561900,72,0.477412,72,0.369409


In [28]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_important = X.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']
X_important_extended = diabetes_important.drop(columns = ['Outcome'])
Y_important_extended = diabetes_important['Outcome']

## (uses extended data?, number of input features) -> (inputs, target) to split
data_sets = {
    (False, 5): (X_important, Y),                                  ## cleaned data, important features
    (False, 6): (X_lasso, Y),                                      ## cleaned data, LASSO features
    (False, 8): (X, Y),                                            ## cleaned data, all features
    (True, 5): (X_important_extended, Y_important_extended),       ## extended data, important features
    (True, 6): (X_extended, Y_extended),                           ## extended data, LASSO features
}

## read AdaBoost data stored in s3 file
data_file_name = 'project_ada_data.csv'
results = read_boosting_data(data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
loops_done = results.at[0, 'total_loops']
for loop_number in range(loops_done, 100):

    ## Build AdaBoost models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]

        ## every value other than 'N' counts as extended data; a combination without
        ## a matching data set keeps the split of the previous combination
        chosen = data_sets.get((parameters['extended_data'] != 'N', parameters['input_layer']))
        if chosen is not None:
            inputs, target = chosen
            X_train, X_test, Y_train, Y_test = train_test_split(inputs, target, test_size = 0.2, stratify = target)

        ## Building model
        weak_learner = DecisionTreeClassifier(max_depth = parameters['depth'])
        md_ada = AdaBoostClassifier(base_estimator = weak_learner,
                                    n_estimators = int(parameters['n_tree']),
                                    learning_rate = parameters['learning_rate'])
        md_ada.fit(X_train, Y_train)

        ## Predicting
        pred = md_ada.predict_proba(X_test)[:, 1]

        update_result_scores(pred, Y_test, results, combo_number)

    results['total_loops'] = loop_number + 1

    ## Writing data to s3
    write_data_to_s3(data_file_name, results)
    

In [21]:
## average f1 score of every parameter combination, ranked separately for each cut-off value
loops_run = results.at[0, 'total_loops']
score_to_check = 'f1'

## one (combination index, average score) frame per cut-off, best score first
ranked_scores = [
    pd.DataFrame(results[str(cut_off_value) + '_' + score_to_check].sort_values(ascending = [False]) / loops_run).reset_index()
    for cut_off_value in cut_off
]

## place the ranked frames side by side
all_f1_scores = pd.concat(ranked_scores, axis = 1)

all_f1_scores

Unnamed: 0,index,0.1_f1,index.1,0.15_f1,index.2,0.2_f1,index.3,0.25_f1,index.4,0.3_f1,...,index.5,0.45_f1,index.6,0.5_f1,index.7,0.55_f1,index.8,0.6_f1,index.9,0.65_f1
0,14,0.667414,95,0.663397,2,0.679607,2,0.691871,91,0.701280,...,11,0.696928,101,0.677585,182,0.647860,6,0.633589,6,0.631875
1,5,0.661678,2,0.663054,20,0.678487,20,0.690570,101,0.701062,...,101,0.689673,2,0.677565,2,0.645428,116,0.633365,116,0.630973
2,130,0.660284,5,0.662510,1,0.676641,91,0.689503,119,0.697440,...,2,0.685089,11,0.675588,11,0.640421,7,0.629865,7,0.630588
3,31,0.658536,14,0.661392,101,0.675418,191,0.688597,2,0.693262,...,146,0.680462,146,0.672976,146,0.639444,35,0.628475,35,0.627427
4,22,0.658046,11,0.659234,91,0.674256,101,0.688375,20,0.692221,...,91,0.676992,182,0.665622,92,0.636439,97,0.628441,97,0.627217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,162,0.507673,63,0.521804,63,0.538092,72,0.559610,81,0.583863,...,159,0.579931,166,0.575120,81,0.531796,73,0.420966,82,0.297636
221,81,0.507161,163,0.521516,82,0.537973,82,0.558810,89,0.582104,...,175,0.577142,175,0.573592,163,0.530269,63,0.420084,54,0.278128
222,108,0.505109,126,0.521250,72,0.537767,163,0.558578,147,0.580895,...,89,0.576553,163,0.571496,73,0.527349,82,0.419989,63,0.261450
223,36,0.504292,36,0.519006,163,0.534893,63,0.558242,150,0.579555,...,150,0.573114,150,0.569909,82,0.526971,72,0.417010,72,0.251579


In [30]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_important = X.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']
X_important_extended = diabetes_important.drop(columns = ['Outcome'])
Y_important_extended = diabetes_important['Outcome']

## (uses extended data?, number of input features) -> (inputs, target) to split
data_sets = {
    (False, 5): (X_important, Y),                                  ## cleaned data, important features
    (False, 6): (X_lasso, Y),                                      ## cleaned data, LASSO features
    (False, 8): (X, Y),                                            ## cleaned data, all features
    (True, 5): (X_important_extended, Y_important_extended),       ## extended data, important features
    (True, 6): (X_extended, Y_extended),                           ## extended data, LASSO features
}

## read Gradient Boosting data stored in s3 file
data_file_name = 'project_grad_data.csv'
results = read_boosting_data(data_file_name)

## total_loops column keeps the number of loops already done; the upper bound here is 1,
## so the loop body only runs while no complete loop has been stored yet
loops_done = results.at[0, 'total_loops']
for loop_number in range(loops_done, 1):

    ## Build Gradient Boosting models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]

        ## every value other than 'N' counts as extended data; a combination without
        ## a matching data set keeps the split of the previous combination
        chosen = data_sets.get((parameters['extended_data'] != 'N', parameters['input_layer']))
        if chosen is not None:
            inputs, target = chosen
            X_train, X_test, Y_train, Y_test = train_test_split(inputs, target, test_size = 0.2, stratify = target)

        ## Building model
        md_grad = GradientBoostingClassifier(max_depth = parameters['depth'],
                                             n_estimators = int(parameters['n_tree']),
                                             learning_rate = parameters['learning_rate'])
        md_grad.fit(X_train, Y_train)

        ## Predicting
        pred = md_grad.predict_proba(X_test)[:, 1]

        update_result_scores(pred, Y_test, results, combo_number)

    results['total_loops'] = loop_number + 1

    ## Writing data to s3
    write_data_to_s3(data_file_name, results)
    

In [24]:
## average f1 score of every parameter combination, ranked separately for each cut-off value
loops_run = results.at[0, 'total_loops']
score_to_check = 'f1'

## one (combination index, average score) frame per cut-off, best score first
ranked_scores = [
    pd.DataFrame(results[str(cut_off_value) + '_' + score_to_check].sort_values(ascending = [False]) / loops_run).reset_index()
    for cut_off_value in cut_off
]

## place the ranked frames side by side
all_f1_scores = pd.concat(ranked_scores, axis = 1)

all_f1_scores

Unnamed: 0,index,0.1_f1,index.1,0.15_f1,index.2,0.2_f1,index.3,0.25_f1,index.4,0.3_f1,...,index.5,0.45_f1,index.6,0.5_f1,index.7,0.55_f1,index.8,0.6_f1,index.9,0.65_f1
0,121,0.696496,121,0.696291,13,0.701872,38,0.709984,38,0.716312,...,10,0.679698,10,0.664976,124,0.662398,124,0.659273,124,0.657016
1,3,0.695587,13,0.696260,100,0.700979,100,0.707656,110,0.708775,...,38,0.678857,121,0.664694,121,0.660932,121,0.655444,121,0.646904
2,193,0.691465,3,0.694900,190,0.698487,190,0.706042,29,0.707123,...,41,0.677422,124,0.664333,22,0.656281,22,0.650791,105,0.645824
3,22,0.688533,193,0.692712,0,0.698058,10,0.701291,119,0.705143,...,121,0.671819,22,0.662650,25,0.656085,34,0.648508,34,0.644730
4,93,0.687928,22,0.691744,19,0.695663,29,0.699185,101,0.704957,...,128,0.671131,0,0.660982,103,0.654863,105,0.648442,133,0.643713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,101,0.495238,194,0.495238,140,0.495238,182,0.495238,95,0.495022,...,143,0.000000,98,0.000000,140,0.000000,95,0.000000,56,0.000000
221,98,0.495238,191,0.495238,143,0.495238,185,0.495238,182,0.494806,...,50,0.000000,92,0.000000,137,0.000000,92,0.000000,140,0.000000
222,95,0.495238,182,0.495238,149,0.495089,2,0.495238,137,0.494590,...,47,0.000000,2,0.000000,98,0.000000,56,0.000000,53,0.000000
223,94,0.495238,188,0.495238,17,0.494795,143,0.495238,92,0.494590,...,182,0.000000,185,0.000000,92,0.000000,53,0.000000,50,0.000000


## Reviewing

In [27]:
## lists of cut-off values to review scores
review_cut_off = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]

## This cell reviews the Random Forest scores (74 combinations in the displayed table).
## They are loaded explicitly: the global `results` holds whichever model was trained
## last, which in notebook order is Gradient Boosting, so relying on it shows the wrong
## model when the notebook is run top to bottom.
rf_results = read_rf_data('project_rf_data.csv')

loops_run = rf_results.at[0, 'total_loops']
score_to_check = 'f1'

## one (combination index, average score) frame per reviewed cut-off, best score first
ranked_scores = [
    pd.DataFrame(rf_results[str(cut_off_value) + '_' + score_to_check].sort_values(ascending = [False]) / loops_run).reset_index()
    for cut_off_value in review_cut_off
]

## place the ranked frames side by side
all_f1_scores = pd.concat(ranked_scores, axis = 1)

all_f1_scores

Unnamed: 0,index,0.15_f1,index.1,0.2_f1,index.2,0.25_f1,index.3,0.3_f1,index.4,0.35_f1,index.5,0.4_f1,index.6,0.45_f1
0,5,0.662890,40,0.693587,34,0.712829,9,0.717435,33,0.718897,33,0.699408,41,0.685717
1,14,0.662693,14,0.692045,40,0.709865,33,0.716882,9,0.715455,43,0.698612,8,0.684180
2,38,0.662306,34,0.690986,10,0.709686,30,0.716565,34,0.712065,44,0.698283,43,0.683426
3,41,0.661814,35,0.690911,4,0.708205,6,0.716149,43,0.710915,35,0.696720,11,0.682434
4,35,0.660997,5,0.690291,43,0.706970,34,0.715982,36,0.710799,8,0.696576,1,0.681671
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,69,0.590930,54,0.637729,51,0.666353,48,0.675689,59,0.669802,45,0.654076,15,0.643942
71,66,0.590632,51,0.636424,54,0.665004,47,0.674073,48,0.664087,57,0.653477,27,0.643756
72,63,0.589754,63,0.635688,60,0.660776,45,0.673322,51,0.660220,51,0.651387,45,0.640844
73,60,0.586559,60,0.634519,72,0.658719,51,0.673239,45,0.659051,48,0.647252,48,0.640204


In [29]:
## This cell reviews the AdaBoost scores (224 combinations in the displayed table).
## They are loaded explicitly: the global `results` holds whichever model was trained
## last, which in notebook order is Gradient Boosting, so relying on it shows the wrong
## model when the notebook is run top to bottom.
ada_results = read_boosting_data('project_ada_data.csv')

loops_run = ada_results.at[0, 'total_loops']
score_to_check = 'f1'

## one (combination index, average score) frame per reviewed cut-off, best score first
ranked_scores = [
    pd.DataFrame(ada_results[str(cut_off_value) + '_' + score_to_check].sort_values(ascending = [False]) / loops_run).reset_index()
    for cut_off_value in review_cut_off
]

## place the ranked frames side by side
all_f1_scores = pd.concat(ranked_scores, axis = 1)

all_f1_scores

Unnamed: 0,index,0.15_f1,index.1,0.2_f1,index.2,0.25_f1,index.3,0.3_f1,index.4,0.35_f1,index.5,0.4_f1,index.6,0.45_f1
0,2,0.664313,2,0.680542,2,0.692565,101,0.700598,101,0.700428,2,0.697495,11,0.695868
1,95,0.663461,20,0.680123,20,0.692317,91,0.699547,20,0.699998,101,0.696895,101,0.689650
2,5,0.661641,101,0.674285,191,0.689717,119,0.694876,119,0.695355,11,0.696591,2,0.685917
3,11,0.658757,191,0.673804,91,0.688486,20,0.694334,11,0.695164,20,0.689578,146,0.680667
4,14,0.658620,91,0.673186,101,0.687355,2,0.693929,2,0.695024,91,0.689179,91,0.677516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,63,0.521730,63,0.538076,72,0.560022,159,0.583179,197,0.583075,197,0.581164,175,0.579144
221,163,0.521589,72,0.537918,163,0.559082,147,0.582811,159,0.583070,159,0.580784,159,0.579118
222,126,0.521125,82,0.537744,82,0.558493,89,0.582057,147,0.582837,89,0.580095,89,0.576614
223,36,0.518925,163,0.535026,63,0.558276,150,0.579686,89,0.580178,147,0.579284,147,0.573861


In [31]:
## average f1 score of every parameter combination in `results`, ranked per reviewed cut-off
loops_run = results.at[0, 'total_loops']
score_to_check = 'f1'

## one (combination index, average score) frame per reviewed cut-off, best score first
ranked_scores = [
    pd.DataFrame(results[str(cut_off_value) + '_' + score_to_check].sort_values(ascending = [False]) / loops_run).reset_index()
    for cut_off_value in review_cut_off
]

## place the ranked frames side by side
all_f1_scores = pd.concat(ranked_scores, axis = 1)

all_f1_scores

Unnamed: 0,index,0.15_f1,index.1,0.2_f1,index.2,0.25_f1,index.3,0.3_f1,index.4,0.35_f1,index.5,0.4_f1,index.6,0.45_f1
0,13,0.696315,100,0.702242,38,0.709207,38,0.715875,38,0.711122,38,0.699149,10,0.679506
1,121,0.695485,13,0.701529,100,0.708912,110,0.709136,119,0.706657,128,0.691565,41,0.678773
2,3,0.695250,190,0.698185,190,0.705638,29,0.707160,29,0.704485,91,0.690437,38,0.677588
3,193,0.693615,0,0.697889,10,0.700782,119,0.706427,20,0.704047,119,0.690165,128,0.670452
4,22,0.691865,19,0.696784,29,0.699762,101,0.705564,91,0.703795,1,0.690119,109,0.670303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,188,0.495238,140,0.495238,182,0.495238,95,0.495025,42,0.527256,92,0.000000,92,0.000000
221,191,0.495238,143,0.495238,185,0.495238,182,0.494813,222,0.515967,137,0.000000,98,0.000000
222,194,0.495238,149,0.494947,2,0.495238,137,0.494600,87,0.512596,2,0.000000,2,0.000000
223,152,0.495238,17,0.494802,92,0.495238,92,0.494600,168,0.501903,182,0.000000,95,0.000000


## Reviewing

In [33]:
## lists of cut-off values to review scores
## (the displayed table covers only 0.3 and 0.35; the list was missing from this cell,
## which therefore depended on a value left in the kernel by another cell)
review_cut_off = [0.3, 0.35]

## The displayed table is the Random Forest one (74 combinations), so the Random Forest
## scores are loaded explicitly instead of using the global `results`, which holds
## whichever model was trained last.
rf_results = read_rf_data('project_rf_data.csv')

loops_run = rf_results.at[0, 'total_loops']
score_to_check = 'f1'

## one (combination index, average score) frame per reviewed cut-off, best score first
ranked_scores = [
    pd.DataFrame(rf_results[str(cut_off_value) + '_' + score_to_check].sort_values(ascending = [False]) / loops_run).reset_index()
    for cut_off_value in review_cut_off
]

## place the ranked frames side by side
all_f1_scores = pd.concat(ranked_scores, axis = 1)

all_f1_scores

Unnamed: 0,index,0.3_f1,index.1,0.35_f1
0,9,0.717435,33,0.718897
1,33,0.716882,9,0.715455
2,30,0.716565,34,0.712065
3,6,0.716149,43,0.710915
4,34,0.715982,36,0.710799
...,...,...,...,...
70,48,0.675689,59,0.669802
71,47,0.674073,48,0.664087
72,45,0.673322,51,0.660220
73,51,0.673239,45,0.659051


In [37]:
## Reviewing cut off from above dataframe
review_cut_off = [0.3, 0.35]

## read random forest data stored in s3 file
data_file_name = 'project_rf_data.csv'
results = read_data_from_s3(data_file_name)

## number of loops run; score columns are divided by it below
## (presumably the stored scores are sums over all loops -- confirm against the training cell)
loops_run = results.at[0, 'total_loops']

## hyper-parameter columns shown next to the scores
parameter_columns = ['extended_data', 'input_layer', 'n_tree', 'depth']

## for every cut-off: parameters + scores, rows ordered by that cut-off's f1 (best first)
review_blocks = []

for cut_off_value in review_cut_off:

    cut_off_prefix = str(cut_off_value) + '_'
    ranked_index = results[cut_off_prefix + 'f1'].sort_values(ascending = False).index
    score_columns = [cut_off_prefix + score_to_check for score_to_check in score_to_evaluate]

    parameter_df = results.loc[ranked_index, parameter_columns].reset_index(drop = True)
    score_df = results.loc[ranked_index, score_columns].reset_index(drop = True) / loops_run

    review_blocks.extend([parameter_df, score_df])

## single concat instead of growing review_df inside the loop
review_df = pd.concat(review_blocks, axis = 1)

review_df = review_df.rename(columns = {'extended_data':'ext', 'input_layer':'input'})
review_df

Unnamed: 0,ext,input,n_tree,depth,0.3_precision,0.3_recall,0.3_f1,ext.1,input.1,n_tree.1,depth.1,0.35_precision,0.35_recall,0.35_f1
0,Y,5,1500,3,0.620882,0.852692,0.717435,N,5,500,3,0.646330,0.814231,0.718897
1,N,5,500,3,0.617126,0.858846,0.716882,Y,5,1500,3,0.649484,0.801154,0.715455
2,N,5,100,3,0.620290,0.854231,0.716565,N,5,500,5,0.648800,0.793462,0.712065
3,Y,5,1000,3,0.620393,0.851923,0.716149,N,5,2000,5,0.640990,0.804231,0.710915
4,N,5,500,5,0.626628,0.839231,0.715982,N,5,1000,3,0.639442,0.806538,0.710799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,N,6,500,3,0.576162,0.822308,0.675689,N,6,2000,7,0.614098,0.743846,0.669802
71,N,6,100,7,0.576575,0.816923,0.674073,N,6,500,3,0.605812,0.741538,0.664087
72,N,6,100,3,0.577453,0.811923,0.673322,N,6,1000,3,0.594814,0.748846,0.660220
73,N,6,1000,3,0.568713,0.829615,0.673239,N,6,100,3,0.611573,0.720385,0.659051


## The best random forest was built on the cleaned dataset with 5 important features, using 500 trees, a depth of 3, and a cut-off value of 0.35

In [38]:
## Reviewing cut off from above dataframe
review_cut_off = [0.3, 0.35]

## read AdaBoost data stored in s3 file
data_file_name = 'project_ada_data.csv'
results = read_data_from_s3(data_file_name)

## number of loops run; score columns are divided by it below
## (presumably the stored scores are sums over all loops -- confirm against the training cell)
loops_run = results.at[0, 'total_loops']

## hyper-parameter columns shown next to the scores
parameter_columns = ['extended_data', 'input_layer', 'n_tree', 'depth', 'learning_rate']

## for every cut-off: parameters + scores, rows ordered by that cut-off's f1 (best first)
review_blocks = []

for cut_off_value in review_cut_off:

    cut_off_prefix = str(cut_off_value) + '_'
    ranked_index = results[cut_off_prefix + 'f1'].sort_values(ascending = False).index
    score_columns = [cut_off_prefix + score_to_check for score_to_check in score_to_evaluate]

    parameter_df = results.loc[ranked_index, parameter_columns].reset_index(drop = True)
    score_df = results.loc[ranked_index, score_columns].reset_index(drop = True) / loops_run

    review_blocks.extend([parameter_df, score_df])

## single concat instead of growing review_df inside the loop
review_df = pd.concat(review_blocks, axis = 1)

review_df = review_df.rename(columns = {'extended_data':'ext', 'input_layer':'input'})
review_df

Unnamed: 0,ext,input,n_tree,depth,learning_rate,0.3_precision,0.3_recall,0.3_f1,ext.1,input.1,n_tree.1,depth.1,learning_rate.1,0.35_precision,0.35_recall,0.35_f1
0,N,5,500,3,0.001,0.643165,0.775148,0.700620,N,5,500,3,0.001,0.660379,0.751479,0.700008
1,N,5,100,3,0.010,0.632191,0.789941,0.698202,Y,5,1000,3,0.001,0.648391,0.764053,0.698271
2,N,5,1500,3,0.001,0.610282,0.817308,0.696401,N,5,1500,3,0.001,0.637234,0.776627,0.696976
3,Y,5,100,3,0.001,0.641686,0.767012,0.695694,Y,5,100,3,0.001,0.646286,0.758136,0.695446
4,Y,5,1000,3,0.001,0.617120,0.798077,0.693161,Y,5,500,3,0.001,0.641541,0.765533,0.694845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,Y,6,500,7,0.010,0.609061,0.564637,0.583187,Y,6,500,7,0.010,0.611666,0.561432,0.582586
221,N,6,1000,7,0.100,0.594570,0.576923,0.582266,N,8,100,7,0.100,0.617272,0.559172,0.582183
222,Y,6,2000,7,0.001,0.610806,0.560363,0.580632,N,6,1000,7,0.100,0.598362,0.573225,0.582160
223,Y,6,2000,3,0.010,0.420851,0.938034,0.580148,Y,6,2000,7,0.001,0.612509,0.555556,0.578789


## The best AdaBoost model was built on the cleaned dataset with 5 important features, using 500 trees, a depth of 3, a learning rate of 0.001, and a cut-off value of 0.3 or 0.35

In [39]:
## Reviewing cut off from above dataframe
review_cut_off = [0.3, 0.35]

## read gradient boosting data stored in s3 file
data_file_name = 'project_grad_data.csv'
results = read_data_from_s3(data_file_name)

## number of loops run; score columns are divided by it below
## (presumably the stored scores are sums over all loops -- confirm against the training cell)
loops_run = results.at[0, 'total_loops']

## hyper-parameter columns shown next to the scores
parameter_columns = ['extended_data', 'input_layer', 'n_tree', 'depth', 'learning_rate']

## for every cut-off: parameters + scores, rows ordered by that cut-off's f1 (best first)
review_blocks = []

for cut_off_value in review_cut_off:

    cut_off_prefix = str(cut_off_value) + '_'
    ranked_index = results[cut_off_prefix + 'f1'].sort_values(ascending = False).index
    score_columns = [cut_off_prefix + score_to_check for score_to_check in score_to_evaluate]

    parameter_df = results.loc[ranked_index, parameter_columns].reset_index(drop = True)
    score_df = results.loc[ranked_index, score_columns].reset_index(drop = True) / loops_run

    review_blocks.extend([parameter_df, score_df])

## single concat instead of growing review_df inside the loop
review_df = pd.concat(review_blocks, axis = 1)

review_df = review_df.rename(columns = {'extended_data':'ext', 'input_layer':'input'})
review_df

Unnamed: 0,ext,input,n_tree,depth,learning_rate,0.3_precision,0.3_recall,0.3_f1,ext.1,input.1,n_tree.1,depth.1,learning_rate.1,0.35_precision,0.35_recall,0.35_f1
0,Y,5,2000,3,0.001,0.646419,0.804348,0.714625,Y,5,2000,3,0.001,0.668795,0.763099,0.709873
1,N,5,1000,3,0.001,0.627716,0.818841,0.708224,N,5,1500,3,0.001,0.658537,0.767559,0.706732
2,Y,5,1500,3,0.001,0.627265,0.815496,0.707264,Y,5,1500,3,0.001,0.650660,0.774247,0.704815
3,N,5,1500,3,0.001,0.628786,0.810479,0.706007,Y,5,1000,3,0.001,0.652617,0.769788,0.704234
4,N,5,500,3,0.001,0.622576,0.821628,0.705440,N,5,100,3,0.010,0.646607,0.777592,0.704049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,N,5,100,5,0.001,0.328989,0.999443,0.495028,Y,5,2000,7,0.100,0.706247,0.427536,0.527047
221,N,8,100,3,0.001,0.328865,0.998885,0.494819,N,8,2000,7,0.100,0.697395,0.418060,0.516578
222,N,6,100,3,0.001,0.328740,0.998328,0.494609,Y,6,2000,7,0.100,0.671409,0.421095,0.513908
223,N,5,100,3,0.001,0.328740,0.998328,0.494609,N,6,1500,7,0.100,0.666148,0.412486,0.503647


## The best Gradient Boosting model was built on the extended dataset with 5 important features, using 2000 trees, a learning rate of 0.001, a depth of 3, and a cut-off value of 0.3 or 0.35
## The second-best Gradient Boosting model was built on the cleaned dataset with 5 important features, using 1000 or 1500 trees, a learning rate of 0.001, a depth of 3, and a cut-off value of 0.3 or 0.35