In [1]:
pip install tensorflow

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
## 6.a
import boto3, botocore
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score
from itertools import product

## connect to the s3 bucket holding the project files
s3 = boto3.resource('s3')
bucket = s3.Bucket('danhtran358-data-445-bucket')

## cleaned data set: take the body of the csv object and parse it into a data-frame
bucket_object = bucket.Object('project_cleaned_data.csv')
diabetes_cleaned = pd.read_csv(bucket_object.get()['Body'])

## display the loaded data-frame
diabetes_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,1
388,1,128,88,39,110,36.5,1.057,37,1
389,2,88,58,26,16,28.4,0.766,22,0
390,10,101,76,48,180,32.9,0.171,63,0


In [2]:
## extended data set (file name says it was produced after LASSO): parse the csv object into a data-frame
bucket_object = bucket.Object('project_cleaned_data_extended_after_LASSO.csv')
diabetes_extended = pd.read_csv(bucket_object.get()['Body'])

## display the loaded data-frame
diabetes_extended

Unnamed: 0,Pregnancies,Glucose,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,35,33.6,0.627,50,1
1,1,85,29,26.6,0.351,31,0
2,1,89,23,28.1,0.167,21,0
3,0,137,35,43.1,2.288,33,1
4,3,78,32,31.0,0.248,26,1
...,...,...,...,...,...,...,...
529,9,170,31,44.0,0.403,43,1
530,10,101,48,32.9,0.171,63,0
531,2,122,27,36.8,0.340,27,0
532,5,121,23,26.2,0.245,30,0


In [3]:
## Use dataframes to store parameters to build models and store total scores
def expand_grid(dictionary):
    """Build a data-frame holding every combination of the given values.

    Parameters:
        dictionary: maps a column name to the list of values that column can take

    Returns:
        data-frame with one column per key and one row per element of the
        cartesian product of the value lists (in itertools.product order)
    """
    column_names = list(dictionary)
    combinations = list(product(*dictionary.values()))
    return pd.DataFrame(combinations, columns = column_names)

## parameter grid: every key becomes a column of the results data-frame
dictionary = {
    'extended_data': ['Y', 'N'],       ## 'Y' = train on the extended data set, 'N' = cleaned data set
    'input_layer': [6, 8],             ## number of input features given to the first layer
    'mid_layer_1': [2, 3, 4],          ## units in the first mid layer
    'mid_layer_2': [2, 3, 4],          ## units in the second mid layer (only used by the mlp2_* models)
    'total_loops': [0],                ## number of loops already done
    'mlp': [
        'mlp1_tanh',
        'mlp1_relu',
        'mlp2_tanh',
        'mlp2_relu',
        'mlp2_tanh_relu',
        'mlp2_relu_tanh',
    ],
}

## cut-off values used to turn predicted probabilities into labels
cut_off = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

## types of score used to evaluate the models
score_to_evaluate = ['precision', 'recall', 'f1']

In [4]:
## function to write data_frame to a csv file object in the S3 bucket
def write_data_to_s3(file_name, data_frame):
    """Store a data-frame as a csv object in the s3 bucket.

    Parameters:
        file_name: key of the object inside the bucket
        data_frame: data-frame to serialise; the index is not written

    Any object already stored under file_name is replaced by the put call.
    """
    ## serialise the data-frame to csv text, leaving the index out
    csv_text = data_frame.to_csv(index=False)

    ## upload the csv text as the body of the object named file_name
    bucket.Object(file_name).put(Body = csv_text)
    

## function to read the MLP results stored in an s3 csv file into a dataframe
def read_data_from_s3(file_name):
    """Read the results csv from the s3 bucket, creating it first if it is missing.

    Parameters:
        file_name: key of the csv object inside the bucket

    Returns:
        data-frame parsed from the csv object. When the object did not exist,
        a fresh grid of parameter combinations with all score columns set to
        0.0 is written to s3 and then read back.

    Raises:
        botocore.exceptions.ClientError: when loading the object fails for any
            reason other than a 404 (previously such errors were swallowed and
            the function returned None).
    """
    ## file object in s3 bucket
    data_file = bucket.Object(file_name)

    try:
        ## load() raises ClientError when the object cannot be loaded
        data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] != "404":
            ## anything other than "not found" is a real failure, do not hide it
            raise

        ## file does not exist yet, create new file
        results = expand_grid(dictionary)

        ## will not work on extended data with 8 feature columns
        results = results.drop(results[(results['extended_data'] == 'Y') & (results['input_layer'] == 8)].index)

        ## create columns for all types of cut-off values and scores
        for cut_off_value in cut_off:
            for score_name in score_to_evaluate:
                results[str(cut_off_value) + '_' + score_name] = 0.0

        ## write brand new and empty file to s3
        write_data_to_s3(file_name, results)

    ## return the dataframe stored in s3 (already there or newly created);
    ## reading it back also gives the new file a fresh 0..n-1 index
    return pd.read_csv(data_file.get().get('Body'))

In [5]:
def mlp1_tanh_predict(X_train, X_test, Y_train, input_layer, mid_layer_1):
    """Fit a perceptron with one tanh mid layer and predict on the test inputs.

    Parameters:
        X_train: inputs used to fit the network
        X_test: inputs to predict on
        Y_train: training labels, one-hot encoded with to_categorical before fitting
        input_layer: input dimension given to the first Dense layer
        mid_layer_1: number of units in the tanh mid layer

    Returns:
        column 1 of the softmax output for every row of X_test
        (the column to_categorical assigns to label 1)
    """
    ## Network: tanh mid layer followed by a 2-unit softmax output layer
    network = tf.keras.models.Sequential()
    network.add(tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'tanh'))
    network.add(tf.keras.layers.Dense(2, activation = 'softmax'))

    ## Train silently with SGD on the one-hot encoded labels
    network.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    one_hot_labels = tf.keras.utils.to_categorical(Y_train)
    network.fit(X_train, one_hot_labels, epochs = 100, batch_size = 500, verbose = 0)

    ## Keep only the second output column
    class_probabilities = network.predict(X_test)
    return class_probabilities[:,1]
    

def mlp1_relu_predict(X_train, X_test, Y_train, input_layer, mid_layer_1):
    """Fit a perceptron with one relu mid layer and predict on the test inputs.

    Parameters:
        X_train: inputs used to fit the network
        X_test: inputs to predict on
        Y_train: training labels, one-hot encoded with to_categorical before fitting
        input_layer: input dimension given to the first Dense layer
        mid_layer_1: number of units in the relu mid layer

    Returns:
        column 1 of the softmax output for every row of X_test
        (the column to_categorical assigns to label 1)
    """
    ## Network: relu mid layer followed by a 2-unit softmax output layer
    network = tf.keras.models.Sequential()
    network.add(tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'relu'))
    network.add(tf.keras.layers.Dense(2, activation = 'softmax'))

    ## Train silently with SGD on the one-hot encoded labels
    network.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    one_hot_labels = tf.keras.utils.to_categorical(Y_train)
    network.fit(X_train, one_hot_labels, epochs = 100, batch_size = 500, verbose = 0)

    ## Keep only the second output column
    class_probabilities = network.predict(X_test)
    return class_probabilities[:,1]


def mlp2_tanh_predict(X_train, X_test, Y_train, input_layer, mid_layer_1, mid_layer_2):
    """Fit a perceptron with two tanh mid layers and predict on the test inputs.

    Parameters:
        X_train: inputs used to fit the network
        X_test: inputs to predict on
        Y_train: training labels, one-hot encoded with to_categorical before fitting
        input_layer: input dimension given to the first Dense layer
        mid_layer_1: number of units in the first (tanh) mid layer
        mid_layer_2: number of units in the second (tanh) mid layer

    Returns:
        column 1 of the softmax output for every row of X_test
        (the column to_categorical assigns to label 1)
    """
    ## Network: tanh -> tanh -> 2-unit softmax output layer
    network = tf.keras.models.Sequential()
    network.add(tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'tanh'))
    network.add(tf.keras.layers.Dense(mid_layer_2, activation = 'tanh'))
    network.add(tf.keras.layers.Dense(2, activation = 'softmax'))

    ## Train silently with SGD on the one-hot encoded labels
    network.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    one_hot_labels = tf.keras.utils.to_categorical(Y_train)
    network.fit(X_train, one_hot_labels, epochs = 100, batch_size = 500, verbose = 0)

    ## Keep only the second output column
    class_probabilities = network.predict(X_test)
    return class_probabilities[:,1]
    

def mlp2_relu_predict(X_train, X_test, Y_train, input_layer, mid_layer_1, mid_layer_2):
    """Fit a perceptron with two relu mid layers and predict on the test inputs.

    Parameters:
        X_train: inputs used to fit the network
        X_test: inputs to predict on
        Y_train: training labels, one-hot encoded with to_categorical before fitting
        input_layer: input dimension given to the first Dense layer
        mid_layer_1: number of units in the first (relu) mid layer
        mid_layer_2: number of units in the second (relu) mid layer

    Returns:
        column 1 of the softmax output for every row of X_test
        (the column to_categorical assigns to label 1)
    """
    ## Network: relu -> relu -> 2-unit softmax output layer
    network = tf.keras.models.Sequential()
    network.add(tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'relu'))
    network.add(tf.keras.layers.Dense(mid_layer_2, activation = 'relu'))
    network.add(tf.keras.layers.Dense(2, activation = 'softmax'))

    ## Train silently with SGD on the one-hot encoded labels
    network.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    one_hot_labels = tf.keras.utils.to_categorical(Y_train)
    network.fit(X_train, one_hot_labels, epochs = 100, batch_size = 500, verbose = 0)

    ## Keep only the second output column
    class_probabilities = network.predict(X_test)
    return class_probabilities[:,1]


def mlp2_tanh_relu_predict(X_train, X_test, Y_train, input_layer, mid_layer_1, mid_layer_2):
    """Fit a perceptron with a tanh then a relu mid layer and predict on the test inputs.

    Parameters:
        X_train: inputs used to fit the network
        X_test: inputs to predict on
        Y_train: training labels, one-hot encoded with to_categorical before fitting
        input_layer: input dimension given to the first Dense layer
        mid_layer_1: number of units in the first (tanh) mid layer
        mid_layer_2: number of units in the second (relu) mid layer

    Returns:
        column 1 of the softmax output for every row of X_test
        (the column to_categorical assigns to label 1)
    """
    ## Network: tanh -> relu -> 2-unit softmax output layer
    network = tf.keras.models.Sequential()
    network.add(tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'tanh'))
    network.add(tf.keras.layers.Dense(mid_layer_2, activation = 'relu'))
    network.add(tf.keras.layers.Dense(2, activation = 'softmax'))

    ## Train silently with SGD on the one-hot encoded labels
    network.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    one_hot_labels = tf.keras.utils.to_categorical(Y_train)
    network.fit(X_train, one_hot_labels, epochs = 100, batch_size = 500, verbose = 0)

    ## Keep only the second output column
    class_probabilities = network.predict(X_test)
    return class_probabilities[:,1]
    

def mlp2_relu_tanh_predict(X_train, X_test, Y_train, input_layer, mid_layer_1, mid_layer_2):
    """Fit a perceptron with a relu then a tanh mid layer and predict on the test inputs.

    Parameters:
        X_train: inputs used to fit the network
        X_test: inputs to predict on
        Y_train: training labels, one-hot encoded with to_categorical before fitting
        input_layer: input dimension given to the first Dense layer
        mid_layer_1: number of units in the first (relu) mid layer
        mid_layer_2: number of units in the second (tanh) mid layer

    Returns:
        column 1 of the softmax output for every row of X_test
        (the column to_categorical assigns to label 1)
    """
    ## Network: relu -> tanh -> 2-unit softmax output layer
    network = tf.keras.models.Sequential()
    network.add(tf.keras.layers.Dense(mid_layer_1, input_dim = input_layer, activation = 'relu'))
    network.add(tf.keras.layers.Dense(mid_layer_2, activation = 'tanh'))
    network.add(tf.keras.layers.Dense(2, activation = 'softmax'))

    ## Train silently with SGD on the one-hot encoded labels
    network.compile(optimizer = 'sgd', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    one_hot_labels = tf.keras.utils.to_categorical(Y_train)
    network.fit(X_train, one_hot_labels, epochs = 100, batch_size = 500, verbose = 0)

    ## Keep only the second output column
    class_probabilities = network.predict(X_test)
    return class_probabilities[:,1]

In [6]:
## build the appropriate model and update the result dataset after each model is built
def update_results(X_train, X_test, Y_train, Y_test, results, combo_number):
    """Build the model described by one row of results and add its scores to that row.

    Parameters:
        X_train, X_test: inputs for fitting and for prediction
        Y_train, Y_test: labels for fitting and for scoring
        results: data-frame of parameter combinations and accumulated scores;
            updated in place by update_result_scores
        combo_number: index label of the row in results to build and update

    A row whose 'mlp' value is not one of the six known model names is left untouched.
    """
    row = results.loc[combo_number]
    mlp_name = row['mlp']

    ## models with a single mid layer
    one_mid_layer = {
        'mlp1_tanh': mlp1_tanh_predict,
        'mlp1_relu': mlp1_relu_predict,
    }
    ## models with two mid layers
    two_mid_layers = {
        'mlp2_tanh': mlp2_tanh_predict,
        'mlp2_relu': mlp2_relu_predict,
        'mlp2_tanh_relu': mlp2_tanh_relu_predict,
        'mlp2_relu_tanh': mlp2_relu_tanh_predict,
    }

    if mlp_name in one_mid_layer:
        pred = one_mid_layer[mlp_name](X_train, X_test, Y_train, row['input_layer'], row['mid_layer_1'])
    elif mlp_name in two_mid_layers:
        pred = two_mid_layers[mlp_name](X_train, X_test, Y_train, row['input_layer'], row['mid_layer_1'], row['mid_layer_2'])
    else:
        ## unknown model name: nothing to build, nothing to score
        return

    update_result_scores(pred, Y_test, results, combo_number)

## update the scores in result dataset after each model is built
def update_result_scores(pred, Y_test, results, combo_number):
    """Add the scores of one model to its row of the results data-frame.

    For every cut-off value the predictions are turned into 0/1 labels, then
    each score listed in score_to_evaluate is computed and added to the column
    named '<cut-off>_<score>' of row combo_number.

    Parameters:
        pred: predicted values compared against each cut-off
        Y_test: true labels to score against
        results: data-frame holding the accumulated scores; modified in place
        combo_number: index label of the row in results to update
    """
    ## metric function for every supported score name
    scorers = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}

    for current_cut_off in cut_off:

        ## classify labels: below the cut-off -> 0, otherwise 1
        pred_labels = np.where(pred < current_cut_off, 0, 1)

        for current_score in score_to_evaluate:
            scorer = scorers.get(current_score)
            if scorer is None:
                ## score names without a metric are ignored
                continue

            ## update the appropriate score; zero_division = 0 is passed to every
            ## metric so an undefined score counts as 0 without a warning
            score_column = str(current_cut_off) + '_' + current_score
            results.at[combo_number, score_column] = results.at[combo_number, score_column] + scorer(Y_test, pred_labels, zero_division = 0)

In [21]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']
X_lasso = X.drop(columns = ['BloodPressure', 'Insulin'])
X_extended = diabetes_extended.drop(columns = ['Outcome'])
Y_extended = diabetes_extended['Outcome']

## read MLP data stored in s3 file
data_file_name = 'project_mlp_result.csv'
results = read_data_from_s3(data_file_name)

scaler = MinMaxScaler()

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(results.at[1, 'total_loops'], 100):

    ## Build MLP models for each parameter combination and store scores
    for combo_number in range(results.shape[0]):
        parameters = results.loc[combo_number]

        if parameters['extended_data'] == 'N':

            if parameters['input_layer'] == 6:
                ## cleaned data with reduced number of features
                X_train, X_test, Y_train, Y_test = train_test_split(X_lasso, Y, test_size = 0.2, stratify = Y)

            else:
                ## cleaned data with all features
                X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

        elif parameters['input_layer'] == 6:
            ## extended data with reduced number of features
            X_train, X_test, Y_train, Y_test = train_test_split(X_extended, Y_extended, test_size = 0.2, stratify = Y_extended)

        else:
            ## no data set is split for this combination; skip it rather than
            ## train on the split left over from the previous combination
            continue

        ## scale input variables to 0-1 scale: the scaler is fitted on the training
        ## split only and the same scaling is then applied to the test split
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        update_results(X_train, X_test, Y_train, Y_test, results, combo_number)

        ## release the global state Keras keeps for every model built so far
        tf.keras.backend.clear_session()

    results['total_loops'] = loop_number + 1
    ## Writing data to s3
    write_data_to_s3(data_file_name, results)


In [22]:
## remove the combinations of extended data with 8 inputs
unsupported = (results['extended_data'] == 'Y') & (results['input_layer'] == 8)
results = results.drop(index = results.loc[unsupported].index)

In [23]:
## number of loops recorded in the results, used to divide the accumulated scores
loops_run = results.at[1, 'total_loops']

## one ranking per cut-off: score divided by loops_run, sorted from best to worst,
## with the original row label kept in an 'index' column
f1_columns = ['0.2_f1', '0.25_f1', '0.3_f1', '0.35_f1', '0.4_f1', '0.45_f1', '0.5_f1']
f1_rankings = [
    pd.DataFrame(results[f1_column].sort_values(ascending = False) / loops_run).reset_index()
    for f1_column in f1_columns
]
score_2_f1, score_25_f1, score_3_f1, score_35_f1, score_4_f1, score_45_f1, score_5_f1 = f1_rankings

## put the rankings side by side
all_f1_scores = pd.concat(f1_rankings, axis = 1)
all_f1_scores

Unnamed: 0,index,0.2_f1,index.1,0.25_f1,index.2,0.3_f1,index.3,0.35_f1,index.4,0.4_f1,index.5,0.45_f1,index.6,0.5_f1
0,53,0.503686,16,0.502438,39,0.497604,39,0.484405,3,0.442011,0,0.307056,102,0.186533
1,20,0.503683,4,0.502213,3,0.493571,57,0.483656,39,0.434485,24,0.297437,0,0.185379
2,16,0.503532,39,0.501544,21,0.493405,3,0.480046,31,0.433801,102,0.290820,24,0.180145
3,14,0.503526,20,0.501452,22,0.492306,58,0.476445,57,0.433126,84,0.290424,18,0.176986
4,26,0.503494,53,0.501002,45,0.492275,111,0.476319,111,0.432711,103,0.290160,156,0.173702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,120,0.489312,73,0.475981,72,0.451142,158,0.404111,107,0.290950,17,0.108869,5,0.041271
158,78,0.489058,84,0.475690,84,0.450342,48,0.401903,71,0.285918,15,0.107417,111,0.040611
159,84,0.488528,114,0.475304,108,0.445780,122,0.401125,122,0.284165,5,0.098688,153,0.039448
160,91,0.487887,90,0.474857,132,0.444659,132,0.391399,125,0.280093,11,0.087084,81,0.037979


## None of these MLP models performs better than the Logistic models