In [3]:
import pandas as pd
import numpy as np
import boto3
import botocore
from datetime import datetime
from sklearn.model_selection import train_test_split

In [6]:
# Fetch the patient CSV from the S3 bucket into the working directory.
BUCKET_NAME = 'd1namo'
KEY = 'patient2_combined_data.csv'

s3 = boto3.resource('s3')

try:
    source_bucket = s3.Bucket(BUCKET_NAME)
    source_bucket.download_file(KEY, 'patient2_combined_data.csv')
except botocore.exceptions.ClientError as err:
    # Only a missing object is reported and tolerated; every other client
    # error is re-raised unchanged.
    error_code = err.response['Error']['Code']
    if error_code != "404":
        raise
    print("The object does not exist.")

In [5]:
# Load the downloaded CSV; the file's first column is used as the row index.
df = pd.read_csv(
    'patient2_combined_data.csv',
    index_col=[0],
)

In [6]:
# Data preprocessing: remove the columns listed below. Whatever remains
# (the feature columns plus the `hypoglycemia` label) feeds the train/test split.
columns_to_drop = [
    'join', 'date_join', 'Time', 'ttime', 'glucose',
    'type', 'time', 'time.1', 'date',
]
df_ready = df.drop(columns=columns_to_drop)
df_ready.head()

Unnamed: 0,hr,br,skintemp,posture,activity,peakaccel,batteryvolts,batterylevel,bramplitude,brnoise,...,devicetemp,statusinfo,linkquality,rssi,txpower,coretemp,auxadc1,auxadc2,auxadc3,hypoglycemia
0,82,26.25,-3276.8,-5,0.11,0.21,3.935,55,48.0,65535.0,...,30.5,512,255,-128,-128,37.5,414,419,498,0
1,86,17.5,-3276.8,-7,0.055,0.13,3.932,55,46.0,65535.0,...,30.5,528,255,-128,-128,37.5,414,419,498,1
2,85,24.15,-3276.8,-42,0.03,0.075,3.931,55,42.5,65535.0,...,30.5,512,255,-128,-128,37.4,414,419,498,1
3,90,23.8,-3276.8,-53,0.01,0.03,3.93,55,28.0,65535.0,...,30.8,512,255,-128,-128,37.4,414,419,498,1
4,60,20.75,-3276.8,-18,0.02,0.06,3.929,54,39.0,65535.0,...,31.1,528,255,-128,-128,37.4,414,419,498,1


In [7]:
# Split the dataset into train and test sets.
# X holds every column except the label; y is the `hypoglycemia` label column.
X = df_ready.drop('hypoglycemia', axis=1)
y = df_ready['hypoglycemia']

# 80/20 split, stratified on the label so both partitions keep the same class
# ratio; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)

# Show the Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
# Fixed label: this line reports y_test, so it is the *testing* label shape.
print('Shape of testing label:', y_test.shape)

Shape of training feature: (387, 34)
Shape of testing feature: (97, 34)
Shape of training label: (387,)
Shape of training label: (97,)


In [8]:
def evaluate_model(model, x_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict`` and ``predict_proba``.
    x_test : array-like
        Features to predict on.
    y_test : array-like
        True labels for ``x_test``.

    Returns
    -------
    dict
        Keys ``'acc'``, ``'prec'``, ``'rec'``, ``'f1'``, ``'kappa'`` (scalar
        scores computed with scikit-learn's default settings), ``'fpr'`` and
        ``'tpr'`` (ROC curve coordinates), ``'auc'`` (area under the ROC
        curve) and ``'cm'`` (confusion matrix).

    Notes
    -----
    The ROC/AUC values use the second column of ``predict_proba``.
    NOTE(review): this presumably assumes a binary target whose positive
    class is the second class -- confirm before reusing on other targets.
    """
    from sklearn import metrics

    # Hard class predictions on the test features.
    predicted = model.predict(x_test)

    # Label-based scores: accuracy, precision, recall, f1-score and kappa.
    label_scorers = (
        ('acc', metrics.accuracy_score),
        ('prec', metrics.precision_score),
        ('rec', metrics.recall_score),
        ('f1', metrics.f1_score),
        ('kappa', metrics.cohen_kappa_score),
    )
    report = {key: scorer(y_test, predicted) for key, scorer in label_scorers}

    # Probability-based scores: ROC curve and the area under it (AUC).
    positive_proba = model.predict_proba(x_test)[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, positive_proba)
    report['fpr'] = false_pos_rate
    report['tpr'] = true_pos_rate
    report['auc'] = metrics.roc_auc_score(y_test, positive_proba)

    # Confusion matrix of true vs. predicted labels.
    report['cm'] = metrics.confusion_matrix(y_test, predicted)

    return report

In [26]:
from sklearn.ensemble import RandomForestClassifier

# Building Random Forest model (seeded so the fitted forest is reproducible)
rf = RandomForestClassifier(random_state=311)
rf.fit(X_train, y_train)

# Evaluate Model. This is done once: the original cell repeated this call
# after the prints, which only recomputed the same result.
rf_eval = evaluate_model(rf, X_test, y_test)

# Print result
print('Accuracy:', rf_eval['acc'])
print('Precision:', rf_eval['prec'])
print('Recall:', rf_eval['rec'])
print('F1 Score:', rf_eval['f1'])
print('Cohens Kappa Score:', rf_eval['kappa'])
print('Area Under Curve:', rf_eval['auc'])
print('Confusion Matrix:\n', rf_eval['cm'])


Accuracy: 0.8865979381443299
Precision: 0.8
Recall: 0.6
F1 Score: 0.6857142857142857
Cohens Kappa Score: 0.6182468694096601
Area Under Curve: 0.9616883116883117
Confusion Matrix:
 [[74  3]
 [ 8 12]]


In [36]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier that votes over 10 neighbours.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

# Score it on the held-out split.
knn_eval = evaluate_model(knn, X_test, y_test)

# Report each metric as "<label> <value>", in a fixed order.
knn_report_rows = [
    ('Accuracy:', 'acc'),
    ('Precision:', 'prec'),
    ('Recall:', 'rec'),
    ('F1 Score:', 'f1'),
    ('Cohens Kappa Score:', 'kappa'),
    ('Area Under Curve:', 'auc'),
    ('Confusion Matrix:\n', 'cm'),
]
for knn_label, knn_key in knn_report_rows:
    print(knn_label, knn_eval[knn_key])

Accuracy: 0.845360824742268
Precision: 0.8571428571428571
Recall: 0.3
F1 Score: 0.4444444444444444
Cohens Kappa Score: 0.37793929029499795
Area Under Curve: 0.6461038961038961
Confusion Matrix:
 [[76  1]
 [14  6]]


In [27]:
from sklearn import tree

# Fit a decision tree (seeded so the fitted tree is reproducible).
dtc = tree.DecisionTreeClassifier(random_state=311)
dtc.fit(X_train, y_train)

# Score it on the held-out split.
dtc_eval = evaluate_model(dtc, X_test, y_test)

# Report each metric as "<label> <value>"; dict order fixes the print order.
dtc_report_labels = {
    'acc': 'Accuracy:',
    'prec': 'Precision:',
    'rec': 'Recall:',
    'f1': 'F1 Score:',
    'kappa': 'Cohens Kappa Score:',
    'auc': 'Area Under Curve:',
    'cm': 'Confusion Matrix:\n',
}
for dtc_key, dtc_label in dtc_report_labels.items():
    print(dtc_label, dtc_eval[dtc_key])

Accuracy: 0.9175257731958762
Precision: 0.8
Recall: 0.8
F1 Score: 0.8000000000000002
Cohens Kappa Score: 0.7480519480519481
Area Under Curve: 0.874025974025974
Confusion Matrix:
 [[73  4]
 [ 4 16]]


In [31]:
# Pull the four headline metrics out of each model's evaluation dict, always
# in the same order: accuracy, precision, recall, f1.
headline_keys = ('acc', 'prec', 'rec', 'f1')
dtc_score = [dtc_eval[key] for key in headline_keys]
rf_score = [rf_eval[key] for key in headline_keys]
knn_score = [knn_eval[key] for key in headline_keys]

In [32]:
# Build the comparison table: one row per model, one column per metric.
df_index = ['decision_tree', 'random_forest', 'KNN']
metric_columns = ['accuracy', 'precision', 'recall', 'f1_score']
score_rows = [dtc_score, rf_score, knn_score]  # same order as df_index

metrics_df = pd.DataFrame(
    data=score_rows,
    index=df_index,
    columns=metric_columns,
)
metrics_df

Unnamed: 0,accuracy,precision,recall,f1_score
decision_tree,0.917526,0.8,0.8,0.8
random_forest,0.886598,0.8,0.6,0.685714
KNN,0.845361,0.857143,0.3,0.444444


In [33]:
# Timestamp that tags this run's outputs (12-hour clock plus AM/PM marker).
# The time fields are joined with underscores rather than colons because the
# value is embedded in a file name, and ':' is not allowed in file names on
# Windows.
date = datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")

# Save the model comparison table next to the notebook.
metrics_df.to_csv('model_metrics_{0}.csv'.format(date))

In [34]:
# !pip install psycopg2-binary
import getpass
import os
from urllib.parse import quote

# Import only the name that is used instead of `from sqlalchemy import *`,
# which floods the notebook namespace and can shadow existing variables.
from sqlalchemy import create_engine
import psycopg2  # not referenced directly; kept from the original cell (PostgreSQL driver)


user = 'postgres'
# Never store the password in the notebook: read it from the PGPASSWORD
# environment variable, or prompt for it interactively when that is unset.
password = os.environ.get('PGPASSWORD') or getpass.getpass('PostgreSQL password: ')
host = 'datawarehouse1.cqjhpkzziyj1.us-east-1.rds.amazonaws.com'
port = 5432
database = 'postgres'

# Percent-encode the credentials so characters such as '@', ':' or '/' in the
# password cannot be misread as URL delimiters.
engine = create_engine(
    "postgresql://{0}:{1}@{2}:{3}/{4}".format(
        quote(user, safe=''), quote(password, safe=''), host, port, database
    )
)

# Write the comparison table to its own timestamped table in the
# `model_metrics` schema; an existing table with the same name is replaced.
metrics_df.to_sql(con=engine, schema='model_metrics', name='model_metrics_{0}'.format(date), if_exists='replace')

