In [None]:
import os
import io
import json
import xlrd
import boto3
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
bucket_name, object_key = "machinelearningsamplesyvb", "logisticregression_sampledata.xlsx"
# Seed NumPy's global RNG so this notebook's output is identical at every run.
np.random.seed(42)

# Download the workbook from S3 into memory and parse its 'data' sheet.
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read()
# `encoding` is not a read_excel parameter (workbooks are binary); recent pandas
# versions raise TypeError when it is passed, so it is omitted here.
df = pd.read_excel(io.BytesIO(data), sheet_name='data')

# Rescale the continuous features by their column mean so each is centred around 1.
for column in ("charges", "steps", "bmi"):
    df[column] = df[column] / df[column].mean()

# Bucket age into four right-closed ranges: (0,18], (18,35], (35,50], (50,65].
agerange = pd.cut(df.age, bins=[0,18,35,50,65], labels=[1,2,3,4])
# Store the bucket as an ordered integer code (0..3, -1 for ages outside the bins).
# pd.factorize would number buckets by first appearance in the data, which
# discards the ordering and makes the encoding depend on row order.
df.insert(0, "agerange", agerange.cat.codes.astype("int64"))

# Optional visual overview of pairwise relationships:
#sns.pairplot(df, kind="scatter")
#plt.show()

# Last expression of the cell: display the pairwise correlation table.
df.corr()

In [None]:
# Target column to predict.
y = df["insuranceclaim"]
# Drop columns that are not correlated to insuranceclaim -- age, sex, region --
# along with the target itself so it cannot leak into the features.
X = df.drop(["age","sex","region","insuranceclaim"], axis=1)

# Hold out 20% of the rows for evaluation; fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the three candidate classifiers on the same training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

clf_gb = GradientBoostingClassifier(random_state=0)
clf_gb.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
# Keep the result under its own name: assigning it to `confusion_matrix` would
# overwrite the imported sklearn function and break this cell on a second run.
logreg_confusion = confusion_matrix(y_test, y_pred)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
print('Accuracy of random forest model on test set: {:.2f}'.format(clf.score(X_test, y_test)))
print('Accuracy of gradientboost model on test set: {:.2f}'.format(clf_gb.score(X_test, y_test)))
# Confusion matrix and per-class report are for the logistic regression predictions only.
print(logreg_confusion)
print(classification_report(y_test, y_pred))

In [None]:
# ROC curves are built from each model's predict_proba scores (column 1),
# not from hard predict() labels.
logreg_scores = logreg.predict_proba(X_test)[:, 1]
forest_scores = clf.predict_proba(X_test)[:, 1]
boost_scores = clf_gb.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, logreg_scores)
fpr_, tpr_, thresholds_ = roc_curve(y_test, forest_scores)
fpr_gb, tpr_gb, thresholds_gb = roc_curve(y_test, boost_scores)

# Area under each curve, used in the legend labels below.
roc_auc_area = metrics.auc(fpr, tpr)
roc_auc_area_ = metrics.auc(fpr_, tpr_)
roc_auc_area_gb = metrics.auc(fpr_gb, tpr_gb)

fig, ax = plt.subplots()

# One (x, y, legend label) entry per model, drawn in this order.
curves = [
    (fpr, tpr, 'Logistic Regression (area = %0.2f)' % roc_auc_area),
    (fpr_, tpr_, 'Random Forest (area = %0.2f)' % roc_auc_area_),
    (fpr_gb, tpr_gb, 'Gradient Boost (area = %0.2f)' % roc_auc_area_gb),
]
for false_pos, true_pos, legend_text in curves:
    ax.plot(false_pos, true_pos, label=legend_text)

# Dashed red diagonal as the visual baseline.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")

plt.show()