In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import *
from sklearn.metrics import *
import warnings
warnings.filterwarnings("ignore")

In [81]:
df = pd.read_table("data/heart.dat",sep=" ",header=None)
df.columns = ["age","sex","chest_pain_type","resting_blood_pressure","serum_cholesterol_mg_per_dl","fasting_blood_sugar_gt_120_mg_per_dl","resting_ekg_results","max_heart_rate_achieved","exercise_induced_angina","oldpeak_eq_st_depression","slope_of_peak_exercise_st_segment","num_major_vessels","thal","heart_disease_present"]
df["heart_disease_present"] = df["heart_disease_present"] - 1
df

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholesterol_mg_per_dl,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,max_heart_rate_achieved,exercise_induced_angina,oldpeak_eq_st_depression,slope_of_peak_exercise_st_segment,num_major_vessels,thal,heart_disease_present
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,1
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,0
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,1
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,0
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,0
266,44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,0
267,56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0
268,57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,0


In [82]:
pp = df["heart_disease_present"].value_counts().reset_index()
pp["percent"] = pp["count"] * 100 / pp["count"].sum()
rr = pd.DataFrame(columns=["train","test"])

rr["train"] = ((pp["percent"] / 100) * (270 * (75/100)))
rr["train"] = rr["train"].round(0)

rr["test"] = ((pp["percent"] / 100) * (270 * (25/100)))
rr["test"] = rr["test"].round(0)

rr = rr.astype("int64")

rr

Unnamed: 0,train,test
0,112,38
1,90,30


In [83]:
df_shuffled = df.sample(frac=1,random_state=np.random.randint(0,1000))

no_heart_disease_indices = df_shuffled[df_shuffled["heart_disease_present"]==0].index.values

yes_heart_disease_indices = df_shuffled[df_shuffled["heart_disease_present"]==1].index.values

In [84]:
training_indices = np.append(no_heart_disease_indices[0:int(round((len(no_heart_disease_indices) * 75 / 100),0))],
yes_heart_disease_indices[0:int(round((len(yes_heart_disease_indices) * 75 / 100),0))])

testing_indices = np.append(no_heart_disease_indices[int(round((len(no_heart_disease_indices) * 75 / 100),0)):],
          yes_heart_disease_indices[int(round((len(yes_heart_disease_indices) * 75 / 100),0)):])

In [85]:
df_train = df_shuffled.loc[training_indices,:].reset_index(drop=True)
df_test = df_shuffled.loc[testing_indices,:].reset_index(drop=True)

In [86]:
X_train = df_train.drop(columns=["heart_disease_present"])
X_test = df_test.drop(columns=["heart_disease_present"])
y_train = df_train["heart_disease_present"]
y_test = df_test["heart_disease_present"]

In [87]:
log_regr = LogisticRegression()
log_regr.fit(X_train, y_train)

In [90]:
y_pred = log_regr.predict(X_test)
y_pred_proba = log_regr.predict_proba(X_test)[:,1]

In [91]:
y_pred_proba

array([0.12809539, 0.02868805, 0.15681275, 0.26874456, 0.11417449,
       0.13571652, 0.05419833, 0.25700063, 0.03453623, 0.01089602,
       0.904547  , 0.54796757, 0.48820464, 0.94798217, 0.05649191,
       0.51045485, 0.26787822, 0.0909929 , 0.03734958, 0.02741615,
       0.01699642, 0.11748339, 0.09355117, 0.04271514, 0.0976964 ,
       0.16586733, 0.18707678, 0.2169146 , 0.04291946, 0.20072557,
       0.22677665, 0.18570354, 0.27671802, 0.71585453, 0.04435748,
       0.02700062, 0.16607379, 0.0921907 , 0.98540251, 0.78699198,
       0.06142669, 0.70790436, 0.96686367, 0.77796407, 0.93748139,
       0.76100101, 0.78737857, 0.97345219, 0.96410232, 0.99548864,
       0.97859387, 0.98961307, 0.86999839, 0.1402332 , 0.89502147,
       0.6856609 , 0.30081971, 0.91658962, 0.61435124, 0.97467698,
       0.91596489, 0.99531733, 0.12901057, 0.19650317, 0.96889435,
       0.89496471, 0.42322577, 0.78702851])

In [99]:
logloss = log_loss(y_test, y_pred_proba)
logloss

0.3936415344936038