<a href="https://colab.research.google.com/github/aissam-out/Predicting-Heart-Disease/blob/master/template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import resources

In [0]:
# import modules
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [0]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# loading the dataset
downloaded_train_v = drive.CreateFile({'id':'*********************'}) 
downloaded_train_l = drive.CreateFile({'id':'*********************'}) 

downloaded_train_v.GetContentFile('train_values.csv')
downloaded_train_l.GetContentFile('train_labels.csv')

df_train_v = pd.read_csv("train_values.csv")
df_train_l = pd.read_csv("train_labels.csv")

# Data Exploration & preprocessing

In [0]:
# overview of the whole dataset
data = df_train_v.copy()
data['labels'] = df_train_l['heart_disease_present']
data.sample()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,labels
150,x4yp0f,1,reversible_defect,108,2,0,0,0,309,0.0,1,54,156,0,0


In [0]:
data.columns

Index(['patient_id', 'slope_of_peak_exercise_st_segment', 'thal',
       'resting_blood_pressure', 'chest_pain_type', 'num_major_vessels',
       'fasting_blood_sugar_gt_120_mg_per_dl', 'resting_ekg_results',
       'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'sex', 'age',
       'max_heart_rate_achieved', 'exercise_induced_angina', 'labels'],
      dtype='object')

In [0]:
# replace strings with numbers
df_train_v['thal'] = df_train_v['thal'].replace({'normal':0,'reversible_defect':1, 'fixed_defect':2})

# drop ID column
df_train_v = df_train_v.drop("patient_id", axis=1)
df_train_l = df_train_l.drop("patient_id", axis=1)

# convert to values
df_train_v = (df_train_v.values).astype('float32') 
df_train_l = df_train_l.values.astype('float32')

In [0]:
# train test split
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')

# convert column-vector to 1d array
y_train = np.ravel(y_train)

x_train shape: (144, 13)
144 train samples


# Training

In [0]:
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [0]:
parameters = {
    "loss":["deviance"],
    "learning_rate": [0.1],
    "min_samples_leaf": [1],
    "max_depth":[5],
    "n_estimators":[100]
    }

model = GradientBoostingClassifier()
model_grid = GridSearchCV(model, parameters, cv=10, n_jobs=-1, verbose=3)

In [0]:
model_grid = model_grid.fit(x_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.8s finished


In [0]:
model = model_grid.best_estimator_

In [0]:
model.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [0]:
predictions = model.predict_proba(x_test)[:, 1]
log_loss(y_test, predictions)

0.836977607163178

# indabax test

In [0]:
# load the test set
downloaded_test_v = drive.CreateFile({'id':'***********************'})
downloaded_test_v.GetContentFile('test_values.csv')

df_test_v = pd.read_csv("test_values.csv")
test_indabax = df_test_v.copy()

# preprocess test set
test_indabax['thal'] = test_indabax['thal'].replace({'normal':0,'reversible_defect':1, 'fixed_defect':2})
test_indabax = test_indabax.drop("patient_id", axis=1)
test_indabax = (test_indabax.values).astype('float32')

# load submission form
sub_form = drive.CreateFile({'id':'***********************'})
sub_form.GetContentFile('submission_format.csv')

df_sub = pd.read_csv("submission_format.csv")
df_sub_indabax = df_sub.copy()

In [0]:
# Apply the chosen model to the test set
predictions_indabax = model.predict_proba(test_indabax)
predictions_indabax[0:5]

In [0]:
# select "heart_disease_present" column
submission = predictions_indabax[:, 1] #.round(1)
submission[0:5]

In [0]:
# add the results to the submission form
results = pd.DataFrame(submission)
df_sub_indabax["heart_disease_present"] = results
df_sub_indabax.to_csv("results_ML_1.csv", header=True, index=False)

df_sub_indabax.sample()

# Save results

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
!ls

In [0]:
%cp results_ML_1.csv gdrive/'My Drive'/'Colab Notebooks'/indabax