<a href="https://colab.research.google.com/github/aissam-out/Predicting-Heart-Disease/blob/master/ML_algos_multi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import resources

In [0]:
# import modules
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [0]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# loading the dataset
downloaded_train_v = drive.CreateFile({'id':'******************'}) 
downloaded_train_l = drive.CreateFile({'id':'******************'}) 

downloaded_train_v.GetContentFile('train_values.csv')
downloaded_train_l.GetContentFile('train_labels.csv')

df_train_v = pd.read_csv("train_values.csv")
df_train_l = pd.read_csv("train_labels.csv")

# Data Exploration & preprocessing

In [0]:
# overview of the whole dataset
data = df_train_v.copy()
data['labels'] = df_train_l['heart_disease_present']
data.sample()

Unnamed: 0,patient_id,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,labels
172,j9tw19,2,reversible_defect,118,4,0,0,0,219,1.2,1,39,140,0,1


In [0]:
data.columns

Index(['patient_id', 'slope_of_peak_exercise_st_segment', 'thal',
       'resting_blood_pressure', 'chest_pain_type', 'num_major_vessels',
       'fasting_blood_sugar_gt_120_mg_per_dl', 'resting_ekg_results',
       'serum_cholesterol_mg_per_dl', 'oldpeak_eq_st_depression', 'sex', 'age',
       'max_heart_rate_achieved', 'exercise_induced_angina', 'labels'],
      dtype='object')

In [0]:
# replace strings with numbers
df_train_v['thal'] = df_train_v['thal'].replace({'normal':0,'reversible_defect':1, 'fixed_defect':2})

# drop ID column
df_train_v = df_train_v.drop("patient_id", axis=1)
df_train_l = df_train_l.drop("patient_id", axis=1)

# convert to values
df_train_v = (df_train_v.values).astype('float32') 
df_train_l = df_train_l.values.astype('float32')

# Training

In [0]:
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [0]:
evaluation = {}
evaluation_param = []

parameters = {
    "loss":["deviance"],
    "learning_rate": [0.02,0.1],
    "min_samples_leaf": [1],
    "max_depth":[5],
    "n_estimators":[100]
    }

def prep_results(model):
  predictions_indabax = model.predict_proba(test_indabax)
  submission = predictions_indabax
  results = pd.DataFrame(submission)
  df_sub_indabax["heart_disease_present"] = results
  
  return df_sub_indabax

def agregation_model():
  model = GridSearchCV(GradientBoostingClassifier(), parameters, cv=10, n_jobs=-1, verbose=3)
  model.fit(x_train, y_train)
  model.best_params_
  evaluation_param.append(model.best_params_)
  print(model.best_params_)
  predictions = model.predict_proba(x_test)[:, 1]
  print("log loss : ", log_loss(y_test, predictions))
  
  return model, predictions

# indabax test

In [0]:
# load the test set
downloaded_test_v = drive.CreateFile({'id':'********************'})
downloaded_test_v.GetContentFile('test_values.csv')

df_test_v = pd.read_csv("test_values.csv")
test_indabax = df_test_v.copy()

# preprocess test set
test_indabax['thal'] = test_indabax['thal'].replace({'normal':0,'reversible_defect':1, 'fixed_defect':2})
test_indabax = test_indabax.drop("patient_id", axis=1)
test_indabax = (test_indabax.values).astype('float32')

# load submission form
sub_form = drive.CreateFile({'id':'*********************'})
sub_form.GetContentFile('submission_format.csv')

df_sub = pd.read_csv("submission_format.csv")
df_sub_indabax = df_sub.copy()

# Save results

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
!ls

# Multi tests

# Test 1

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
y_train = np.ravel(y_train)

model_1, predictions_1 = agregation_model()
evaluation.update({"model_1" : log_loss(y_test, predictions_1)})

df_sub_indabax = prep_results(model_1)
df_sub_indabax.to_csv("results_ML_1.csv", header=True, index=False)

%cp results_ML_1.csv gdrive/'My Drive'/'Colab Notebooks'/indabax

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    3.2s finished


{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
log loss :  0.9844838451759874


# Test 2

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
y_train = np.ravel(y_train)

model_2, predictions_2 = agregation_model()
evaluation.update({"model_2" : log_loss(y_test, predictions_2)})

df_sub_indabax_2 = prep_results(model_2)
df_sub_indabax_2.to_csv("results_ML_2.csv", header=True, index=False)

%cp results_ML_2.csv gdrive/'My Drive'/'Colab Notebooks'/indabax

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.7s finished


{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
log loss :  0.6843644041414895


# Test 3

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
y_train = np.ravel(y_train)

model_3, predictions_3 = agregation_model()
evaluation.update({"model_3" : log_loss(y_test, predictions_3)})

df_sub_indabax_3 = prep_results(model_3)
df_sub_indabax_3.to_csv("results_ML_3.csv", header=True, index=False)

%cp results_ML_3.csv gdrive/'My Drive'/'Colab Notebooks'/indabax

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.7s finished


{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
log loss :  0.216397418918331


# Test 4

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
y_train = np.ravel(y_train)

model_4, predictions_4 = agregation_model()
evaluation.update({"model_4" : log_loss(y_test, predictions_4)})

df_sub_indabax_4 = prep_results(model_4)
df_sub_indabax_4.to_csv("results_ML_4.csv", header=True, index=False)

%cp results_ML_4.csv gdrive/'My Drive'/'Colab Notebooks'/indabax

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.6s finished


{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
log loss :  0.7287725126073084


# Test 5

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
y_train = np.ravel(y_train)

model_5, predictions_5 = agregation_model()
evaluation.update({"model_5" : log_loss(y_test, predictions_5)})

df_sub_indabax_5 = prep_results(model_5)
df_sub_indabax_5.to_csv("results_ML_5.csv", header=True, index=False)

%cp results_ML_5.csv gdrive/'My Drive'/'Colab Notebooks'/indabax

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.6s finished


{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
log loss :  1.1726018447789484


# Test 6

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
y_train = np.ravel(y_train)

model_6, predictions_6 = agregation_model()
evaluation.update({"model_6" : log_loss(y_test, predictions_6)})

df_sub_indabax_6 = prep_results(model_6)
df_sub_indabax_6.to_csv("results_ML_6.csv", header=True, index=False)

%cp results_ML_6.csv gdrive/'My Drive'/'Colab Notebooks'/indabax

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.6s finished


{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
log loss :  1.2028140154061047


# Test 7

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
y_train = np.ravel(y_train)

model_7, predictions_7 = agregation_model()
evaluation.update({"model_7" : log_loss(y_test, predictions_7)})

df_sub_indabax_7 = prep_results(model_7)
df_sub_indabax_7.to_csv("results_ML_7.csv", header=True, index=False)

%cp results_ML_7.csv gdrive/'My Drive'/'Colab Notebooks'/indabax

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.7s finished


{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
log loss :  0.7668950557332302


# Test 8

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
y_train = np.ravel(y_train)

model_8, predictions_8 = agregation_model()
evaluation.update({"model_8" : log_loss(y_test, predictions_8)})

df_sub_indabax_8 = prep_results(model_8)
df_sub_indabax_8.to_csv("results_ML_8.csv", header=True, index=False)

%cp results_ML_8.csv gdrive/'My Drive'/'Colab Notebooks'/indabax

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.7s finished


{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
log loss :  0.9982589453282338


# Test 9

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
y_train = np.ravel(y_train)

model_9, predictions_9 = agregation_model()
evaluation.update({"model_9" : log_loss(y_test, predictions_9)})

df_sub_indabax_9 = prep_results(model_9)
df_sub_indabax_9.to_csv("results_ML_9.csv", header=True, index=False)

%cp results_ML_9.csv gdrive/'My Drive'/'Colab Notebooks'/indabax

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.6s finished


{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
log loss :  1.0020376398176976


# Test 10

In [0]:
x_train, x_test, y_train, y_test = train_test_split(df_train_v, df_train_l, test_size=0.2)
y_train = np.ravel(y_train)

model_10, predictions_10 = agregation_model()
evaluation.update({"model_10" : log_loss(y_test, predictions_10)})

df_sub_indabax_10 = prep_results(model_10)
df_sub_indabax_10.to_csv("results_ML_10.csv", header=True, index=False)

%cp results_ML_10.csv gdrive/'My Drive'/'Colab Notebooks'/indabax

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    1.6s finished


{'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 100}
log loss :  0.809247281048007


# Evaluation

In [0]:
import operator
sorted_x = sorted(evaluation.items(), key=operator.itemgetter(1))
sorted_x

[('model_3', 0.216397418918331),
 ('model_2', 0.6843644041414895),
 ('model_4', 0.7287725126073084),
 ('model_7', 0.7668950557332302),
 ('model_10', 0.809247281048007),
 ('model_1', 0.9844838451759874),
 ('model_8', 0.9982589453282338),
 ('model_9', 1.0020376398176976),
 ('model_5', 1.1726018447789484),
 ('model_6', 1.2028140154061047)]

In [0]:
para = pd.DataFrame(evaluation_param)
para.index += 1
para


Unnamed: 0,learning_rate,loss,max_depth,min_samples_leaf,n_estimators
1,0.1,deviance,5,1,100
2,0.1,deviance,5,1,100
3,0.1,deviance,5,1,100
4,0.1,deviance,5,1,100
5,0.1,deviance,5,1,100
6,0.1,deviance,5,1,100
7,0.1,deviance,5,1,100
8,0.1,deviance,5,1,100
9,0.1,deviance,5,1,100
10,0.1,deviance,5,1,100
