In [2]:

from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score,make_scorer,f1_score
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [3]:
df=pd.read_csv("heart.csv")
X=df.iloc[:,:11]
y=df.iloc[:,11]
X

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat


In [4]:
one_hot_variables=["Sex","ChestPainType","RestingECG","ExerciseAngina","ST_Slope"]
X=pd.get_dummies(data=X,prefix=one_hot_variables,columns=one_hot_variables)
X.shape

(918, 20)

In [5]:
X_norm=StandardScaler().fit_transform(X)
X_train,X_test,y_train,y_test=train_test_split(X_norm,y,train_size=0.8,random_state=42)

In [411]:

param_grid = {
    "min_child_weight": [2, 10, 30, 50, 100, 200, 300, 700], # too low-> overfitting ,too much->underfitting
    "max_depth": [1, 2, 3, 4, 8, 16, 32, 64, None], # too much-> overfitting ,too low-> underfitting
    "n_estimators": [10, 50, 100, 500,750,1000],
    "subsample": [0.6, 0.8, 1.0],  # how much the train data that is used in the training phase (reduce overfitting)
    "colsample_bytree": [0.4, 0.6, 0.8, 1.0], # how much coulmns that is used (reduce overfitting)
    "alpha": [0, 0.01, 0.1, 1, 10],  # L1 regularization
    "lambda": [0.01, 0.1, 1, 10, 100],  # L2 regularization
    "learning_rate":[0.001,0.01,0.1,1,10]
}
accuracy=make_scorer(accuracy_score)
model=XGBClassifier(verbosity = 1,random_state=42,eval_metric='logloss',early_stopping_rounds=10)
Gs=RandomizedSearchCV(estimator=model,param_distributions=param_grid,n_jobs=-1,n_iter=1000,cv=5,scoring=accuracy).fit(X_train,y_train,eval_set=[(X_test,y_test)],verbose=False)

In [412]:

yhat_train=Gs.predict(X_train)
yhat_test=Gs.predict(X_test)
yhat_train_accuracy=accuracy_score(y_train,yhat_train)
yhat_test_accuracy=accuracy_score(y_test,yhat_test)
yhat_train_f1score=f1_score(y_train,yhat_train)
yhat_test_f1score=f1_score(y_test,yhat_test)
print(f"train accuracy={yhat_train_accuracy:0.2f}")
print(f"test accuracy={yhat_test_accuracy:0.2f}")
print(f"train f1_score={yhat_train_f1score:0.2f}")
print(f"test f1_score={yhat_test_f1score:0.2f}")
print("------------------------------------------------------")
print(Gs.best_params_)



train accuracy=0.94
test accuracy=0.88
train f1_score=0.95
test f1_score=0.90
------------------------------------------------------
{'subsample': 0.6, 'n_estimators': 750, 'min_child_weight': 2, 'max_depth': 8, 'learning_rate': 0.01, 'lambda': 1, 'colsample_bytree': 0.8, 'alpha': 0}


In [None]:
#1:{'subsample': 0.6, 'n_estimators': 1000, 'min_child_weight': 2, 'max_depth': 32, 'learning_rate': 0.1, 'lambda': 10, 'colsample_bytree': 1.0, 'alpha': 0},0.91,0.90


In [15]:


model=XGBClassifier(max_depth=32,min_child_weight=2,n_estimators=1000,subsample=0.6,colsample_bytree=1.0,reg_lambda=10,reg_alpha=0,random_state=42,learning_rate=0.1,verbosity=1,early_stopping_rounds=10,eval_metric="logloss").fit(X_train,y_train,eval_set=[(X_test,y_test)],verbose=False)
yhat_train=model.predict(X_train)
yhat_test=model.predict(X_test)
yhat_train_accuracy=accuracy_score(y_train,yhat_train)
yhat_test_accuracy=accuracy_score(y_test,yhat_test)
yhat_train_f1score=f1_score(y_train,yhat_train)
yhat_test_f1score=f1_score(y_test,yhat_test)
print(f"train accuracy={yhat_train_accuracy*100:0.2f}%")
print(f"test accuracy={yhat_test_accuracy*100:0.2f}%")
print(f"train f1_score={yhat_train_f1score*100:0.2f}%")
print(f"test f1_score={yhat_test_f1score*100:0.2f}%")    

train accuracy=91.28%
test accuracy=89.67%
train f1_score=92.16%
test f1_score=91.08%
