
    Introduction: Describes the data and where you got the data. Describe the question being answered and the method(s) being used to answer the question.
    Data pre-processing: What's needed to load the data, clean the data, normalize, etc.
    Model setup: Setup one or more models
    Hyperparameter tuning: Do some playing with the model hyperparameters (learning rate, optimizer, batch size, epochs, whatever makes sense)
    Results: How did the model do
    Discussion: Summarize what worked, what didn't etc.


In [60]:
import pandas as pd
import numpy as np

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
df = pd.read_csv(url, sep=',', header=None)

URLError: <urlopen error EOF occurred in violation of protocol (_ssl.c:997)>

In [46]:
# Preview the first 5 rows
print(df.head())

     0    1    2      3      4    5    6      7    8    9    10   11   12  13
0  63.0  1.0  1.0  145.0  233.0  1.0  2.0  150.0  0.0  2.3  3.0  0.0  6.0   0
1  67.0  1.0  4.0  160.0  286.0  0.0  2.0  108.0  1.0  1.5  2.0  3.0  3.0   2
2  67.0  1.0  4.0  120.0  229.0  0.0  2.0  129.0  1.0  2.6  2.0  2.0  7.0   1
3  37.0  1.0  3.0  130.0  250.0  0.0  0.0  187.0  0.0  3.5  3.0  0.0  3.0   0
4  41.0  0.0  2.0  130.0  204.0  0.0  2.0  172.0  0.0  1.4  1.0  0.0  3.0   0


In [47]:
df.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'] 

In [48]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [None]:
df = pd.to_numeric(df, errors='coerce')
df = df.dropna()

In [49]:
for x in ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']:
    df[x] = df[x].astype('category')

In [61]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop('num', axis=1), df['num'], test_size=0.2, random_state=5)

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

clf = xgb.XGBClassifier()

param_grid = {
    'max_depth': list(range(10)),
    'learning_rate': np.arange(0.05, 0.5+0.05, 0.05),
    'n_estimators': np.arange(10, 110, 10),
    'gamma': np.arange(0, 0.30, 0.05),
    'objective': ['binary:logistic'],
    'random_state': [5],
    'enable_categorical': [True],
    'tree_method': ['gpu_hist']
}

# Creating the GridSearchCV object
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2
)

# Fitting the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Printing the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 6000 candidates, totalling 30000 fits
[CV] END enable_categorical=True, gamma=0.0, learning_rate=0.05, max_depth=0, n_estimators=10, objective=binary:logistic, random_state=5, tree_method=gpu_hist; total time=   0.0s
[CV] END enable_categorical=True, gamma=0.0, learning_rate=0.05, max_depth=0, n_estimators=10, objective=binary:logistic, random_state=5, tree_method=gpu_hist; total time=   0.0s
[CV] END enable_categorical=True, gamma=0.0, learning_rate=0.05, max_depth=0, n_estimators=10, objective=binary:logistic, random_state=5, tree_method=gpu_hist; total time=   0.0s
[CV] END enable_categorical=True, gamma=0.0, learning_rate=0.05, max_depth=0, n_estimators=10, objective=binary:logistic, random_state=5, tree_method=gpu_hist; total time=   0.0s
[CV] END enable_categorical=True, gamma=0.0, learning_rate=0.05, max_depth=0, n_estimators=10, objective=binary:logistic, random_state=5, tree_method=gpu_hist; total time=   0.0s
[CV] END enable_categorical=True, gamma