In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"
]

In [3]:
DATA_DIR = "data"

In [4]:
import os

In [5]:
cleveland_data = pd.read_csv(os.path.join(DATA_DIR, "cleveland.data"), names=columns)

In [6]:
cleveland_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [7]:
categorical = ["sex", "cp", "fbs", "restecg", "exang", "slope", "num"]

In [8]:
len(categorical)

7

In [9]:
cleveland_data["ca"] = cleveland_data["ca"].replace('?', np.nan)  # replace '?' with nan

In [10]:
cleveland_data["ca"] = pd.to_numeric(cleveland_data["ca"])  # From string values to float values

In [11]:
cleveland_data["ca"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 303 entries, 0 to 302
Series name: ca
Non-Null Count  Dtype  
--------------  -----  
299 non-null    float64
dtypes: float64(1)
memory usage: 2.5 KB


In [12]:
cleveland_data["thal"] = cleveland_data["thal"].replace('?', np.nan)  # replace '?' with nan

In [13]:
cleveland_data["thal"] = pd.to_numeric(cleveland_data["thal"])  # From string values to float values

In [14]:
cleveland_data["thal"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 303 entries, 0 to 302
Series name: thal
Non-Null Count  Dtype  
--------------  -----  
301 non-null    float64
dtypes: float64(1)
memory usage: 2.5 KB


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [16]:
cleveland_data = cleveland_data.dropna()

In [17]:
cleveland_data[categorical] = cleveland_data[categorical].astype("int32")

In [18]:
cleveland_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    float64
 1   sex       297 non-null    int32  
 2   cp        297 non-null    int32  
 3   trestbps  297 non-null    float64
 4   chol      297 non-null    float64
 5   fbs       297 non-null    int32  
 6   restecg   297 non-null    int32  
 7   thalach   297 non-null    float64
 8   exang     297 non-null    int32  
 9   oldpeak   297 non-null    float64
 10  slope     297 non-null    int32  
 11  ca        297 non-null    float64
 12  thal      297 non-null    float64
 13  num       297 non-null    int32  
dtypes: float64(7), int32(7)
memory usage: 26.7 KB


In [19]:
X = cleveland_data.drop(["num"], axis=1)

In [20]:
y = cleveland_data[["num"]]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10)

In [22]:
tree_classifier = DecisionTreeClassifier(max_depth=10)

In [23]:
tree_classifier.fit(X_train, y_train)

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
y_pred = tree_classifier.predict(X_test)

In [26]:
print("Tree score", accuracy_score(y_test, y_pred))

Tree score 0.4666666666666667


In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
forest_classifier = RandomForestClassifier(
    n_estimators=100,
    max_leaf_nodes=12,
    n_jobs=-1,
    random_state=42
)

In [29]:
forest_classifier.fit(X_train, y_train.values.flatten())

In [30]:
y_pred = forest_classifier.predict(X_test)

In [31]:
print("Random forest score", accuracy_score(y_test, y_pred))

Random forest score 0.5333333333333333


In [32]:
n_estimators = [int(x) for x in np.linspace(start=10, stop=250, num=10)]

In [33]:
max_features = ["sqrt", "log2"]

In [34]:
max_leaf_nodes = [None, 6, 12]

In [35]:
max_depth = [None, 4, 8]

In [36]:
bootstrap = [True, False]

In [37]:
param_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_leaf_nodes": max_leaf_nodes,
    "max_depth": max_depth,
    "bootstrap": bootstrap
}

In [38]:
from sklearn.model_selection import GridSearchCV

In [39]:
forest_grid = GridSearchCV(
    estimator=forest_classifier,
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1
)

In [41]:
forest_grid.fit(X_train, y_train.values.flatten())

Fitting 3 folds for each of 360 candidates, totalling 1080 fits


In [42]:
forest_grid.best_params_

{'bootstrap': True,
 'max_depth': 4,
 'max_features': 'sqrt',
 'max_leaf_nodes': 12,
 'n_estimators': 36}

In [43]:
print("Train accuracy", forest_grid.score(X_train, y_train))

Train accuracy 0.7078651685393258


In [44]:
print("Test accuracy", forest_grid.score(X_train, y_train))

Test accuracy 0.7078651685393258
