# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("heart.csv.xls")

In [5]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [7]:
df.shape

(303, 14)

In [39]:
df.isna().sum()    # No NULL values

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [41]:
df["target"].unique()

array([1, 0], dtype=int64)

In [42]:
df["target"].value_counts()

target
1    165
0    138
Name: count, dtype: int64

# Separating Independent and Dependent Columns

In [53]:
x = df.drop(columns=["target"])
y = df[["target"]]

In [54]:
x

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [55]:
y

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1
...,...
298,0
299,0
300,0
301,0


# Train Test Split

In [56]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)

In [62]:
print(x_train.shape[0])
print(x_test.shape[0])

212
91


# Random Forest Classifier

In [63]:
from sklearn.ensemble import RandomForestClassifier

In [64]:
rf = RandomForestClassifier()

In [65]:
# Model Training

rf.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [66]:
# Prediction

ypred = rf.predict(x_test)
ypred

array([1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1], dtype=int64)

# Model Evaluation

In [67]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [68]:
acc = accuracy_score(ypred, y_test)
print("Accuracy:", acc*100)

Accuracy: 87.91208791208791


In [69]:
conf_matrix = confusion_matrix(ypred, y_test)
conf_matrix

array([[33,  3],
       [ 8, 47]], dtype=int64)

In [71]:
clf_report = classification_report(ypred, y_test)
print(clf_report)

              precision    recall  f1-score   support

           0       0.80      0.92      0.86        36
           1       0.94      0.85      0.90        55

    accuracy                           0.88        91
   macro avg       0.87      0.89      0.88        91
weighted avg       0.89      0.88      0.88        91



# Hyperparameter Tuning

### GridSearchCV

In [103]:
# giving parameters

# Function to measure Quality is split    (default=Gini)
criterion = ["gini", "entropy"]                                       # 2-values

# Number of Trees in Random Forest
n_estimators = [20,60,100,120]                                        # 4-values

# Number of Features(columns) to consider at every split
max_features = ["auto","sqrt", "log2", None]                          # 4-values

# Maximum no. of Leaves in the trees or Layers 
max_depth = [2,4,6,8, None]                                           # 5-values

# No.of Samples
max_samples = [2,4,6,8]                                         # 4-values



# 2*4*4*5*4 = 640, So it will TRAIN 640 different Random Forests.

In [104]:
parameters = {
    'criterion': criterion,
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'max_samples': max_samples
}

In [105]:
rf_model = RandomForestClassifier()

In [106]:
from sklearn.model_selection import GridSearchCV

rf_gscv = GridSearchCV(
                  estimator = rf_model,   # estimator = Konse Algorithm ko run krna hai
                  param_grid = parameters,
                  cv = 5,
                  verbose = 2,    # it will show the outputs during the process
                  n_jobs = -1     # The no. of jobs to run in parallel 
                  )

In [107]:
rf_gscv.fit(x_train, y_train)

Fitting 5 folds for each of 640 candidates, totalling 3200 fits


800 fits failed out of a total of 3200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
669 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
s

In [108]:
rf_gscv.best_params_  # shows which are the Best parameters choosen by the gscv

{'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 'log2',
 'max_samples': 8,
 'n_estimators': 120}

In [109]:
rf_gscv.best_score_   # shows the Best score

0.8352159468438538

### RandomizedSearchCV

In [127]:
# giving parameters

# Function to measure Quality is split    (default=Gini)
criterion = ["gini", "entropy"]                                       # 2-values

# Number of Trees in Random Forest
n_estimators = [20,60,100,120]                                        # 4-values

# Number of Features(columns) to consider at every split
max_features = ["auto","sqrt", "log2", None]                          # 4-values

# Maximum no. of Leaves in the trees or Layers 
max_depth = [2,4,6,8, None]                                           # 5-values

# No.of Samples
max_samples = [2,4,6,8]                                               # 4-values

# Minimum number of samples required to split an internal node
min_samples_split = [2,5]                                             # 2-values

# Minimum number of samples required to be at a leaf node.
min_samples_leaf = [1,2]                                              # 2-values

# Bootstrap Samples
bootstrap =  [True,False]                                             # 2-values

# 2*4*4*5*4*2*2*2 = 5120, So it RANDOMLY chooses various different Random Forests

In [124]:
parameters_rscv = {
    'criterion': criterion,
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'max_samples': max_samples,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}

In [125]:
from sklearn.model_selection import RandomizedSearchCV

rf_rscv = RandomizedSearchCV(
                  estimator = rf_model,   # estimator = Konse Algorithm ko run krna hai
                  param_distributions = parameters_rscv,
                  cv = 5,
                  verbose = 2,    # it will show the outputs during the process
                  n_jobs = -1     # The no. of jobs to run in parallel 
                  )

In [126]:
rf_rscv.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\HP\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklear

In [128]:
rf_rscv.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_samples': 4,
 'max_features': 'log2',
 'max_depth': None,
 'criterion': 'entropy'}

In [129]:
rf_rscv.best_score_

0.7972314507198228