In [1]:
## Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
## Read data
data=pd.read_csv('H:\Full stack data science-Python pratice\Machine Learning\Machine-Learning-Algorithms\Datasets for Algorithms\heart.csv')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Data contains columns like age,sex,few biological parameters and lastly target column is telling whether patient has the heart disease or not.

In [3]:
data.shape

(303, 14)

In [4]:
## Extract X and Y
X=data.iloc[:,:-1]
Y=data.iloc[:,-1]

In [5]:
## Shape of X and Y
print(X.shape,Y.shape)

(303, 13) (303,)


In [6]:
## Train Test Split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(242, 13) (242,)
(61, 13) (61,)


In [9]:
## Random forest 
RF=RandomForestClassifier()

## Fit
RF.fit(X_train,Y_train)

## Predict
Y_Pred=RF.predict(X_test)

## Accuracy
accuracy=accuracy_score(Y_test,Y_Pred)
print("Accuracy=",accuracy)

Accuracy= 0.8524590163934426


In [7]:
## Lets compare the random forest with other algorithms
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

## Create objects of these algorithms
gb=GradientBoostingClassifier()
svc=SVC()
lr=LogisticRegression()

In [8]:
## Gradient Boosting
gb.fit(X_train,Y_train)
y_pred_gb=gb.predict(X_test)
print("Gradient Boosting Accuracy Score:",accuracy_score(Y_test,y_pred_gb))

## SVC
svc.fit(X_train,Y_train)
y_pred_svc=svc.predict(X_test)
print("SVC Accuracy Score:",accuracy_score(Y_test,y_pred_svc))

## Logistic Regression
lr.fit(X_train,Y_train)
y_pred_lr=lr.predict(X_test)
print("Logistic Regression Accuracy Score:",accuracy_score(Y_test,y_pred_lr))

Gradient Boosting Accuracy Score: 0.8032786885245902
SVC Accuracy Score: 0.7377049180327869
Logistic Regression Accuracy Score: 0.8852459016393442


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


With this we can see random forest without any tuning also usually performs well.

In [10]:
## Let's tune random forest with max_samples parameter
rf=RandomForestClassifier(max_samples=0.70,random_state=42)
rf.fit(X_train,Y_train)
y_pred=rf.predict(X_test)
print("Random Forest Accuracy Score=",accuracy_score(Y_test,y_pred))

Random Forest Accuracy Score= 0.8688524590163934


In [11]:
## Cross Validation
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(RandomForestClassifier(),X,Y,cv=10,scoring='accuracy'))

0.8381720430107527

After cross validation, accuracy score will decrease.

### GridSearch CV

In [13]:
## Number of tress in random forest
n_estimators=[20,40,60,100,120]

## Number of features to consider at every split
max_features=[0.2,0.6,1.0]

## Maximum number of levels in a tree
max_depth=[2,8,None]

## Number of samples
max_samples=[0.5,0.75,1.0]


With all the above parameter choices total 108 different combination of parameters will form 108 different random forests.

In [14]:
## Form Parameter Grid
param_grid={'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth':max_depth,
            'max_samples':max_samples}
print(param_grid)

{'n_estimators': [20, 40, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [15]:
rf=RandomForestClassifier()

In [16]:
## Grid Search CV
from sklearn.model_selection import GridSearchCV

rf_grid=GridSearchCV(estimator=rf,
                     param_grid=param_grid,
                     cv=5,
                     verbose=2,
                     n_jobs=-1)

In [17]:
## Fit the grid
rf_grid.fit(X_train,Y_train)

Fitting 5 folds for each of 135 candidates, totalling 675 fits


In [18]:
## Best parameters
rf_grid.best_params_

{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 120}

In [19]:
## Best Score
rf_grid.best_score_

0.8346938775510205

When we have even more parameters to tune and a huge dataset then gridSearchCV becomes slow. So in that case randomSearchCV is used. So for examples,if we have 108 combinations of random forests then it will pick up few of them and select the best among them.

In [20]:
# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [21]:
param_grid={'n_estimators':n_estimators,
            'max_features':max_features,
            'max_depth':max_depth,
            'max_samples':max_samples,
            'bootstrap':bootstrap,
            'min_samples_split':min_samples_split,
            'min_samples_leaf':min_samples_leaf}
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [22]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid=RandomizedSearchCV(estimator=rf,
                           param_distributions=param_grid,
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

In [23]:
#Fit
rf_grid.fit(X_train,Y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


30 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\HP\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\HP\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\HP\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 402, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=

In [24]:
rf_grid.best_params_

{'n_estimators': 20,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_samples': 1.0,
 'max_features': 0.2,
 'max_depth': 2,
 'bootstrap': True}

In [25]:
rf_grid.best_score_

0.8140306122448979

With Randomized Search CV we may or may not get the accurate results but we definitely get fast results and that too are not much bad then the gridSearchCv.