In [25]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

import requests
from io import StringIO

from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
import requests
url = "https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day65-random-forest/heart.csv"
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0"}
req = requests.get(url, headers=headers)
data = StringIO(req.text)

df1 = pd.read_csv(data)
df1.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
df1.to_csv('/home/saad/Downloads/heart.csv', index=False)

In [4]:
df = pd.read_csv('images/heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
df.shape

(303, 14)

In [6]:
X = df.iloc[:, 0:-1]
y = df.iloc[:,-1]

In [7]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [8]:
print(X_train.shape)
print(X_test.shape)

(242, 13)
(61, 13)


In [9]:
rf = RandomForestClassifier(max_samples=0.75,random_state=42)
gb = GradientBoostingClassifier()
svc = SVC()
lr = LogisticRegression()

In [10]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy for  Random_Forest will be: {} " .format(accuracy_score(y_test, y_pred)))

Accuracy for  Random_Forest will be: 0.9016393442622951 


In [11]:
gb.fit(X_train, y_train)
y_pred1 = gb.predict(X_test)
print("Accuracy for Gradient_Boosting will be: {} " .format(accuracy_score(y_test, y_pred1)))

Accuracy for Gradient_Boosting will be: 0.7704918032786885 


In [12]:
svc.fit(X_train, y_train)
y_pred2 = svc.predict(X_test)
print("Accuracy for SVC will be: {} " .format(accuracy_score(y_test, y_pred2)))

Accuracy for SVC will be: 0.7049180327868853 


In [13]:
lr.fit(X_train, y_train)
y_pred3 = lr.predict(X_test)
print("Accuracy for Logistic_Regression will be: {} " .format(accuracy_score(y_test, y_pred3)))

Accuracy for Logistic_Regression will be: 0.8852459016393442 


In [14]:
print('CV for Random_Forest: ',np.mean(cross_val_score(RandomForestClassifier(max_samples=0.50,random_state=42),X,y,cv=10,scoring='accuracy')))
print()
print('CV for Gradient_Boosting: ',np.mean(cross_val_score(GradientBoostingClassifier(),X,y,cv=10,scoring='accuracy')))
print()
print('CV for SVC: ',np.mean(cross_val_score(SVC(),X,y,cv=10,scoring='accuracy')))
print()
print('CV for Logistic_Regression: ',np.mean(cross_val_score(LogisticRegression(),X,y,cv=10,scoring='accuracy')))

CV for Random_Forest:  0.8413978494623656

CV for Gradient_Boosting:  0.7981720430107526

CV for SVC:  0.6604301075268817

CV for Logistic_Regression:  0.8283870967741935


### ***Hyperparameter tuning using Gridsearchcv and RandomizedSearchCV***

## GridSearchCV

In [15]:
# number of trees in random forest
n_estimators = [20, 60, 100, 120]

# number of features to consider at every split
max_features = [0.2, 0.6, 1.0]

# maximum number of levels in tree
max_depth = [2,8,None]

# number of samples
max_samples = [0.5,0.75,1.0]

# 108 diff random forest train (4*3*3*3)

In [16]:
param_grid = {'n_estimators':n_estimators,
                'max_features':max_features,
                'max_depth':max_depth,
                'max_samples':max_samples
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [17]:
rf=RandomForestClassifier()

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
rf_grid = GridSearchCV(estimator=rf,
                      param_grid=param_grid,
                      cv=5,
                      verbose=2, # show the output during the process
                      n_jobs=-1) # divide the work into every processor (similary make the processing faster)

In [20]:
rf_grid.fit(X_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [2, 8, None],
                         'max_features': [0.2, 0.6, 1.0],
                         'max_samples': [0.5, 0.75, 1.0],
                         'n_estimators': [20, 60, 100, 120]},
             verbose=2)

In [21]:
rf_grid.best_params_ # best parameters

{'max_depth': None,
 'max_features': 0.2,
 'max_samples': 0.5,
 'n_estimators': 20}

In [22]:
rf_grid.best_score_ # best accuracy

0.851530612244898

## RandomSearchCV

In [23]:
# number of trees in random forest
n_estimators = [20, 60, 100, 120]

# number of features to consider at every split
max_features = [0.2, 0.6, 1.0]

# maximum number of levels in tree
max_depth = [2,8,None]

# number of samples
max_samples = [0.5,0.75,1.0]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2,5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2]

In [24]:
param_grid = {'n_estimators':n_estimators,
                'max_features':max_features,
                'max_depth':max_depth,
                'max_samples':max_samples,
                'bootstrap':bootstrap,
                'min_samples_split':min_samples_split,
                'min_samples_leaf':min_samples_leaf
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [26]:
from sklearn.model_selection import RandomizedSearchCV

In [28]:
rf_grid = RandomizedSearchCV(estimator=rf,
                      param_distributions=param_grid,
                      cv=5,
                      verbose=2, # show the output during the process
                      n_jobs=-1) # divide the work into every processor (similary make the processing faster)

In [29]:
rf_grid.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 8, None],
                                        'max_features': [0.2, 0.6, 1.0],
                                        'max_samples': [0.5, 0.75, 1.0],
                                        'min_samples_leaf': [1, 2],
                                        'min_samples_split': [2, 5],
                                        'n_estimators': [20, 60, 100, 120]},
                   verbose=2)

In [30]:
rf_grid.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_samples': 0.75,
 'max_features': 0.2,
 'max_depth': None,
 'bootstrap': False}

In [31]:
rf_grid.best_score_

0.8098639455782312

##### It select the parameters value randomely and applying the task on them that's why it is faster than GridseachCV but it not gives so much best accuracy in comparision to GridseachCV.But when we have very large dataset then we can use this approache for hyperparameter tuning.