In [None]:
#removing all the warnings

import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np
data=pd.read_csv("/content/Pima Indian Diabetes Data Set CSV File.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
#whereever we have glucose value as 0 we replace it with median value else let it be glucose

data['Glucose']=np.where(data['Glucose']==0,data['Glucose'].median(),data['Glucose'])
data['Insulin']=np.where(data['Insulin']==0,data["Insulin"].median(),data['Insulin'])
data['SkinThickness']=np.where(data['SkinThickness']==0,data["SkinThickness"].median(),data['SkinThickness'])

In [None]:
#Selecting independent and dependent features

x=data.drop(columns="Outcome",axis=1)
y=data["Outcome"]

In [None]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33


In [None]:
#Train Test Split

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=33)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(n_estimators=10).fit(x_train,y_train)
#n_estimators is usually 100 represent number of trees
predict=rfc.predict(x_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
print(confusion_matrix(y_test,predict))
print(classification_report(y_test,predict))
print(accuracy_score(y_test,predict))

[[87 12]
 [26 29]]
              precision    recall  f1-score   support

           0       0.77      0.88      0.82        99
           1       0.71      0.53      0.60        55

    accuracy                           0.75       154
   macro avg       0.74      0.70      0.71       154
weighted avg       0.75      0.75      0.74       154

0.7532467532467533


In [None]:
#Manual Hyperparameter Tuning

model=RandomForestClassifier(n_estimators=300,criterion='entropy',max_features='sqrt',min_samples_leaf=10,random_state=100).fit(x_train,y_train)
predicts=model.predict(x_test)
print(confusion_matrix(y_test,predicts))
print(classification_report(y_test,predicts))
print(accuracy_score(y_test,predicts))

[[87 12]
 [28 27]]
              precision    recall  f1-score   support

           0       0.76      0.88      0.81        99
           1       0.69      0.49      0.57        55

    accuracy                           0.74       154
   macro avg       0.72      0.68      0.69       154
weighted avg       0.73      0.74      0.73       154

0.7402597402597403


In [None]:
#RANDOMIZED SEARCH CV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
#number of trees in random forest, we randomly take different number of trees given start from 200 and stop at 2000 that is between 200 and 2000 we take 10 equally spaced number of trees
n_estimators=[int(x) for x in np.linspace(start=200,stop=2000,num=10)]
#number of features to consider at every split
max_features=['auto','sqrt','log2']
#maximum number of levels in tree
max_depth=[int(x) for x in np.linspace(10,1000,10)]
#minimum number of samples required to split a node
min_samples_split=[1,3,4,5,7,9]
#minimum number of samples required at each leaf node
min_samples_leaf=[1,2,4,6,8]
#create the random grid
random_grid={'n_estimators':n_estimators,'max_features':max_features,'max_depth':max_depth,'min_samples_split':min_samples_split,'min_samples_leaf':min_samples_leaf,'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 3, 4, 5, 7, 9], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [None]:
#after performing iterations in this space it tells what values of the parameters we should take out of the given values

In [None]:
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=100,n_jobs=-1)

#out of the above given parametrs the code will iterate for specified number of times, the number of iterations is 100 that is it will iterate 100 times taking some values from the given parameters, then cv=3 means cross validation is 3 that is 3 times the train and test split will occur therefore a total of 100*3 that is 300 fit will be performed
rf_randomcv.fit(x_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
#for taking the best parameters/estimators values
rf_randomcv.best_params_

In [None]:
#for taking the best estimators/parametrs values
rf_randomcv.best_estimator_

In [None]:
#finally creating the variable containing best estimator values
best_random_grid=rf_randomcv.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score
y_pred=best_random_grid.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[87 12]
 [25 30]]
0.7597402597402597
              precision    recall  f1-score   support

           0       0.78      0.88      0.82        99
           1       0.71      0.55      0.62        55

    accuracy                           0.76       154
   macro avg       0.75      0.71      0.72       154
weighted avg       0.75      0.76      0.75       154

