In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.pandas.set_option('display.max_columns',0)

In [2]:
data=pd.read_csv("Data_For_Final_Model.csv",index_col=0)
data.sample(5,random_state=1)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
554,0.058824,0.258065,0.408163,0.173913,0.120337,0.58805,0.174589,0.116667,0
732,0.117647,0.83871,0.653061,0.326087,0.126354,0.827044,0.252332,0.05,1
321,0.176471,0.43871,0.510204,0.25,0.132371,0.421384,0.052865,0.066667,1
444,0.235294,0.470968,0.387755,0.054348,0.132371,0.361635,0.134163,0.15,1
392,0.058824,0.56129,0.408163,0.076087,0.481348,0.172956,0.138161,0.0,0


In [3]:
X=data.drop(['Outcome'],axis=1)
y=data.Outcome

In [4]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=2,
                                               stratify=y)

In [5]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(697, 8)
(299, 8)
(697,)
(299,)


In [6]:
print(f"Yes And No Value In Tain(y_Train)")
print(y_train.value_counts())

Yes And No Value In Tain(y_Train)
1    349
0    348
Name: Outcome, dtype: int64


In [7]:
print(f"Yes And No Value In Tain(y_Test)")
print(y_test.value_counts())

Yes And No Value In Tain(y_Test)
0    150
1    149
Name: Outcome, dtype: int64


### LogisticRegression

In [8]:
from sklearn.linear_model import LogisticRegression
log_model=LogisticRegression(random_state=34)

In [9]:
log_model.fit(X_train,y_train)

LogisticRegression(random_state=34)

In [10]:
prediction=log_model.predict(X_test)

In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.71      0.79      0.75       150
           1       0.76      0.68      0.72       149

    accuracy                           0.73       299
   macro avg       0.74      0.73      0.73       299
weighted avg       0.74      0.73      0.73       299



In [12]:
print(log_model.score(X_train,y_train))
print(log_model.score(X_test,y_test))

0.7388809182209469
0.7324414715719063


### RandomForest

In [13]:
from sklearn.ensemble import RandomForestClassifier
rfm=RandomForestClassifier(random_state=3)

In [14]:
rfm.fit(X_train,y_train)

RandomForestClassifier(random_state=3)

In [15]:
prediction=rfm.predict(X_test)
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.95      0.81      0.87       150
           1       0.83      0.96      0.89       149

    accuracy                           0.88       299
   macro avg       0.89      0.88      0.88       299
weighted avg       0.89      0.88      0.88       299



In [16]:
print(rfm.score(X_train,y_train))
print(rfm.score(X_test,y_test))

1.0
0.882943143812709


### Support Vector Machine

In [17]:
from sklearn.svm import SVC
classifier=SVC(random_state=4)

In [18]:
classifier.fit(X_train,y_train)  

SVC(random_state=4)

In [19]:
prediction=classifier.predict(X_test)
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.82      0.75      0.78       150
           1       0.77      0.84      0.80       149

    accuracy                           0.79       299
   macro avg       0.80      0.79      0.79       299
weighted avg       0.80      0.79      0.79       299



In [20]:
print(classifier.score(X_train,y_train))
print(classifier.score(X_test,y_test))

0.8034433285509326
0.7926421404682275


#### Summary Of Three Diff. ML Algorithm

1. LogisticRegression Accuracy is 73% .
2. RandomForest Accuracy is  89% .
3. Support Vector Machine Accuracy is 78% .

From the Above Analysis We can Extrapolate(conclude) that without Any Hyper-parameter tunning we got 89% Accuracy by using RandomForest Model. So, Let's use RandomForest Model for Hyper-Parameter to achieve Better Accuracy . 

#### Hyperparameter tuning Method
It is a Process of which help us findout the best parameter for our model.
1. GridSearchCV
2. RandomizedSearchCV
3. Bayesian Optimization-Automate Hyperparameter Tuning (Hyperopt)
4. Sequential model based optimization
5. Optuna-Automate Hyperparameter Tuning
6. Genetic Algorithm


In [21]:
print("Default Parameter Used By RandomForest \n",rfm.get_params())

Default Parameter Used By RandomForest 
 {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 3, 'verbose': 0, 'warm_start': False}


In [22]:
rfm=RandomForestClassifier(random_state=3)

In [23]:
from sklearn.model_selection import RandomizedSearchCV
rfm_grid={
    'n_estimators':np.arange(60,150,10),
    'criterion':['gini','entropy'],
    'max_features' : ["auto", "sqrt", "log2"]
  }
print(rfm_grid)

{'n_estimators': array([ 60,  70,  80,  90, 100, 110, 120, 130, 140]), 'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'sqrt', 'log2']}


In [24]:
rfm_randomcv=RandomizedSearchCV(estimator=rfm,param_distributions=rfm_grid,n_jobs=-1,
                               random_state=20,verbose=1)
rfm_randomcv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(estimator=RandomForestClassifier(random_state=3), n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'n_estimators': array([ 60,  70,  80,  90, 100, 110, 120, 130, 140])},
                   random_state=20, verbose=1)

In [25]:
rfm_randomcv.best_params_

{'n_estimators': 100, 'max_features': 'auto', 'criterion': 'gini'}

Above Parameter And Default Parameter are Same. So, No need To of Parameter tuning.

In [26]:
prediction=rfm_randomcv.predict(X_test)
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.95      0.81      0.87       150
           1       0.83      0.96      0.89       149

    accuracy                           0.88       299
   macro avg       0.89      0.88      0.88       299
weighted avg       0.89      0.88      0.88       299



In [27]:
print(rfm_randomcv.score(X_train,y_train))
print(rfm_randomcv.score(X_test,y_test))

1.0
0.882943143812709


In [28]:
X.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [29]:
test_data_1=[[0.352941,0.387097,0.510204,0.119565,0.169675,0.367925,0.286095,0.333333
]]
test_data_2=[[0.588235,0.296774,0.622449,0.271739,0.132371,0.525157,0.331853,0.583333]]
test_data_3=[[0.058824,0.393548,0.346939,0.239130,0.132371,0.191824,0.048423,0.000000]]
test_data_4=[[0.294118,0.593548,0.591837,0.239130,0.132371,0.443396,0.249667,0.800000]]
test_data_5=[[0.588235,0.800000,0.510204,0.239130,0.132371,0.622642,0.203909,0.216667]]

In [30]:
print(rfm_randomcv.predict(test_data_1))
print(rfm_randomcv.predict(test_data_2))
print(rfm_randomcv.predict(test_data_3))
print(rfm_randomcv.predict(test_data_4))
print(rfm_randomcv.predict(test_data_5))

[1]
[1]
[0]
[1]
[1]


In [31]:
X_test.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
165,0.352941,0.387097,0.510204,0.119565,0.169675,0.367925,0.286095,0.333333
542,0.588235,0.296774,0.622449,0.271739,0.132371,0.525157,0.331853,0.583333
196,0.058824,0.393548,0.346939,0.23913,0.132371,0.191824,0.048423,0.0
684,0.294118,0.593548,0.591837,0.23913,0.132371,0.443396,0.249667,0.8
11,0.588235,0.8,0.510204,0.23913,0.132371,0.622642,0.203909,0.216667


In [32]:
y_test.head()

165    1
542    1
196    0
684    0
11     1
Name: Outcome, dtype: int64

In [33]:
#Row Number 5 (False)
test_data=[[1,89,66,23,94,28.1,0.167,21]]
print(rfm_randomcv.predict(test_data))

[1]


In [34]:
#Row Number 179(True)
test_data=[[0,129,110,46,130,67.1,0.319,26]]
print(rfm_randomcv.predict(test_data))

[1]


In [35]:
#Row Number 501 (True)
test_data=[[6,154,74,32,193,29.3,0.839,39]]
print(rfm_randomcv.predict(test_data))

[0]


In [36]:
#Row Number (False)
test_datat=[[2,82,52,22,115,28.5,1.699,25]]
print(rfm_randomcv.predict(test_data))

[0]


In [40]:
rfm_randomcv

AttributeError: 'RandomizedSearchCV' object has no attribute 'summary'

In [37]:
#Model Saving
import pickle
with open('Diabetes_Model','wb') as f:
    pickle.dump(rfm_randomcv,f)

In [38]:
#Testing the model
with open('Diabetes_Model','rb') as f:
    mod=pickle.load(f)

In [39]:
x=mod.predict([[6,154,74,32,193,29.3,0.839,39]])
x

array([0])