In \[579\]:

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns

    sns.set()

    from pandas.plotting import scatter_matrix
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    from sklearn.metrics import confusion_matrix
    from sklearn import metrics
    from sklearn.metrics import classification_report
    import warnings
    warnings.filterwarnings('ignore')
    %matplotlib inline

In \[580\]:

    diabetes_df = pd.read_csv('diabetes.csv')
    diabetes_df.head()

Out\[580\]:

|     | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI  | DiabetesPedigreeFunction | Age | Outcome |
|-----|-------------|---------|---------------|---------------|---------|------|--------------------------|-----|---------|
| 0   | 6           | 148     | 72            | 35            | 0       | 33.6 | 0.627                    | 50  | 1       |
| 1   | 1           | 85      | 66            | 29            | 0       | 26.6 | 0.351                    | 31  | 0       |
| 2   | 8           | 183     | 64            | 0             | 0       | 23.3 | 0.672                    | 32  | 1       |
| 3   | 1           | 89      | 66            | 23            | 94      | 28.1 | 0.167                    | 21  | 0       |
| 4   | 0           | 137     | 40            | 35            | 168     | 43.1 | 2.288                    | 33  | 1       |

In \[581\]:

    diabetes_df.columns

Out\[581\]:

    Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
           'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
          dtype='object')

In \[582\]:

    diabetes_df.info()

    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 768 entries, 0 to 767
    Data columns (total 9 columns):
     #   Column                    Non-Null Count  Dtype  
    ---  ------                    --------------  -----  
     0   Pregnancies               768 non-null    int64  
     1   Glucose                   768 non-null    int64  
     2   BloodPressure             768 non-null    int64  
     3   SkinThickness             768 non-null    int64  
     4   Insulin                   768 non-null    int64  
     5   BMI                       768 non-null    float64
     6   DiabetesPedigreeFunction  768 non-null    float64
     7   Age                       768 non-null    int64  
     8   Outcome                   768 non-null    int64  
    dtypes: float64(2), int64(7)
    memory usage: 54.1 KB

In \[583\]:

    diabetes_df.describe()

Out\[583\]:

|       | Pregnancies | Glucose    | BloodPressure | SkinThickness | Insulin    | BMI        | DiabetesPedigreeFunction | Age        | Outcome    |
|-------|-------------|------------|---------------|---------------|------------|------------|--------------------------|------------|------------|
| count | 768.000000  | 768.000000 | 768.000000    | 768.000000    | 768.000000 | 768.000000 | 768.000000               | 768.000000 | 768.000000 |
| mean  | 3.845052    | 120.894531 | 69.105469     | 20.536458     | 79.799479  | 31.992578  | 0.471876                 | 33.240885  | 0.348958   |
| std   | 3.369578    | 31.972618  | 19.355807     | 15.952218     | 115.244002 | 7.884160   | 0.331329                 | 11.760232  | 0.476951   |
| min   | 0.000000    | 0.000000   | 0.000000      | 0.000000      | 0.000000   | 0.000000   | 0.078000                 | 21.000000  | 0.000000   |
| 25%   | 1.000000    | 99.000000  | 62.000000     | 0.000000      | 0.000000   | 27.300000  | 0.243750                 | 24.000000  | 0.000000   |
| 50%   | 3.000000    | 117.000000 | 72.000000     | 23.000000     | 30.500000  | 32.000000  | 0.372500                 | 29.000000  | 0.000000   |
| 75%   | 6.000000    | 140.250000 | 80.000000     | 32.000000     | 127.250000 | 36.600000  | 0.626250                 | 41.000000  | 1.000000   |
| max   | 17.000000   | 199.000000 | 122.000000    | 99.000000     | 846.000000 | 67.100000  | 2.420000                 | 81.000000  | 1.000000   |

In \[584\]:

    diabetes_df.isnull().head(10)

Out\[584\]:

|     | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI   | DiabetesPedigreeFunction | Age   | Outcome |
|-----|-------------|---------|---------------|---------------|---------|-------|--------------------------|-------|---------|
| 0   | False       | False   | False         | False         | False   | False | False                    | False | False   |
| 1   | False       | False   | False         | False         | False   | False | False                    | False | False   |
| 2   | False       | False   | False         | False         | False   | False | False                    | False | False   |
| 3   | False       | False   | False         | False         | False   | False | False                    | False | False   |
| 4   | False       | False   | False         | False         | False   | False | False                    | False | False   |
| 5   | False       | False   | False         | False         | False   | False | False                    | False | False   |
| 6   | False       | False   | False         | False         | False   | False | False                    | False | False   |
| 7   | False       | False   | False         | False         | False   | False | False                    | False | False   |
| 8   | False       | False   | False         | False         | False   | False | False                    | False | False   |
| 9   | False       | False   | False         | False         | False   | False | False                    | False | False   |

In \[585\]:

    diabetes_df.isnull().sum()

Out\[585\]:

    Pregnancies                 0
    Glucose                     0
    BloodPressure               0
    SkinThickness               0
    Insulin                     0
    BMI                         0
    DiabetesPedigreeFunction    0
    Age                         0
    Outcome                     0
    dtype: int64

In \[586\]:

    diabetes_df_copy = diabetes_df.copy(deep = True)
    diabetes_df_copy[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = diabetes_df_copy[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)

    # Showing the Count of NANs
    print(diabetes_df_copy.isnull().sum())

    Pregnancies                   0
    Glucose                       5
    BloodPressure                35
    SkinThickness               227
    Insulin                     374
    BMI                          11
    DiabetesPedigreeFunction      0
    Age                           0
    Outcome                       0
    dtype: int64

In \[587\]:

    p = diabetes_df.hist(figsize = (20,20))

![](attachment:vertopal_5fd938210cd741f1ab47a680760db8b1/08813fb60f09836f4267ee0df70ee7742d293209.png)

In \[588\]:

    diabetes_df_copy['Glucose'].fillna(diabetes_df_copy['Glucose'].mean(), inplace = True)
    diabetes_df_copy['BloodPressure'].fillna(diabetes_df_copy['BloodPressure'].mean(), inplace = True)
    diabetes_df_copy['SkinThickness'].fillna(diabetes_df_copy['SkinThickness'].median(), inplace = True)
    diabetes_df_copy['Insulin'].fillna(diabetes_df_copy['Insulin'].median(), inplace = True)
    diabetes_df_copy['BMI'].fillna(diabetes_df_copy['BMI'].median(), inplace = True)

In \[589\]:

    p = diabetes_df_copy.hist(figsize = (20,20))

![](attachment:vertopal_5fd938210cd741f1ab47a680760db8b1/a92e8ac92477a0edfedf0a02e965c64abc4cfefa.png)

In \[590\]:

    diabetes_df_copy.head()

Out\[590\]:

|     | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI  | DiabetesPedigreeFunction | Age | Outcome |
|-----|-------------|---------|---------------|---------------|---------|------|--------------------------|-----|---------|
| 0   | 6           | 148.0   | 72.0          | 35.0          | 125.0   | 33.6 | 0.627                    | 50  | 1       |
| 1   | 1           | 85.0    | 66.0          | 29.0          | 125.0   | 26.6 | 0.351                    | 31  | 0       |
| 2   | 8           | 183.0   | 64.0          | 29.0          | 125.0   | 23.3 | 0.672                    | 32  | 1       |
| 3   | 1           | 89.0    | 66.0          | 23.0          | 94.0    | 28.1 | 0.167                    | 21  | 0       |
| 4   | 0           | 137.0   | 40.0          | 35.0          | 168.0   | 43.1 | 2.288                    | 33  | 1       |

In \[591\]:

    sc_X = StandardScaler()
    X =  pd.DataFrame(sc_X.fit_transform(diabetes_df_copy.drop(["Outcome"],axis = 1),), columns=['Pregnancies', 
    'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'])
    X.head()

Out\[591\]:

|     | Pregnancies | Glucose   | BloodPressure | SkinThickness | Insulin   | BMI       | DiabetesPedigreeFunction | Age       |
|-----|-------------|-----------|---------------|---------------|-----------|-----------|--------------------------|-----------|
| 0   | 0.639947    | 0.865108  | -0.033518     | 0.670643      | -0.181541 | 0.166619  | 0.468492                 | 1.425995  |
| 1   | -0.844885   | -1.206162 | -0.529859     | -0.012301     | -0.181541 | -0.852200 | -0.365061                | -0.190672 |
| 2   | 1.233880    | 2.015813  | -0.695306     | -0.012301     | -0.181541 | -1.332500 | 0.604397                 | -0.105584 |
| 3   | -0.844885   | -1.074652 | -0.529859     | -0.695245     | -0.540642 | -0.633881 | -0.920763                | -1.041549 |
| 4   | -1.141852   | 0.503458  | -2.680669     | 0.670643      | 0.316566  | 1.549303  | 5.484909                 | -0.020496 |

In \[592\]:

    y = diabetes_df_copy.Outcome #traget coloumn
    y

Out\[592\]:

    0      1
    1      0
    2      1
    3      0
    4      1
          ..
    763    0
    764    0
    765    0
    766    1
    767    0
    Name: Outcome, Length: 768, dtype: int64

In \[593\]:

    X = diabetes_df.drop('Outcome', axis=1)
    y = diabetes_df['Outcome']

In \[594\]:

    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25,
                                                        random_state=0)

In \[595\]:

    from sklearn.ensemble import RandomForestClassifier

    rfc = RandomForestClassifier(n_estimators=200)
    rfc.fit(X_train, y_train)

Out\[595\]:

    RandomForestClassifier(n_estimators=200)

In \[596\]:

    rfc_train = rfc.predict(X_train)
    from sklearn import metrics

    print("Accuracy_Score =", format(metrics.accuracy_score(y_train, rfc_train)))

    Accuracy_Score = 1.0

In \[597\]:

    from sklearn import metrics

    predictions = rfc.predict(X_test)
    print("Accuracy_Score =", format(metrics.accuracy_score(y_test, predictions)))

    Accuracy_Score = 0.796875

In \[598\]:

    from sklearn.metrics import classification_report, confusion_matrix

    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test,predictions))

    [[117  13]
     [ 26  36]]
                  precision    recall  f1-score   support

               0       0.82      0.90      0.86       130
               1       0.73      0.58      0.65        62

        accuracy                           0.80       192
       macro avg       0.78      0.74      0.75       192
    weighted avg       0.79      0.80      0.79       192

In \[599\]:

    from sklearn.tree import DecisionTreeClassifier

    dtree = DecisionTreeClassifier()
    dtree.fit(X_train, y_train)

Out\[599\]:

    DecisionTreeClassifier()

In \[600\]:

    from sklearn import metrics

    predictions = dtree.predict(X_test)
    print("Accuracy Score =", format(metrics.accuracy_score(y_test,predictions)))

    Accuracy Score = 0.7239583333333334

In \[601\]:

    from sklearn.metrics import classification_report, confusion_matrix

    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test,predictions))

    [[100  30]
     [ 23  39]]
                  precision    recall  f1-score   support

               0       0.81      0.77      0.79       130
               1       0.57      0.63      0.60        62

        accuracy                           0.72       192
       macro avg       0.69      0.70      0.69       192
    weighted avg       0.73      0.72      0.73       192

In \[602\]:

    from xgboost import XGBClassifier

    xgb_model = XGBClassifier(gamma=0)
    xgb_model.fit(X_train, y_train)

    [20:51:08] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.0/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

Out\[602\]:

    XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                  colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
                  gamma=0, gpu_id=-1, importance_type=None,
                  interaction_constraints='', learning_rate=0.300000012,
                  max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
                  monotone_constraints='()', n_estimators=100, n_jobs=8,
                  num_parallel_tree=1, predictor='auto', random_state=0,
                  reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
                  tree_method='exact', validate_parameters=1, verbosity=None)

In \[603\]:

    from sklearn import metrics

    xgb_pred = xgb_model.predict(X_test)
    print("Accuracy Score =", format(metrics.accuracy_score(y_test, xgb_pred)))

    Accuracy Score = 0.7604166666666666

In \[604\]:

    from sklearn.metrics import classification_report, confusion_matrix

    print(confusion_matrix(y_test, xgb_pred))
    print(classification_report(y_test,xgb_pred))

    [[107  23]
     [ 23  39]]
                  precision    recall  f1-score   support

               0       0.82      0.82      0.82       130
               1       0.63      0.63      0.63        62

        accuracy                           0.76       192
       macro avg       0.73      0.73      0.73       192
    weighted avg       0.76      0.76      0.76       192

In \[605\]:

    from sklearn.svm import SVC

    svc_model = SVC()
    svc_model.fit(X_train, y_train)

Out\[605\]:

    SVC()

In \[606\]:

    svc_pred = svc_model.predict(X_test)

In \[607\]:

    from sklearn import metrics

    print("Accuracy Score =", format(metrics.accuracy_score(y_test, svc_pred)))

    Accuracy Score = 0.7708333333333334

In \[608\]:

    from sklearn.metrics import classification_report, confusion_matrix

    print(confusion_matrix(y_test, svc_pred))
    print(classification_report(y_test,svc_pred))

    [[119  11]
     [ 33  29]]
                  precision    recall  f1-score   support

               0       0.78      0.92      0.84       130
               1       0.72      0.47      0.57        62

        accuracy                           0.77       192
       macro avg       0.75      0.69      0.71       192
    weighted avg       0.76      0.77      0.76       192

In \[609\]:

    rfc.feature_importances_

Out\[609\]:

    array([0.0782693 , 0.25448658, 0.09015382, 0.07056105, 0.07602288,
           0.15848283, 0.12676865, 0.14525489])

In \[610\]:

    (pd.Series(rfc.feature_importances_, index=X.columns).plot(kind='barh'))

Out\[610\]:

    <AxesSubplot:>

![](attachment:vertopal_5fd938210cd741f1ab47a680760db8b1/e397c0e5b4da12062a3f9faa2a37bfbe45b9bd06.png)

In \[611\]:

    import pickle

    # Firstly we will be using the dump() function to save the model using pickle
    saved_model = pickle.dumps(rfc)

    # Then we will be loading that saved model
    rfc_from_pickle = pickle.loads(saved_model)

    # lastly, after loading that model we will use this to make predictions
    rfc_from_pickle.predict(X_test)

Out\[611\]:

    array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
           0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
           1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
           1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
           0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
           0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0], dtype=int64)

In \[612\]:

    diabetes_df.head()

Out\[612\]:

|     | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI  | DiabetesPedigreeFunction | Age | Outcome |
|-----|-------------|---------|---------------|---------------|---------|------|--------------------------|-----|---------|
| 0   | 6           | 148     | 72            | 35            | 0       | 33.6 | 0.627                    | 50  | 1       |
| 1   | 1           | 85      | 66            | 29            | 0       | 26.6 | 0.351                    | 31  | 0       |
| 2   | 8           | 183     | 64            | 0             | 0       | 23.3 | 0.672                    | 32  | 1       |
| 3   | 1           | 89      | 66            | 23            | 94      | 28.1 | 0.167                    | 21  | 0       |
| 4   | 0           | 137     | 40            | 35            | 168     | 43.1 | 2.288                    | 33  | 1       |

In \[613\]:

    diabetes_df.tail()

Out\[613\]:

|     | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI  | DiabetesPedigreeFunction | Age | Outcome |
|-----|-------------|---------|---------------|---------------|---------|------|--------------------------|-----|---------|
| 763 | 10          | 101     | 76            | 48            | 180     | 32.9 | 0.171                    | 63  | 0       |
| 764 | 2           | 122     | 70            | 27            | 0       | 36.8 | 0.340                    | 27  | 0       |
| 765 | 5           | 121     | 72            | 23            | 112     | 26.2 | 0.245                    | 30  | 0       |
| 766 | 1           | 126     | 60            | 0             | 0       | 30.1 | 0.349                    | 47  | 1       |
| 767 | 1           | 93      | 70            | 31            | 0       | 30.4 | 0.315                    | 23  | 0       |

In \[614\]:

    rfc.predict([[0,137,40,35,168,43.1,2.228,33]]) #4th patient

Out\[614\]:

    array([1], dtype=int64)

In \[615\]:

    rfc.predict([[10,101,76,48,180,32.9,0.171,63]])  # 763 th patient

Out\[615\]:

    array([0], dtype=int64)