# Importing Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the dataset from the working directory and preview its first five rows.
df = pd.read_csv("diabetes.csv")
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Statistical summary of the variables, transposed so each variable is one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [4]:
# Print the column names, non-null counts, dtypes and memory usage of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
# Class balance check: number of rows for each value of the Outcome column.
df["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

#### Somewhat imbalanced dataset (500 vs. 268 samples)
#### 0--> Non-diabetic
#### 1--> diabetic

In [6]:
# Mean of every feature, computed separately for each Outcome class.
df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


# Features and Target

In [7]:
# Features: every column except the last one, as a NumPy array.
X = df.iloc[:, :-1].to_numpy()
# Target: the last column (Outcome), as a NumPy array.
y = df.iloc[:, -1].to_numpy()

# Scaling the data

In [8]:
from sklearn.preprocessing import MinMaxScaler

In [9]:
# Create the MinMaxScaler; it is fitted on X in the next cell and reused
# later to scale new input for the predictive system.
scaler=MinMaxScaler()

In [10]:
# Fit the scaler on X and replace X with its scaled version in one step.
# NOTE(review): this runs on the full feature matrix before the train/test
# split below, so the statistics learned by the scaler also come from rows
# that end up in X_test. Consider fitting on the training split only.
X=scaler.fit_transform(X) # both fitting and transforming the data

## Train Test Split 

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=21,
)

In [13]:
# Print the shape of each split, in the order: X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(576, 8)
(192, 8)
(576,)
(192,)


# Training the model

In [14]:
from sklearn.svm import SVC

In [15]:
# Support vector classifier with a linear kernel; all other settings are left at their defaults.
svc_classifier=SVC(kernel='linear')

In [16]:
# Train the classifier on the training split.
svc_classifier.fit(X_train,y_train)

SVC(kernel='linear')

In [17]:
# Predicted Outcome labels for the held-out test split.
y_pred=svc_classifier.predict(X_test)

# Model evaluation

In [18]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [19]:
# Accuracy of the test-set predictions against the true test labels.
accuracy_score(y_test,y_pred)

0.765625

In [20]:
# Per-class precision, recall, f1-score and support on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.76      0.91      0.83       121
           1       0.77      0.52      0.62        71

    accuracy                           0.77       192
   macro avg       0.77      0.72      0.73       192
weighted avg       0.77      0.77      0.75       192



# Checking whether the model has overfitted

In [21]:
# Accuracy on the training split, to compare with the test accuracy in the next cell.
svc_classifier.score(X_train,y_train)

0.78125

In [22]:
# Accuracy on the test split; a value far below the training accuracy would point to overfitting.
svc_classifier.score(X_test,y_test)

0.765625

The training accuracy (0.781) and the testing accuracy (0.766) are quite similar, so the model shows no sign of overfitting and the fit appears reasonable.

# Making a Predictive System

In [48]:
# Predict the outcome for one new sample.
# Values must be given in the same order as the training columns:
# Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
# DiabetesPedigreeFunction, Age
# (Named `sample_features` rather than `input` so the Python builtin is not shadowed.)
sample_features = [7, 125, 86, 0, 0, 37.6, 0.304, 51]

# The scaler and the classifier both expect a 2D array, so reshape the
# sample into a single row of shape (1, n_features).
sample_array = np.array(sample_features).reshape(1, -1)

# Scale with transform only (not fit_transform): the sample must be scaled
# with the ranges the scaler already learned, not with ranges refitted on
# this single row.
sample_scaled = scaler.transform(sample_array)

# Stored under its own name so the test-set predictions in `y_pred` are not overwritten.
sample_pred = svc_classifier.predict(sample_scaled)

if sample_pred[0] == 0:
    print("Person is not Diabetic")
else:
    print("Person is Diabetic")

Person is not Diabetic


# GridSearchCV

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
# Search space for the grid search: regularisation strength C crossed with kernel type.
parameters = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

In [25]:
# Base SVC estimator; C and kernel are supplied by the grid search.
model=SVC()

In [26]:
# Exhaustive search over `parameters`, scored by accuracy with 5-fold cross-validation.
grid = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=5,
    scoring='accuracy',
)

In [27]:
# Run the cross-validated grid search.
# NOTE(review): this uses the full scaled X, y rather than X_train, y_train,
# so the rows held out in X_test take part in hyperparameter selection.
# Confirm this is intended.
grid.fit(X,y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             scoring='accuracy')

In [28]:
# Show the best parameter combination and its mean cross-validated score,
# separated by blank lines.
print(grid.best_params_, '\n', grid.best_score_, sep='\n')

{'C': 1, 'kernel': 'poly'}


0.7799592564298446


In [29]:
# Collect the full cross-validation results in a DataFrame, one row per parameter combination.
grid_results=pd.DataFrame(grid.cv_results_)

In [30]:
# Preview the first rows of the grid-search results.
grid_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009346,0.007631,0.0,0.0,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.688312,0.707792,0.668831,0.679739,0.666667,0.682268,0.014956,12
1,0.012498,0.006249,0.0,0.0,0.1,poly,"{'C': 0.1, 'kernel': 'poly'}",0.779221,0.753247,0.779221,0.803922,0.764706,0.776063,0.017004,4
2,0.012497,0.006248,0.006249,0.007653,0.1,rbf,"{'C': 0.1, 'kernel': 'rbf'}",0.766234,0.727273,0.785714,0.777778,0.764706,0.764341,0.020076,9
3,0.018739,0.006251,0.003124,0.006248,0.1,sigmoid,"{'C': 0.1, 'kernel': 'sigmoid'}",0.616883,0.649351,0.616883,0.620915,0.647059,0.630218,0.014777,13
4,0.009373,0.007653,0.003132,0.006263,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.766234,0.75974,0.75974,0.803922,0.75817,0.769561,0.017403,7


In [31]:
# Rank the parameter combinations by mean cross-validated score, best first.
summary_cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
grid_results[summary_cols].sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
5,"{'C': 1, 'kernel': 'poly'}",0.779959,0.01097,1
9,"{'C': 10, 'kernel': 'poly'}",0.779959,0.008312,1
6,"{'C': 1, 'kernel': 'rbf'}",0.776123,0.030127,3
1,"{'C': 0.1, 'kernel': 'poly'}",0.776063,0.017004,4
10,"{'C': 10, 'kernel': 'rbf'}",0.774807,0.023589,5
12,"{'C': 100, 'kernel': 'linear'}",0.773483,0.021911,6
4,"{'C': 1, 'kernel': 'linear'}",0.769561,0.017403,7
8,"{'C': 10, 'kernel': 'linear'}",0.765665,0.022569,8
2,"{'C': 0.1, 'kernel': 'rbf'}",0.764341,0.020076,9
13,"{'C': 100, 'kernel': 'poly'}",0.761727,0.013868,10
