In [112]:
#importing the libraries
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score 
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
import joblib as jb

In [53]:
data=pd.read_csv('diabetes.csv') #loading the data from diabetes.csv
print(data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [55]:
#renaming the o/p column
data.rename(columns={'Outcome':'Target'},inplace=True)
print(data.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Target  
0                     0.627   50       1  
1                     0.351   31       0  
2                     0.672   32       1  
3                     0.167   21       0  
4                     2.288   33       1  


In [57]:
#describing the dataset and printing column information
print(data.describe())
print(data.info())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age      Target  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [59]:
#determining the number of the o/p values are yes or no 
data['Target'].value_counts()

Target
0    500
1    268
Name: count, dtype: int64

In [61]:
#data cleaning 
data['Target'].isnull().sum()
data['Target'].isna().sum()
data.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
763    False
764    False
765    False
766    False
767    False
Length: 768, dtype: bool

In [63]:
#As there are no null,duplicates and nan values the data set is cleaned

In [68]:
#Describing the Training and testing data 
X=data.drop('Target',axis=1)
Y=data['Target']

In [84]:
#Data standardization
scaler=StandardScaler()
Standardized_X=scaler.fit_transform(X)

In [88]:
print(Standardized_X)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Target, Length: 768, dtype: int64


In [92]:
#spliting the data set
X_train,X_test,Y_train,Y_test=train_test_split(Standardized_X,Y,test_size=0.2,random_state=42)

In [94]:
#Initializing,Training and Testing the model
model=SVC(kernel='linear')
model.fit(X_train,Y_train)
model.score(X_test,Y_test)


0.7597402597402597

In [96]:
#We got the model with 75% accuracy
Y_predict=model.predict(X_test)
accuracy=accuracy_score(Y_test,Y_predict)
print(accuracy)

0.7597402597402597


In [102]:
# Making a predictive system 
input_data=[2,197,70,45,543,30.5,0.158,53]
#Changing the data to numpy array
new_input_data=np.array(input_data)
reshaped_data=new_input_data.reshape(1,-1)
new_data=scaler.transform(reshaped_data)
print(new_data)
predicted_data=model.predict(new_data)
print(predicted_data)

[[0. 0. 0. 0. 0. 0. 0. 0.]]
[0]


In [106]:
#The model has predicted correctly 0-non diabetic 

In [114]:
#Saving the model
jb.dump(model,'diabetes_detector.pkl') 
print('Model saved successfully')

Model saved successfully
