### Importing the dependent libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import pickle

### Data Collection and Analysis

In [2]:
# PIMA Diabetes Dataset


In [3]:
# Load the diabetes dataset from diabetes.csv into a pandas DataFrame

diabetes_df=pd.read_csv("diabetes.csv")

In [4]:
# Preview the first 5 rows of the dataset (head() shows 5 rows by default)

diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# (number of rows, number of columns) of the DataFrame

diabetes_df.shape

(768, 9)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column

diabetes_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# Inspect the rows where Glucose is 0; this cell only displays them.
# The following cells replace these zeros with the column mean.

diabetes_df[diabetes_df['Glucose']==0]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
75,1,0,48,20,0,24.7,0.14,22,0
182,1,0,74,20,23,27.7,0.299,21,0
342,1,0,68,35,0,32.0,0.389,22,0
349,5,0,80,32,0,41.0,0.346,37,1
502,6,0,68,41,0,39.0,0.727,41,1


In [8]:
# Mark Glucose readings of 0 as missing (NaN).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that form is a chained
# in-place update, which pandas warns about and which leaves diabetes_df
# unchanged when Copy-on-Write is active. np.nan (rather than pd.NA) keeps the
# column numeric (float64) instead of turning it into an object column.

diabetes_df['Glucose'] = diabetes_df['Glucose'].replace(0, np.nan)

In [9]:
# Mean of the Glucose column only; Series.mean() skips missing values by default.
# NOTE(review): this mean is computed over all rows, before train_test_split is
# called later in the notebook, so rows that end up in the test split also
# influence the imputed value. Consider imputing after the split.

mean_values = diabetes_df['Glucose'].mean()

In [10]:
# Fill the missing Glucose values with the Glucose mean held in mean_values.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that form is a chained
# in-place update, which pandas warns about and which leaves diabetes_df
# unchanged when Copy-on-Write is active.

diabetes_df['Glucose'] = diabetes_df['Glucose'].fillna(mean_values)

In [11]:
# Count the rows where BloodPressure is 0 (the first number of the shape);
# the following cells replace these zeros with the column mean.

diabetes_df[diabetes_df['BloodPressure']==0].shape

(35, 9)

In [12]:
# Mark BloodPressure readings of 0 as missing (NaN).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that form is a chained
# in-place update, which pandas warns about and which leaves diabetes_df
# unchanged when Copy-on-Write is active. np.nan (rather than pd.NA) keeps the
# column numeric (float64) instead of turning it into an object column.

diabetes_df['BloodPressure'] = diabetes_df['BloodPressure'].replace(0, np.nan)

In [13]:
# Mean of the BloodPressure column only; Series.mean() skips missing values by default.
# NOTE(review): computed over all rows, before train_test_split is called later
# in the notebook, so test-split rows also influence the imputed value.

mean_values = diabetes_df['BloodPressure'].mean()

In [14]:
# Fill the missing BloodPressure values with the BloodPressure mean held in mean_values.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that form is a chained
# in-place update, which pandas warns about and which leaves diabetes_df
# unchanged when Copy-on-Write is active.

diabetes_df['BloodPressure'] = diabetes_df['BloodPressure'].fillna(mean_values)

In [15]:
# Count the rows where BMI is 0 (the first number of the shape);
# the following cells replace these zeros with the column mean.

diabetes_df[diabetes_df['BMI']==0].shape

(11, 9)

In [16]:
# Mark BMI values of 0 as missing (NaN).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that form is a chained
# in-place update, which pandas warns about and which leaves diabetes_df
# unchanged when Copy-on-Write is active. np.nan (rather than pd.NA) keeps the
# column a plain float64 column.

diabetes_df['BMI'] = diabetes_df['BMI'].replace(0, np.nan)

In [17]:
# Mean of the BMI column only; Series.mean() skips missing values by default.
# NOTE(review): computed over all rows, before train_test_split is called later
# in the notebook, so test-split rows also influence the imputed value.

mean_values = diabetes_df['BMI'].mean()

In [18]:
# Fill the missing BMI values with the BMI mean held in mean_values.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that form is a chained
# in-place update, which pandas warns about and which leaves diabetes_df
# unchanged when Copy-on-Write is active.

diabetes_df['BMI'] = diabetes_df['BMI'].fillna(mean_values)

In [19]:
# Count the records per Outcome value to see how many of the 768 records are diabetic

diabetes_df['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [20]:
# 0 ---> Non Diabetic
# 1 ---> Diabetic

In [21]:
# Mean of every feature per Outcome group, to compare the two classes

diabetes_df.groupby("Outcome").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,110.710121,70.935397,19.664,68.792,30.888434,0.429734,31.19
1,4.865672,142.165573,75.147324,22.164179,100.335821,35.384757,0.5505,37.067164


In [22]:
# Split the frame into the label vector (the Outcome column) and the
# feature matrix (every column except Outcome)

Y = diabetes_df['Outcome']
X = diabetes_df.drop(columns='Outcome')

In [23]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35,0,33.6,0.627,50
1,1,85.0,66.0,29,0,26.6,0.351,31
2,8,183.0,64.0,0,0,23.3,0.672,32
3,1,89.0,66.0,23,94,28.1,0.167,21
4,0,137.0,40.0,35,168,43.1,2.288,33


In [24]:
# Preview the first rows of the label vector
Y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

### Splitting the Data

In [25]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class proportions
# the same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [26]:
# Shapes of the full, training and test feature sets

print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


### Standardization

In [27]:
# Scaler used to standardize the features
scaler = StandardScaler()

In [28]:
# Fit the scaler on the training split only, and scale that split
X_train_scaled = scaler.fit_transform(X_train)

In [29]:
# Persist the fitted scaler to disk.
# The context manager closes the file as soon as the dump finishes; the bare
# open(...) call used previously never closed its file handle.
scaler_filename = 'scaler_object.pkl'
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [30]:
# Scale the test split with the scaler already fitted on the training split
# (transform only, no refit)
X_test_scaled = scaler.transform(X_test)

### Training the Model

In [31]:
# Support vector classifier with a linear kernel
classifier = svm.SVC(kernel='linear')

In [32]:
# Train the support vector machine classifier on the scaled training data
classifier.fit(X_train_scaled, Y_train)

### Model Evaluation

In [33]:
# Accuracy score on the training data

X_train_prediction = classifier.predict(X_train_scaled)
# accuracy_score takes (y_true, y_pred): true labels first, predictions second
training_data_accuracy = accuracy_score(Y_train,X_train_prediction)

In [34]:
# Report the accuracy measured on the training split
print("Acuracy score of the training data : ",training_data_accuracy)

Acuracy score of the training data :  0.7817589576547231


In [35]:
# Accuracy score on the test data

X_test_prediction = classifier.predict(X_test_scaled)
# accuracy_score takes (y_true, y_pred): true labels first, predictions second
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [36]:
# Report the accuracy measured on the held-out test split
print("Acuracy score of the testing data : ",testing_data_accuracy)

Acuracy score of the testing data :  0.7792207792207793


### Making a Predictive System

In [37]:
# Column names of the dataset (the features plus the Outcome label)
diabetes_df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [38]:
# Column dtypes and non-null counts of the DataFrame after the zero-value replacement
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(4), int64(5)
memory usage: 54.1 KB


In [39]:
# Taking input data from user

In [40]:
# Column_name_and_datatype=[('Pregnancies',int),('Glucose',int),('BloodPressure',int)
#                           ,('SkinThickness',int),('Insulin',int),('BMI',float),
#                           ('DiabetesPedigreeFunction',float),('Age',int)]
# input_data=[]

# for column, datatype in Column_name_and_datatype:
    
#     while True:
#         data=input(f"What is the value of {column}? ")
#         try:
#             data=datatype(data)
#             input_data.append(data)
#             break
#         except ValueError:
#             print(f"You have entered wrong value it should be {datatype.__name__}!")


In [41]:
# print(input_data)

In [42]:
# changing the input_data to numpyarray
#input_data=(9,119,80,35,0,29,0.263,29)#-->suppose yes
#input_data=(11,143,94,33,146,36.6,0.254,51)#--> suppose yes
#input_data=(4,134,72,0,0,23.8,0.277,60)#--> #suppose yes
#input_data=(7,160,54,32,175,30.5,0.588,39)# yes
#input_data=(4,123,62,0,0,32,0.226,35)# yes



#input_data=(2,107,74,30,100,33.6,0.404,23)#-->suppose no
#input_data=(4,151,90,38,0,29.7,0.294,36)#--> suppose no
#input_data=(4,141,74,0,0,27.6,0.244,40)#--> #suppose no
#input_data=(9,106,52,0,0,31.2,0.38,42)# no




#input_data_as_numpy_array = np.asarray(input_data)

# Reshape the array as we are predicting for one instance

#input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# Standardize the input data

#std_data = scaler.transform(input_data_reshaped)

#prediction = classifier.predict(std_data)
#print(prediction)

# if prediction[0]==0:
#     print("This person is not diabetic.")
# else:
#     print("This person is diabetic.")

### Saving the Trained Model

In [43]:
import pickle

In [44]:
# Persist the trained classifier to disk.
# The context manager closes the file as soon as the dump finishes; the bare
# open(...) call used previously never closed its file handle.
filename = 'diabetes_model1.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [45]:
# Print every column name on its own line
for column_name in diabetes_df.columns:
    print(column_name)

Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age
Outcome


In [46]:
# Feature values for a single patient; they must be in the same order as the columns of X
input_data = (11,143,94,33,146,36.6,0.254,51)

print(type(input_data))

# Convert the tuple to a NumPy array and reshape it to one row,
# because we are predicting for a single instance
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The scaler was fitted on a DataFrame (X_train), so it recorded the column names.
# Passing a bare array makes scikit-learn warn that X has no valid feature names;
# wrapping the row in a DataFrame with the training columns avoids the warning
# and leaves the values unchanged.
input_data_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

# Standardize the input with the scaler fitted on the training split
std_data = scaler.transform(input_data_frame)

prediction = classifier.predict(std_data)
print(prediction)

# 0 ---> Non Diabetic, 1 ---> Diabetic
if prediction[0]==0:
    print("This person is not diabetic.")
else:
    print("This person is diabetic.")

<class 'tuple'>
[1]
This person is diabetic.


