In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn import svm

In [2]:
data = pd.read_csv('diabetes.csv')

In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
data.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [7]:
data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [8]:
X = data.drop(columns = 'Outcome',axis=1)
Y = data['Outcome']

In [9]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [10]:
Y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [11]:
#1->tested positive for diabetes 
#0->tested negative for diabetes 

In [12]:
scaler = StandardScaler()

In [13]:
scaler.fit(X)

StandardScaler()

In [14]:
standardized_data = scaler.transform(X)

In [15]:
standardized_data

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [16]:
X = standardized_data
Y = data['Outcome']

In [17]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.3,stratify = Y ,random_state=78)

In [18]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(768, 8)
(537, 8)
(231, 8)


LOGISTIC REGRESSION

In [19]:
model_log = LogisticRegression()

In [20]:
model_log.fit(X_train,Y_train)

LogisticRegression()

In [21]:
x_train_prediction = model_log.predict(X_train)

In [23]:
accuracy_score = metrics.accuracy_score(x_train_prediction,Y_train)

In [24]:
f1_score = metrics.f1_score(x_train_prediction,Y_train)

In [25]:
print(accuracy_score*100)
print(f1_score*100)

78.2122905027933
65.28189910979229


In [26]:
x_test_prediction = model_log.predict(X_test)

In [27]:
accuracy_score_test = metrics.accuracy_score(x_test_prediction,Y_test)

In [28]:
f1_score_test = metrics.f1_score(x_test_prediction,Y_test)

In [29]:
print(accuracy_score_test*100)
print(f1_score_test*100)

74.89177489177489
61.33333333333333


In [30]:
#building a prediction system
input_data = (6,148,72,35,0,33.6,0.627,50)

array = np.asarray(input_data)

array_reshape = array.reshape(1,-1) #we want to make the prediction for only one value

prediction_log = model_log.predict(array_reshape)

In [31]:
prediction_log

array([1], dtype=int64)

In [32]:
if(prediction_log[0]==[0]):
    print("The patient doesn't have diabetes")
else:
    print("The patient has diabetes")

The patient has diabetes


SUPPORT VECTOR MACHINE

In [33]:
classifier = svm.SVC(kernel='linear')

In [34]:
classifier.fit(X_train,Y_train)

SVC(kernel='linear')

In [35]:
x_train_prediction = classifier.predict(X_train)

In [37]:
training_accuracy = metrics.accuracy_score(Y_train,x_train_prediction)

In [38]:
training_f1_score = metrics.f1_score(Y_train,x_train_prediction)

In [39]:
print(training_accuracy*100)
print(training_f1_score*100)

78.02607076350093
64.67065868263472


In [41]:
classifier.fit(X_test,Y_test)

SVC(kernel='linear')

In [42]:
x_test_prediction = classifier.predict(X_test)

In [43]:
test_accuracy = metrics.accuracy_score(Y_test,x_test_prediction)

In [44]:
test_f1_score = metrics.f1_score(Y_test,x_test_prediction)

In [45]:
print(test_accuracy*100)
print(test_f1_score*100)

77.48917748917748
60.0


In [46]:
input_data = (6,148,72,35,0,33.6,0.627,50)

array = np.asarray(input_data)

array_reshape = array.reshape(1,-1) #we want to make the prediction for only one value

prediction_svm = classifier.predict(array_reshape)

In [47]:
prediction_svm

array([1], dtype=int64)

In [48]:
if(prediction_svm[0]==[0]):
    print("The patient doesn't have diabetes")
else:
    print("The patient has diabetes")

The patient has diabetes


In [56]:
print("Logistic Regression : ")
print("Accuracy Score of Logistic Regression :",accuracy_score*100,"%")
print("F1 Score of Logistic Regression :",f1_score*100,"%")
print("Support Vector Machine :")
print("Accuracy Score of SVM : " ,training_accuracy*100,"%")
print("F1 Score of SVM : " ,training_f1_score*100,"%")

Logistic Regression : 
Accuracy Score of Logistic Regression : 78.2122905027933 %
F1 Score of Logistic Regression : 65.28189910979229 %
Support Vector Machine :
Accuracy Score of SVM :  78.02607076350093 %
F1 Score of SVM :  64.67065868263472 %
