In [1]:
# import modules
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
# how many data has the outcome of 1
df.query('Outcome == 1').shape

(268, 9)

In [4]:
# how many data has the outcome of 0
df.query('Outcome == 0').shape

(500, 9)

In [5]:
df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [6]:
df.groupby('Outcome').std()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.017185,26.1412,18.063075,14.889947,98.865289,7.689855,0.299085,11.667655
1,3.741239,31.939622,21.491812,17.679711,138.689125,7.262967,0.372354,10.968254


In [7]:
# separate the data from df into X and y
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [8]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [9]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [10]:
# standardized the X data
scaler = StandardScaler()

In [11]:
scaler.fit(X)

In [12]:
X = scaler.transform(X)
X

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [13]:
# split the data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=123)

In [14]:
X_train.shape

(614, 8)

In [15]:
X_test.shape

(154, 8)

In [16]:
y_train.shape

(614,)

In [17]:
y_test.shape

(154,)

In [18]:
# training the data
model = svm.SVC(kernel='linear')
model.fit(X_train, y_train)

In [19]:
# get accuracy of the model
y_train_prediction = model.predict(X_train)
accuracy_train = accuracy_score(y_train_prediction, y_train)
print('Accuracy of the model on training data is', accuracy_train)

Accuracy of the model on training data is 0.7768729641693811


In [20]:
y_test_prediction = model.predict(X_test)
accuracy_test = accuracy_score(y_test_prediction, y_test)
print('Accuracy of the model on training data is', accuracy_test)

Accuracy of the model on training data is 0.7597402597402597


In [32]:
# make a prediction on new data
inputs = [1, 10, 30, 40, 93, 38, 0.12, 10]
inputs = np.array(inputs)
inputs = inputs.reshape(1, -1)
prediction = model.predict(inputs)

if prediction == 0:
    print("You don't have diabetes.")
else:
    print("You have diabetes.")

You have diabetes.


In [33]:
inputs = []
questions = ["How many pregnancies? ", "What is your glucose level? ", "What is your blood pressure? ", "What is your skin thickness? ", 
             "What is your insulin level? ", "What is your BMI? ", "What is your diabetes pedigree function? ", "What is your age? "]

while True:
    for question in range(len(questions)):
        user_input = input(questions[question])
        inputs.append(float(user_input))
    
    inputs = np.array(inputs)
    inputs = inputs.reshape(1, -1)
    prediction = model.predict(inputs)
    
    if prediction == 0:
        print("You don't have diabetes.")
    else:
        print("You have diabetes.")

    quit_or_not = input("\nTest again? ")
    if quit_or_not.lower() == "no":
        break

How many pregnancies?  0
What is your glucose level?  0
What is your blood pressure?  0
What is your skin thickness?  0
What is your insulin level?  0
What is your BMI?  0
What is your diabetes pedigree function?  0
What is your age?  0


You don't have diabetes.



Test again?  no
