In [1]:
import pandas as pd
import numpy as np
import warnings
# Install a global "ignore" filter: with no category or message given, it matches every warning.
# NOTE(review): this also silences deprecation/convergence warnings that later cells
# may raise from pandas or scikit-learn — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Read the raw diabetes table from a path relative to the notebook's working directory.
csv_path = 'data/diabetes_small.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Per-column count of missing (NaN) entries in the raw frame.
raw_null_counts = data.isna().sum()
print(raw_null_counts)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [5]:
# Count rows whose measurement is recorded as 0 for each column.
# A boolean mask is summed instead of indexing value_counts() with the label 0,
# which raises KeyError when a column contains no zeros at all.
print('glucose=0: ', (data['Glucose'] == 0).sum())
print('insulin=0:', (data['Insulin'] == 0).sum())
print('blood pressure=0: ', (data['BloodPressure'] == 0).sum())

glucose=0:  5
insulin=0: 374
blood pressure=0:  35


Data processing

In [6]:
# Columns in which a recorded 0 is treated as a missing measurement.
zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Work on a deep copy so the raw frame `data` stays untouched.
new_data = data.copy(deep=True)
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
new_data[zero_as_missing] = new_data[zero_as_missing].replace(0, np.nan)

In [7]:
# Per-column count of NaN entries after zeros were turned into NaN.
new_data.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [8]:
# Fill the NaNs in each listed column with that column's mean (NaNs are skipped by mean()).
# The result is assigned back explicitly: `new_data[col].fillna(..., inplace=True)` is a
# chained assignment that does not update `new_data` under pandas Copy-on-Write.
# NOTE(review): the means are computed on the full dataset, before the train/test split
# further down, so held-out rows influence the imputed values.
imputed_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
new_data[imputed_cols] = new_data[imputed_cols].fillna(new_data[imputed_cols].mean())

In [9]:
# Confirm that no NaN entries remain after imputation.
new_data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Model prediction

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

from sklearn.metrics import accuracy_score


In [11]:
# Separate the feature matrix from the target column.
x = new_data.drop(columns='Outcome')
y = new_data['Outcome']

In [12]:
# Hold out 20% of the rows for evaluation; stratify on the target so both parts
# keep the same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [13]:
# Logistic regression baseline on the unscaled features.
# NOTE(review): warnings are silenced at the top of the notebook, so a solver
# convergence warning on these unscaled inputs would go unnoticed — confirm the fit converges.
model_reg = LogisticRegression().fit(X_train, Y_train)

# Score the held-out rows.
y_predict = model_reg.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_predict)
accuracy

0.6948051948051948

In [14]:
# Random forest on the unscaled features.
# The forest is randomized (bootstrap samples, feature subsets); seeding it with the same
# value used for the split makes the reported accuracy repeatable across re-runs.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, Y_train)

# Score the held-out rows.
y_predict = model_rf.predict(X_test)

accuracy = accuracy_score(Y_test, y_predict)
accuracy

0.7402597402597403

In [15]:
# Gaussian naive Bayes on the unscaled features.
model_nb = GaussianNB().fit(X_train, Y_train)

# Score the held-out rows.
y_predicted = model_nb.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_predicted)
accuracy

0.6948051948051948

In [16]:
# StandardScaler is published by sklearn.preprocessing; sklearn.discriminant_analysis only
# exposes it as a side effect of its own internal import, which is not a stable import path.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply the same transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
# Support vector classifier with a polynomial kernel, trained on the standardized features.
model_svm = svm.SVC(kernel='poly').fit(X_train_scaled, Y_train)

# Score the held-out rows (scaled with the same transform as the training rows).
y_pred = model_svm.predict(X_test_scaled)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
accuracy

0.7142857142857143

In [18]:
# k-nearest-neighbours classifier (k=4) trained on the standardized features.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train_scaled, Y_train)

# Score the held-out rows (scaled with the same transform as the training rows).
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
accuracy

0.7467532467532467