# Setup 

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# data

In [4]:
df = pd.read_csv("../diabetes.csv")

In [5]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
df.shape

(768, 9)

# Data Preprocessing

In [9]:
zero_pr = df.isin([0]).mean() * 100
zero_pr

Pregnancies                 14.453125
Glucose                      0.651042
BloodPressure                4.557292
SkinThickness               29.557292
Insulin                     48.697917
BMI                          1.432292
DiabetesPedigreeFunction     0.000000
Age                          0.000000
Outcome                     65.104167
dtype: float64

In [10]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
Pregnancies,6.0,1.0,8.0,1.0,0.0,5.0,3.0,10.0,2.0,8.0,...,1.0,6.0,2.0,9.0,9.0,10.0,2.0,5.0,1.0,1.0
Glucose,148.0,85.0,183.0,89.0,137.0,116.0,78.0,115.0,197.0,125.0,...,106.0,190.0,88.0,170.0,89.0,101.0,122.0,121.0,126.0,93.0
BloodPressure,72.0,66.0,64.0,66.0,40.0,74.0,50.0,0.0,70.0,96.0,...,76.0,92.0,58.0,74.0,62.0,76.0,70.0,72.0,60.0,70.0
SkinThickness,35.0,29.0,0.0,23.0,35.0,0.0,32.0,0.0,45.0,0.0,...,0.0,0.0,26.0,31.0,0.0,48.0,27.0,23.0,0.0,31.0
Insulin,0.0,0.0,0.0,94.0,168.0,0.0,88.0,0.0,543.0,0.0,...,0.0,0.0,16.0,0.0,0.0,180.0,0.0,112.0,0.0,0.0
BMI,33.6,26.6,23.3,28.1,43.1,25.6,31.0,35.3,30.5,0.0,...,37.5,35.5,28.4,44.0,22.5,32.9,36.8,26.2,30.1,30.4
DiabetesPedigreeFunction,0.627,0.351,0.672,0.167,2.288,0.201,0.248,0.134,0.158,0.232,...,0.197,0.278,0.766,0.403,0.142,0.171,0.34,0.245,0.349,0.315
Age,50.0,31.0,32.0,21.0,33.0,30.0,26.0,29.0,53.0,54.0,...,26.0,66.0,22.0,43.0,33.0,63.0,27.0,30.0,47.0,23.0
Outcome,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## converting 0 to null values

In [11]:
nc = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
df[nc] = df[nc].replace(0, np.NaN)

In [13]:
null_pr = df.isnull().mean()*100
null_pr

Pregnancies                 14.453125
Glucose                      0.651042
BloodPressure                4.557292
SkinThickness               29.557292
Insulin                     48.697917
BMI                          1.432292
DiabetesPedigreeFunction     0.000000
Age                          0.000000
Outcome                      0.000000
dtype: float64

In [23]:
df.drop("Insulin", axis = "columns", inplace = True)

In [24]:
df.shape

(768, 8)

In [25]:
data = df

# Missing Values

In [29]:
data.Pregnancies = data.Pregnancies.fillna(data.Pregnancies.median())
data.Glucose = data.Glucose.fillna(data.Glucose.median())
data.BloodPressure = data.BloodPressure.fillna(data.BloodPressure.mean())
data.SkinThickness = data.SkinThickness.fillna(data.SkinThickness.median())
data.BMI = data.BMI.fillna(data.BMI.median())
data.DiabetesPedigreeFunction = data.DiabetesPedigreeFunction.fillna(data.DiabetesPedigreeFunction.median())
data.Age = data.Age.fillna(data.Age.median())

In [30]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# Outliers

In [34]:
data_outlier_rem = data.copy()

In [36]:
data_outlier_rem = data_outlier_rem[data_outlier_rem['Pregnancies'].values < 8]

data_outlier_rem = data_outlier_rem[data_outlier_rem['Glucose'].values < 169]
data_outlier_rem = data_outlier_rem[data_outlier_rem['Glucose'].values > 50]

data_outlier_rem = data_outlier_rem[data_outlier_rem['BloodPressure'].values > 40]
data_outlier_rem = data_outlier_rem[data_outlier_rem['BloodPressure'].values < 100]

data_outlier_rem = data_outlier_rem[data_outlier_rem['SkinThickness'].values < 44]

data_outlier_rem = data_outlier_rem[data_outlier_rem['BMI'].values < 49]

data_outlier_rem = data_outlier_rem[data_outlier_rem['DiabetesPedigreeFunction'].values < 1]

data_outlier_rem = data_outlier_rem[data_outlier_rem['Age'].values < 37.5]

In [37]:
data_outlier_rem

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age,Outcome
1,1.0,85.0,66.0,29.0,26.6,0.351,31,0
3,1.0,89.0,66.0,23.0,28.1,0.167,21,0
5,5.0,116.0,74.0,29.0,25.6,0.201,30,0
6,3.0,78.0,50.0,32.0,31.0,0.248,26,1
10,4.0,110.0,92.0,29.0,37.6,0.191,30,0
...,...,...,...,...,...,...,...,...
758,1.0,106.0,76.0,29.0,37.5,0.197,26,0
760,2.0,88.0,58.0,26.0,28.4,0.766,22,0
764,2.0,122.0,70.0,27.0,36.8,0.340,27,0
765,5.0,121.0,72.0,23.0,26.2,0.245,30,0


# Features & Target

In [43]:
x = data_outlier_rem[data_outlier_rem.columns.drop("Outcome")]
y = data_outlier_rem['Outcome']

# Scaling

In [46]:
from sklearn.preprocessing import MinMaxScaler

In [48]:
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

# Train-Test Split

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

# Balancing Data

In [51]:
from imblearn.over_sampling import RandomOverSampler

In [52]:
ros = RandomOverSampler(random_state = 0)
x_new, y_new = ros.fit_resample(x_train, y_train)
x = x_new; y = y_new

# Modeling

## 1. Random Forest

In [55]:
from sklearn.ensemble import RandomForestClassifier

In [72]:
for_clf = RandomForestClassifier(max_depth = 2)
for_clf.fit(x_train, y_train)

RandomForestClassifier(max_depth=2)

# Validation

## 1. Cross Val

In [59]:
from sklearn.model_selection import cross_val_score

In [73]:
sum(cross_val_score(for_clf, x_train, y_train, cv = 5, scoring = "accuracy"))/5

0.8171561771561772

In [74]:
sum(cross_val_score(for_clf, x_test, y_test, cv = 5, scoring = "accuracy"))/5

0.7838235294117647

## 2. SVM

In [75]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [86]:
poly_svm_clf = Pipeline([
    ("scaler", MinMaxScaler()),
    ("svm_clf", LinearSVC())
])

poly_svm_clf.fit(x_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()), ('svm_clf', LinearSVC())])

In [87]:
sum(cross_val_score(poly_svm_clf, x_train, y_train, cv = 5, scoring = "accuracy"))/5

0.8353846153846153

In [88]:
sum(cross_val_score(poly_svm_clf, x_test, y_test, cv = 5, scoring = "accuracy"))/5

0.7595588235294117

## 3. LogisticRegression

In [89]:
from sklearn.linear_model import LogisticRegression

In [90]:
log_clf = LogisticRegression()
log_clf.fit(x_train, y_train)

LogisticRegression()

In [91]:
sum(cross_val_score(log_clf, x_train, y_train, cv = 5, scoring = "accuracy"))/5

0.8353379953379954

In [92]:
sum(cross_val_score(log_clf, x_test, y_test, cv = 5, scoring = "accuracy"))/5

0.7955882352941177

## 4. Ensemble 

In [93]:
from sklearn.ensemble import VotingClassifier

In [97]:
voting_clf = VotingClassifier(
    estimators = [('lr', log_clf), ('rf', for_clf), ('svc',poly_svm_clf)],
    voting = "soft"
)

voting_clf.fit(x_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier(max_depth=2)),
                             ('svc',
                              Pipeline(steps=[('scaler', MinMaxScaler()),
                                              ('svm_clf', LinearSVC())]))],
                 voting='soft')

In [98]:
sum(cross_val_score(log_clf, x_train, y_train, cv = 5, scoring = "accuracy"))/5

0.8353379953379954

In [99]:
sum(cross_val_score(log_clf, x_test, y_test, cv = 5, scoring = "accuracy"))/5

0.7955882352941177

In [111]:
! jt -f anonymous