In [2]:
import pickle
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.simplefilter(action = 'ignore')

In [3]:
df = pd.read_csv('diabetes_prediction_dataset.csv')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
df.gender.value_counts()

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64

In [5]:
df.gender.drop(columns=['Other'], inplace=True)
df.head(2)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0


In [6]:
df.age.describe()

count    100000.000000
mean         41.885856
std          22.516840
min           0.080000
25%          24.000000
50%          43.000000
75%          60.000000
max          80.000000
Name: age, dtype: float64

In [7]:
age_limit = df.age.mean() - df.age.std()
df = df[df.age > age_limit]
df.head(1)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0


In [8]:
df.age.describe()

count    80332.000000
mean        49.781594
std         17.478524
min         20.000000
25%         35.000000
50%         49.000000
75%         63.000000
max         80.000000
Name: age, dtype: float64

In [9]:
df.hypertension.value_counts()

hypertension
0    72857
1     7475
Name: count, dtype: int64

In [10]:
df.heart_disease.value_counts()

heart_disease
0    76396
1     3936
Name: count, dtype: int64

In [11]:
df.smoking_history.value_counts()

smoking_history
never          30586
No Info        21747
former          9272
current         8997
not current     5777
ever            3953
Name: count, dtype: int64

In [12]:
category_mapping = {
    'never': 'NonSmoker',
    'No Info': 'NonSmoker',
    'former': 'Smoker',
    'current': 'Smoker',
    'not current': 'Smoker',
    'ever': 'NonSmoker'
}

In [13]:
df.smoking_history = df.smoking_history.replace(category_mapping)
df.smoking_history.value_counts()

smoking_history
NonSmoker    56286
Smoker       24046
Name: count, dtype: int64

In [14]:
df.head(2)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,NonSmoker,25.19,6.6,140,0
1,Female,54.0,0,0,NonSmoker,27.32,6.6,80,0


In [15]:
df.bmi.describe()

count    80332.000000
mean        28.711420
std          6.057618
min         10.010000
25%         25.700000
50%         27.320000
75%         30.750000
max         91.820000
Name: bmi, dtype: float64

In [16]:
df = df[df.bmi < 45]
df.bmi.describe()

count    78466.000000
mean        28.193928
std          5.032224
min         10.010000
25%         25.600000
50%         27.320000
75%         30.300000
max         44.990000
Name: bmi, dtype: float64

In [17]:
df.head(2)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,NonSmoker,25.19,6.6,140,0
1,Female,54.0,0,0,NonSmoker,27.32,6.6,80,0


In [18]:
# Normal: Below 5.7%
# Prediabetes: 5.7% to 6.4%
# Diabetes: 6.5% or higher
df.HbA1c_level.describe()

count    78466.000000
mean         5.549177
std          1.085981
min          3.500000
25%          4.800000
50%          5.800000
75%          6.200000
max          9.000000
Name: HbA1c_level, dtype: float64

In [19]:
df.blood_glucose_level.describe()

count    78466.000000
mean       139.035799
std         41.732167
min         80.000000
25%        100.000000
50%        140.000000
75%        159.000000
max        300.000000
Name: blood_glucose_level, dtype: float64

In [20]:
df = df[df.blood_glucose_level < 200]
df.shape

(69416, 9)

In [21]:
df.head(2)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,NonSmoker,25.19,6.6,140,0
1,Female,54.0,0,0,NonSmoker,27.32,6.6,80,0


In [22]:
# Doing One hot Encoding
df = pd.get_dummies(df, columns=['gender', 'smoking_history'], drop_first=True)
df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Male,gender_Other,smoking_history_Smoker
0,80.0,0,1,25.19,6.6,140,0,False,False,False
1,54.0,0,0,27.32,6.6,80,0,False,False,False
2,28.0,0,0,27.32,5.7,158,0,True,False,False
3,36.0,0,0,23.45,5.0,155,0,False,False,True
4,76.0,1,1,20.14,4.8,155,0,True,False,True


In [23]:
columns_to_standardize = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
standardizer = StandardScaler()
df[columns_to_standardize] = standardizer.fit_transform(df[columns_to_standardize])

In [24]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Male,gender_Other,smoking_history_Smoker
0,1.753296,0,1,-0.579774,1.063447,0.399355,0,False,False,False
1,0.268638,0,0,-0.151217,1.063447,-1.665987,0,False,False,False
2,-1.21602,0,0,-0.151217,0.202562,1.018957,0,True,False,False
3,-0.759202,0,0,-0.929863,-0.467015,0.91569,0,False,False,True
4,1.524887,1,1,-1.595837,-0.658323,0.91569,0,True,False,True


In [25]:
x = df.drop(columns='diabetes')
y = df.diabetes

In [26]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

In [27]:
logistic_model = LogisticRegression()
logistic_model.fit(x_train, y_train)
logistic_model.score(x_test, y_test)

0.9598818784212043

In [28]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)
rf_model.score(x_test, y_test)

0.9631950446557188

In [29]:
# Cross Validation Score Averages
Score_1 = cross_val_score(LogisticRegression(), df.drop(columns=['diabetes']), df.diabetes, cv=5)
np.average(Score_1)

0.9594041681797542

In [30]:
Score_2 = cross_val_score(RandomForestClassifier(), df.drop(columns=['diabetes']), df.diabetes, cv=5)
np.average(Score_2)

0.9637979647709436

In [31]:
lr_clf = GridSearchCV(LogisticRegression(max_iter=150000), {
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'random_state': [None, 10]
    }, cv=5, return_train_score=False
)

lr_clf.fit(df.drop(columns=['diabetes']), df.diabetes)
print(lr_clf.best_params_) , print(lr_clf.best_score_)

{'penalty': 'l2', 'random_state': None, 'solver': 'liblinear'}
0.9594473844291137


(None, None)

In [32]:
rf_clf = GridSearchCV(RandomForestClassifier(), {
        'criterion': ['gini', 'entropy'],
        'bootstrap': [True, False],
        'random_state': [None, 10, 42]
    }, cv=5, return_train_score=False
)

rf_clf.fit(df.drop(columns=['diabetes']), df.diabetes)
print(rf_clf.best_params_)
print(rf_clf.best_score_)

{'bootstrap': True, 'criterion': 'gini', 'random_state': 10}
0.9640860858971362


In [33]:
rf_model = RandomForestClassifier(bootstrap=True, criterion='gini', random_state=10)
rf_model.fit(x_train, y_train)
rf_model.score(x_test, y_test)

0.9631950446557188

In [34]:
with open('model.pickle', 'wb') as f:
    pickle.dump(rf_model, f)

In [35]:
with open('scaler.pickle', 'wb') as f:
    pickle.dump(standardizer, f)

In [37]:
columns = {
    'data_columns': [col.lower() for col in x.columns]
}
with open('columns.json', 'w') as f:
    f.write(json.dumps(columns))