In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import SMOTE

In [2]:
# Load the BRFSS 2015 binary diabetes health-indicator table
# (the path is relative to the notebook's working directory).
x = pd.read_csv(
    'diabetes_binary_health_indicators_BRFSS2015.csv',
)

In [3]:
# Separate the label from the predictors: `pop` removes the column from `x`
# in place and hands it back as a Series, leaving only feature columns in `x`.
y = x.pop('Diabetes_binary')

In [4]:
# Feature standardisation: build the (not yet fitted) scaler object here;
# it is fitted on the feature matrix in a later cell.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [5]:
# Class balance of the target before any resampling is applied.
class_counts_before = Counter(y)
print(class_counts_before)

Counter({0.0: 218334, 1.0: 35346})


In [6]:
# Balance the two classes by synthesising minority-class rows with SMOTE.
# A fixed seed makes the synthetic rows (and every model fitted on them)
# reproducible across kernel restarts.
# NOTE(review): resampling runs before the train/hold-out split, so synthetic
# points interpolated from hold-out rows can end up in the training data —
# consider resampling the training split only.
oversample = SMOTE(random_state=42)
x, y = oversample.fit_resample(x, y)

In [7]:
# Class balance of the target after oversampling.
class_counts_after = Counter(y)
print(class_counts_after)

Counter({0.0: 218334, 1.0: 218334})


In [8]:
# Learn the per-feature scaling statistics from the feature matrix, then
# replace `x` with its standardised version.
scaler.fit(x)
x = scaler.transform(x)

In [9]:
import pickle

# Persist the fitted scaler so the same transformation can be reapplied later.
# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare open() passed straight to dump() leaks the handle.
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as xdiv/ydiv; the fixed seed keeps the split stable.
xtrain, xdiv, ytrain, ydiv = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=5,
)

## Model Training

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

# Random forest: 250 trees capped at depth 10, trained on all cores with a
# fixed seed; out-of-bag scoring is switched on.
rf1 = RandomForestClassifier(
    n_estimators=250,
    max_depth=10,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)
rf1.fit(xtrain, ytrain)

# Accuracy on the hold-out split.
rf1_pred = rf1.predict(xdiv)
print(accuracy_score(ydiv, rf1_pred))

0.8665697208418256


In [19]:
# Persist the trained random forest. Using a context manager ensures the file
# is flushed and closed; passing a bare open() to dump() leaks the handle.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(rf1, model_file)

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV

# Exhaustive hyperparameter search for a random forest, scored by 4-fold
# cross-validated accuracy.
rf3 = RandomForestClassifier(random_state=42, n_jobs=-1)
params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}
grid_search = GridSearchCV(estimator=rf3,
                           param_grid=params,
                           cv = 4, n_jobs=-1, verbose=1, scoring="accuracy")

# Search on the training split only, so the hold-out rows (xdiv/ydiv) stay
# unseen and can still give an unbiased score for the selected model.
grid_search.fit(xtrain, ytrain)

# A bare expression in the middle of a cell is discarded, so print the
# cross-validated score (and the winning parameters) explicitly.
print(grid_search.best_score_, grid_search.best_params_)

# Best estimator, refitted by GridSearchCV on the data passed to fit();
# left as the last expression so the notebook displays it.
rf_best = grid_search.best_estimator_
rf_best

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours baseline using the Minkowski metric.
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski')
classifier.fit(xtrain, ytrain)

# Score on the hold-out split. The split cell names it xdiv/ydiv; the
# previously referenced xtest/ytest are never defined and raised NameError.
knnpred=classifier.predict(xdiv)
print(accuracy_score(ydiv,knnpred))

In [14]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

In [15]:
# AdaBoost baseline (79 estimators, learning rate 0.1), scored on the
# hold-out split.
abc = AdaBoostClassifier(
    n_estimators=79,
    learning_rate=0.1,
)
abc.fit(xtrain, ytrain)

pred_abc = abc.predict(xdiv)
print(accuracy_score(ydiv, pred_abc))

0.7993450431676095


In [16]:
# Hyperparameter search for AdaBoost over ensemble size and learning rate,
# scored by accuracy under repeated stratified 10-fold CV (3 repeats).
# NOTE(review): 30 parameter combinations x 30 folds = 900 fits; the recorded
# run of this cell was interrupted, so expect a long runtime.
model = AdaBoostClassifier()
grid = {
    'n_estimators': [10, 50, 70,100,150],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0],
}
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy')

# Search on the training split only, so the hold-out rows (xdiv/ydiv) are not
# used for model selection and remain valid for the final evaluation.
grid_result = grid_search.fit(xtrain, ytrain)
print(grid_result.best_score_, grid_result.best_params_)

KeyboardInterrupt: 

In [17]:
# Load the three-level-target variant of the dataset and show its column names.
data = pd.read_csv(
    'diabetes_012_health_indicators_BRFSS2015.csv',
)
data.columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [18]:
# Distribution of the three-level target column.
target_012 = data['Diabetes_012']
print(Counter(target_012))

Counter({0.0: 213703, 2.0: 35346, 1.0: 4631})


In [20]:
# Point `data` at the binary-target file again; this replaces the
# three-level frame loaded earlier under the same name.
data = pd.read_csv(
    'diabetes_binary_health_indicators_BRFSS2015.csv',
)

In [21]:
# Print the frame's structure: row index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,...,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.139333,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.094186,0.756544,0.634256,...,0.951053,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875
std,0.346294,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.292087,0.429169,0.481639,...,0.215759,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148
min,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0
50%,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0
75%,0.0,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0
max,1.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0
