In [1]:
%load_ext ipydex.displaytools
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    balanced_accuracy_score,
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [9]:
# Read the diabetes dataset from its relative path and preview the first rows.
csv_path = '../../Datasets/diabetes_data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Data exploration courtesy of Solafa Jobi
# at https://www.kaggle.com/code/solafajobi/diabetes-perfect-prediction

# Select the variables that are medically likely to predict diabetes.
# .copy() gives `dm` its own data: it is modified further down (column drops,
# rescaling), and doing that on a plain slice of `df` raises
# SettingWithCopyWarning.
candidate_cols = ["Age","Sex","HighChol","BMI","Smoker","PhysActivity","PhysHlth",
                  "Fruits","Veggies","HvyAlcoholConsump","Stroke","HighBP","Diabetes"]
dm = df[candidate_cols].copy()
# display() is needed because a bare expression in the middle of a cell is
# evaluated but never rendered.
display(dm.head())

# Count the distinct values of every selected column.
unique_values = {col: dm[col].nunique() for col in dm.columns}
display(pd.DataFrame(unique_values, index=['unique value count']).transpose())
# Check the frequency of the values in each column.
# Every column of `dm` (the Diabetes target included) gets its own histogram;
# nothing is filtered out here.
feature_cols = list(dm.columns)
plt.figure(figsize=(25,35))
# One subplot per column on an 8x5 grid.
for position, col in enumerate(feature_cols, start=1):
    plt.subplot(8,5,position)
    plt.title(col)
    plt.xticks(rotation=90)
    plt.hist(dm[col],color = "deepskyblue")
plt.tight_layout()
# Drop the columns with very small categories (HvyAlcoholConsump and Stroke).
dm = dm.drop(columns=['HvyAlcoholConsump','Stroke'])

# Correlation of every remaining column with the Diabetes column.
dm.drop(columns='Diabetes').corrwith(dm['Diabetes']).plot(
    kind='bar', grid=True, figsize=(10, 6),
    title="Correlation with Diabetes", color="deepskyblue")
# Variables with a correlation below 0.1 are Sex, Smoker, Fruits, Veggies.

# Correlation between any two columns, to check for possible covariates.
sns.set(rc = {'figure.figsize':(10,10)})
# Open a new figure first: heatmap draws on the current axes, which would
# otherwise still be the bar chart created above.
plt.figure(figsize=(10,10))
sns.heatmap(dm.corr(),vmin=-1, vmax=1, annot = True, fmt='.1g',cmap= 'coolwarm')

# Drop the variables with low correlations: Sex, Smoker, Fruits, Veggies.
dm = dm.drop(columns=['Sex','Smoker','Fruits','Veggies'])
# Narrowed down to 6 possible determinants; now see which predictors are
# more useful.

# Bivariate bar plots for the categorical variables.
non_categorical = ['Age','BMI','PhysHlth','Diabetes']
features = [col for col in dm.columns if col not in non_categorical]
plt.figure(figsize = (30,23))
plt.suptitle('Diabetes by categorical features')
total_rows = dm.shape[0]
for position, feature in enumerate(features, start=1):
    plt.subplot(2,4, position)
    ax = sns.countplot(data=dm, x=feature, hue='Diabetes', palette = ['deepskyblue','crimson'])
    # Label each bar with its share of all rows, in percent.
    for bar in ax.patches:
        share = (bar.get_height()/total_rows)*100
        ax.annotate('{:.1f}'.format(share)+'%',(bar.get_x()+0.25, bar.get_height()+0.01))

# Distributions of the numeric variables, one panel per Diabetes class.
# displot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only leave an empty figure behind).
sns.displot(x='BMI', col='Diabetes' , data = dm, kind="kde" ,color = 'deepskyblue')
sns.displot(data=dm,col='Diabetes',x='Age',color='deepskyblue')
# Check skewness; this can only be done for the numeric columns.
dm_skew = dm[['Age','BMI','PhysHlth']]
skew = dm_skew.skew().to_frame('skew')
skew['too_skewed'] = skew['skew'] > .75
# display() is needed because a bare expression in the middle of a cell is
# evaluated but never rendered.
display(skew)
# BMI and PhysHlth are skewed and need to be transformed.

# Scale the numeric columns with MinMaxScaler for the feature selection.
# Each column is fitted and transformed on its own, in this order.
mms = MinMaxScaler()
for col in ['BMI', 'Age', 'PhysHlth']:
    dm[[col]] = mms.fit_transform(dm[[col]])
display(dm.head())


# Feature selection, method 1: importances from a tree ensemble.
# Step 1 - define the target y and the feature matrix X.
y = dm['Diabetes'].astype(int)
X = dm.drop(columns='Diabetes')  # everything except "Diabetes"

# Step 2 - fit extra trees. The fixed seed makes the importances reproducible
# from run to run; 42 matches the seed used by the other models in this notebook.
model = ExtraTreesClassifier(random_state=42)
model.fit(X,y)
print(model.feature_importances_)  # built-in attribute of tree-based classifiers

# Plot the feature importances for easier comparison.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
plt.figure(figsize=(8,6))
feat_importances.nlargest(6).plot(kind='barh')
plt.show()

# Method 2: univariate chi-squared scores.

# SelectKBest is configured to keep the 5 best features.
# Do this before the quantile transformation.
bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X,y)
# One row per feature: its name ('Specs') next to its chi-squared score.
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(6,'Score'))  # up to 6 rows, highest score first

# Method 3: exhaustive search over feature subsets.

# Logistic regression is the estimator that scores each candidate subset.
lr = LogisticRegression()

# Search settings: subsets of 1 to 5 features, scored by accuracy with cv=4.
efs_settings = dict(
    estimator=lr,
    min_features=1,
    max_features=5,
    scoring='accuracy',
    cv=4,
)
efs = EFS(**efs_settings)

# Run the search on the dataset (fit returns the fitted selector).
efs = efs.fit(X, y)

# Report the best score, the winning column indices and their names.
print('Best accuracy score: %.2f' % efs.best_score_)
print('Best subset (indices):', efs.best_idx_)
print('Best subset (corresponding names):', efs.best_feature_names_)
def skew_report(frame, cols=('Age','BMI','PhysHlth'), threshold=.75):
    """Return a table with the skew of `cols` and a flag for strong skew.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing the columns listed in `cols`.
    cols : sequence of str
        Numeric columns to measure.
    threshold : float
        A column is flagged when its skew is greater than this value.

    Returns
    -------
    pandas.DataFrame
        One row per column, with the columns 'skew' and 'too_skewed'.
    """
    report = frame[list(cols)].skew().to_frame('skew')
    report['too_skewed'] = report['skew'] > threshold
    return report


# Recheck the skew before the quantile transformation.
# display() is needed because a bare expression in the middle of a cell is
# evaluated but never rendered.
display(skew_report(dm))

# Apply a quantile transformation to BMI and PhysHlth.
# random_state fixes the random subsampling the transformer may apply to
# large inputs, so the transformed values are reproducible.
qt = QuantileTransformer(n_quantiles=500, output_distribution='normal', random_state=42)
dm[['BMI']] = qt.fit_transform(dm[['BMI']])
dm[['PhysHlth']] = qt.fit_transform(dm[['PhysHlth']])

# Recheck the skew after the transformation.
skew = skew_report(dm)
display(skew)
dm.head()

In [None]:
# Model building courtesy of Solafa Jobi
# at https://www.kaggle.com/code/solafajobi/diabetes-perfect-prediction

# Data splitting

y = dm['Diabetes'].astype(int)
# The feature matrix must not contain the target: leaving "Diabetes" in X
# hands every model the label it is supposed to predict.
X = dm.drop(columns='Diabetes')  # everything except "Diabetes"
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Show both shapes; only the last expression of a cell is rendered.
X_train.shape, X_test.shape

# Predict with K Nearest Neighbors, Decision Tree and Random Forest

In [None]:
# Parameter grid for the KNN search.
param_grid = {'n_neighbors': [1,3,5,7,9,11,13,15,17,19],  # odd numbers because there are 2 classes in the target column
              'weights': ['distance', 'uniform']}
gridKNN = GridSearchCV(KNeighborsClassifier(), param_grid, refit = True, verbose = 3)

# Fit the grid search and report the winning parameters.
gridKNN.fit(X_train, y_train)
print(gridKNN.best_params_)

# Predict with the best parameters (refit=True retrains on all of X_train).
y_pred_test = gridKNN.predict(X_test)
y_pred_train = gridKNN.predict(X_train)

# Compare train and test accuracy to check for overfitting.
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))

# Confusion matrix on the test set.
cm = confusion_matrix(y_test, y_pred_test, labels=gridKNN.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=gridKNN.classes_)
# Draw on an explicit axes; calling disp.plot() without one creates a second
# figure and leaves the first one empty.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues)
ax.grid(False)  # switch the cell gridlines off explicitly instead of toggling them
plt.show()

# Model metrics

def train_evaluate_model(y_test, predictions=None):
    """Collect classification metrics for one set of test predictions.

    Despite its name, this function does not fit a model; it only scores
    predictions that already exist.

    Parameters
    ----------
    y_test : array-like
        True labels of the test set.
    predictions : array-like, optional
        Predicted labels for the same rows. When omitted, the notebook-level
        variable `y_pred_test` is used, i.e. the predictions of whichever
        model cell ran last.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'accuracy', 'f1_score', 'precision',
        'recall', 'balanced_accuracy' and 'auc'.

    Notes
    -----
    'auc' is computed from `predictions` as given. With hard class labels
    (the output of `.predict`) it is not a threshold-free ROC AUC.
    """
    if predictions is None:
        # Backward-compatible default: read the global set by the model cells.
        predictions = y_pred_test

    # Compute the metrics for evaluation.
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    balanced_accuracy = balanced_accuracy_score(y_test, predictions)
    auc = roc_auc_score(y_test, predictions)

    # Put the results in a one-row dataframe for display.
    eval_df = pd.DataFrame(
        [[accuracy, f1, precision, recall, balanced_accuracy, auc]],
        columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy', 'auc'])
    return eval_df

# Metrics of the tuned KNN model: first row of the comparison table.

results = train_evaluate_model(y_test)
results.index = ['K Nearest Neighbors - Method 1']
# Red-to-green colormap (low values red, high values green), the same
# direction used for the other models' result tables.
results.style.background_gradient(cmap = sns.color_palette("blend:red,green", as_cmap=True))


In [None]:
# Fit one unconstrained tree (once) to learn the bounds of the search:
# its depth and its number of features.
dt = DecisionTreeClassifier(random_state=42)
dt = dt.fit(X_train, y_train)

# Parameter grid: every second depth up to the unconstrained depth, and
# every possible feature count.
param_grid = {'max_depth':range(1, dt.tree_.max_depth+1, 2),
              'max_features': range(1, len(dt.feature_importances_)+1)}
gridDT = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, n_jobs=-1)

# Fit the grid search and report the winning parameters.
gridDT.fit(X_train, y_train)
print(gridDT.best_params_)

# Compare train and test accuracy to check for overfitting.
y_pred_test = gridDT.predict(X_test)
y_pred_train = gridDT.predict(X_train)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))

# Confusion matrix on the test set.
cm = confusion_matrix(y_test, y_pred_test, labels=gridDT.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=gridDT.classes_)
# Draw on an explicit axes; calling disp.plot() without one creates a second
# figure and leaves the first one empty.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues)
ax.grid(False)  # switch the cell gridlines off explicitly instead of toggling them
plt.show()

# Add the decision-tree metrics to the comparison table.
resultsDT = train_evaluate_model(y_test)
resultsDT.index = ['Decision Trees - Method 2']
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead.
results = pd.concat([results, resultsDT])
results.style.background_gradient(cmap = sns.color_palette("blend:red,green", as_cmap=True))

In [None]:
# Base random forest handed to the grid search.
# NOTE(review): warm_start=True probably has no effect here, because the grid
# search fits a fresh copy of the estimator for each candidate - confirm.
RF = RandomForestClassifier(oob_score=True,
                            random_state=42,
                            warm_start=True,
                            n_jobs=-1)

# Parameter grid: only the number of trees is tuned.
param_grid = {'n_estimators':[15, 20, 30, 40, 50, 100, 150, 200, 300, 400]
              }
gridRF = GridSearchCV(RF, param_grid)

# Fit the grid search and report the winning parameters.
gridRF.fit(X_train, y_train)
print(gridRF.best_params_)

# Compare train and test accuracy to check for overfitting.
y_pred_test = gridRF.predict(X_test)
y_pred_train = gridRF.predict(X_train)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))

# Confusion matrix on the test set.
cm = confusion_matrix(y_test, y_pred_test, labels=gridRF.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=gridRF.classes_)
# Draw on an explicit axes; calling disp.plot() without one creates a second
# figure and leaves the first one empty.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap=plt.cm.Blues)
ax.grid(False)  # switch the cell gridlines off explicitly instead of toggling them
plt.show()

In [None]:
# Add the random-forest metrics to the comparison table.
resultsRF = train_evaluate_model(y_test)
resultsRF.index = ['Random Forest - Method 3']
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows instead.
results = pd.concat([results, resultsRF])
results.style.background_gradient(cmap = sns.color_palette("blend:red,green", as_cmap=True))