In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Load the diabetes classification CSV from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/easiest-diabetes-classification-dataset/Diabetes Classification.csv",
)

# Overview of the dataset

In [None]:
# Preview the first rows of the frame (head() shows five rows by default).
df.head()

In [None]:
# Print the column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics; with default arguments pandas summarises the numeric
# columns when the frame mixes numeric and non-numeric dtypes.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

# Plots pipeline

In [None]:
def plots(df, x, y, axes, i):
    """Fill row ``i`` of ``axes`` with four views of column ``x`` split by ``y``.

    Panels, left to right:
      0. histogram (with KDE) of ``x``;
      1. the same histogram coloured by ``y``;
      2. bar chart of the mean of ``x`` within each ``y`` group, with the
         bar values written on the bars;
      3. pie chart of the value counts of ``y``.

    Args:
        df: DataFrame containing both columns.
        x: Name of the column whose distribution is plotted.
        y: Name of the column used for grouping / hue.
        axes: 2-D grid of matplotlib axes with at least four columns.
        i: Row of ``axes`` to draw into.
    """
    by_category = df.groupby(y)
    row = axes[i]

    # Overall distribution first, then the same distribution split by category.
    sns.histplot(df, x=x, ax=row[0], kde=True, color='g')
    sns.histplot(df, x=x, hue=y, ax=row[1], kde=True)

    # Mean of x per category; computed once and reused for both bar axes.
    category_means = by_category[x].mean()
    bar_ax = row[2]
    sns.barplot(x=category_means.index, y=category_means, ax=bar_ax)
    for bars in bar_ax.containers:
        bar_ax.bar_label(bars, size=15, color='black')

    # Share of rows that fall in each category.
    category_counts = df[y].value_counts()
    row[3].pie(category_counts,
               labels=category_counts.index,
               autopct='%0.2f%%')

In [None]:
def plot_class(df, y, nums, figsize=(25, 10)):
    """Draw one row of ``plots()`` panels per numeric column, grouped by ``y``.

    Builds a ``len(nums) x 4`` grid of axes, fills row ``i`` via
    ``plots(df, nums[i], y, axes, i)``, adds a super-title naming the grouping
    column and shows the figure.

    Args:
        df: DataFrame holding the columns to plot.
        y: Name of the column used for grouping / hue.
        nums: Sequence of column names, one grid row each.
        figsize: Figure size passed to ``plt.subplots``; defaults to the
            original hard-coded ``(25, 10)``.
    """
    length = len(nums)
    if length == 0:
        # A zero-row grid cannot be created and there is nothing to draw.
        return

    # squeeze=False keeps ``axes`` two-dimensional even when there is a single
    # row; without it a one-column ``nums`` yields a 1-D array and the
    # ``axes[i][j]`` indexing inside plots() fails.
    _, axes = plt.subplots(nrows=length, ncols=4, figsize=figsize, squeeze=False)
    for i, column in enumerate(nums):
        plots(df, column, y, axes, i)

    plt.suptitle("Histplots, barplots and pie charts grouped by {}".format(y),
                 size=20)

    plt.tight_layout()
    plt.show()

# Barplots and histograms

In [None]:
# Columns used as grouping variables (they are label-encoded further down)
# and the numeric measurements plotted against them.
cats = [
    'Gender',
    'Blood Pressure',
    'Family History of Diabetes',
    'Smoking',
    'Diet',
    'Exercise',
    'Diagnosis',
]
nums = [
    'Age',
    'BMI',
    'FBS',
    'HbA1c',
]

# One figure per grouping column, each with one row per numeric column.
for c in cats:
    plot_class(df, c, nums)

# Pairplot

In [None]:
# Pairwise plots of the numeric columns, coloured by Diagnosis;
# corner=True draws only the lower triangle of the grid.
sns.pairplot(
    data=df,
    vars=nums,
    hue='Diagnosis',
    corner=True,
)

# Diabetes diagnosed in patients who have a poor diet

In [None]:
# Keep only the rows whose Diet is 'Poor', then plot every numeric column by Diagnosis.
plot_class(
    df.loc[df['Diet'].eq('Poor')],
    'Diagnosis',
    nums,
)

# Diabetes diagnosed in patients who have a healthy diet

In [None]:
# Keep only the rows whose Diet is 'Healthy', then plot every numeric column by Diagnosis.
plot_class(
    df.loc[df['Diet'].eq('Healthy')],
    'Diagnosis',
    nums,
)

# Diabetes diagnosed in patients who do not exercise

In [None]:
# Keep only the rows whose Exercise is 'No', then plot every numeric column by Diagnosis.
plot_class(
    df.loc[df['Exercise'].eq('No')],
    'Diagnosis',
    nums,
)

# Diabetes diagnosed in patients who exercise regularly

In [None]:
# Keep only the rows whose Exercise is 'Regular', then plot every numeric column by Diagnosis.
plot_class(
    df.loc[df['Exercise'].eq('Regular')],
    'Diagnosis',
    nums,
)

In [None]:
# Label-encode every column listed in `cats`, in place.
# Each column gets its OWN LabelEncoder: refitting one shared instance would
# leave `encoders` holding several references to a single object that only
# remembers the classes of the last column, so no earlier column could be
# decoded with inverse_transform().
encoders = []  # fitted encoders, in the same order as `cats`
for col in cats:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders.append(le)

In [None]:
# Features are every column except the last one; the last column is the target.
# NOTE(review): this assumes 'Diagnosis' is the final column of the CSV — confirm.
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

# Oversample the training split only, so the test split keeps its original rows.
# random_state makes the synthetic samples reproducible (same seed as the split);
# without it every run trains on a different resampled set.
# NOTE(review): the label-encoded categorical columns are part of `x`, so SMOTE
# interpolates them as if they were continuous; SMOTENC may fit better — confirm.
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)

# Training and evaluation pipeline

In [None]:
def evaluate(model, name, _round=2):
    """Score a fitted model on the test split and plot its confusion matrix.

    Reads the module-level ``x_test`` / ``y_test``.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Human-readable model name shown in the figure title.
        _round: Number of decimals kept in the reported accuracy percentage.
    """
    y_pred = model.predict(x_test)
    acc = round(accuracy_score(y_test, y_pred) * 100, _round)

    # scikit-learn's signature is (y_true, y_pred): rows are the true labels and
    # columns the predictions. Swapping the arguments transposes the matrix.
    cm = confusion_matrix(y_test, y_pred)

    # fmt='d' prints integer counts instead of the default '.2g' notation.
    sns.heatmap(cm, annot=True, fmt='d')
    # Title uses the readable name rather than the estimator's repr.
    plt.title("{} : {}%".format(name, acc))
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.show()


def training(model, name):
    """Fit ``model`` on the module-level training split, evaluate it, return it.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        name: Human-readable model name forwarded to ``evaluate``.

    Returns:
        The same ``model`` object after fitting.
    """
    model.fit(x_train, y_train)
    evaluate(model, name)
    return model

# Defining models and setting their hyperparameters

In [None]:
# Candidate classifiers with fixed hyperparameters.
# The estimators that use randomness internally (boosting, forest, tree) get the
# same seed as the train/test split so their scores are reproducible across runs.
abc = AdaBoostClassifier(n_estimators=100, learning_rate=0.01, random_state=42)
rfc = RandomForestClassifier(n_estimators=110, max_depth=50, random_state=42)
dtc = DecisionTreeClassifier(max_depth=50, random_state=42)
knn = KNeighborsClassifier(n_neighbors=10)
svc = SVC(C=0.5, kernel='linear')
gnb = GaussianNB()
lgr = LogisticRegression()

# Parallel lists: names[k] is the display name of models[k].
models = [abc, rfc, dtc, knn, svc, gnb, lgr]
names = ['Ada Boost', 'Random Forest', 'Decision Tree', 'KNN', 'SVC', 'Naive Bayes', 'Logistic Regression']

# Results

In [None]:
# Fit and evaluate every candidate in turn, keeping the fitted estimators.
# training() returns the fitted model; the list was previously left empty.
model_trained = []
for model, name in zip(models, names):
    model_trained.append(training(model, name))