In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Read the diabetes CSV into a DataFrame and preview its first rows
DATA_PATH = 'diabetes_dataset00.csv'
dff = pd.read_csv(DATA_PATH)
dff.head()

Unnamed: 0,Target,Genetic Markers,Autoantibodies,Family History,Environmental Factors,Insulin Levels,Age,BMI,Physical Activity,Dietary Habits,...,Pulmonary Function,Cystic Fibrosis Diagnosis,Steroid Use History,Genetic Testing,Neurological Assessments,Liver Function Tests,Digestive Enzyme Levels,Urine Test,Birth Weight,Early Onset Symptoms
0,Steroid-Induced Diabetes,Positive,Negative,No,Present,40,44,38,High,Healthy,...,76,No,No,Positive,3,Normal,56,Ketones Present,2629,No
1,Neonatal Diabetes Mellitus (NDM),Positive,Negative,No,Present,13,1,17,High,Healthy,...,60,Yes,No,Negative,1,Normal,28,Glucose Present,1881,Yes
2,Prediabetic,Positive,Positive,Yes,Present,27,36,24,High,Unhealthy,...,80,Yes,No,Negative,1,Abnormal,55,Ketones Present,3622,Yes
3,Type 1 Diabetes,Negative,Positive,No,Present,8,7,16,Low,Unhealthy,...,89,Yes,No,Positive,2,Abnormal,60,Ketones Present,3542,No
4,Wolfram Syndrome,Negative,Negative,Yes,Present,17,10,17,High,Healthy,...,41,No,No,Positive,1,Normal,24,Protein Present,1770,No


In [3]:
# Report the frame's dimensions, then each column's dtype and non-null count
dataset_shape = dff.shape
print(dataset_shape)

dff.info()

(70000, 34)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 34 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Target                         70000 non-null  object
 1   Genetic Markers                70000 non-null  object
 2   Autoantibodies                 70000 non-null  object
 3   Family History                 70000 non-null  object
 4   Environmental Factors          70000 non-null  object
 5   Insulin Levels                 70000 non-null  int64 
 6   Age                            70000 non-null  int64 
 7   BMI                            70000 non-null  int64 
 8   Physical Activity              70000 non-null  object
 9   Dietary Habits                 70000 non-null  object
 10  Blood Pressure                 70000 non-null  int64 
 11  Cholesterol Levels             70000 non-null  int64 
 12  Waist Circumference            70000 non-null  i

In [4]:
# Separate target and feature
# `drop` returns a new, independent DataFrame, so later column assignments on
# `features_dff` do not write into a slice of `dff` (no SettingWithCopyWarning),
# and the selection no longer relies on 'Target' being the first column.
features_dff = dff.drop(columns='Target')
target_dff = dff['Target']

In [5]:
# Import label encoder and standard scaler
from sklearn.preprocessing import LabelEncoder, StandardScaler

encoder = LabelEncoder()
scaler = StandardScaler()

## Work on an independent copy so the column assignments below never write
## into a slice of `dff` (avoids SettingWithCopyWarning)
features_dff = features_dff.copy()

## Select categorical dtypes columns
cat_dff = features_dff.select_dtypes(include='object')
cat_cols = list(cat_dff.columns)

## Encode the categorical columns as integer codes
## (the same encoder object is refit for every column)
for col in cat_cols:
    features_dff[col] = encoder.fit_transform(features_dff[col])

## Select numeric columns (this now includes the freshly encoded ones)
num_dff = features_dff.select_dtypes(include='number')
num_cols = list(num_dff.columns)

## Scale the numeric dtypes in a single call: StandardScaler standardises each
## column independently, so this matches scaling the columns one at a time
## while leaving `scaler` fitted on all of them.
## NOTE(review): the scaler is fit on the full dataset, before the train/test
## split made later in the notebook, so test-set statistics leak into the
## training features - consider fitting it on the training split only.
if num_cols:
    features_dff[num_cols] = pd.DataFrame(
        scaler.fit_transform(features_dff[num_cols]),
        columns=num_cols,
        index=features_dff.index,
    )

features_dff.head()

Unnamed: 0,Genetic Markers,Autoantibodies,Family History,Environmental Factors,Insulin Levels,Age,BMI,Physical Activity,Dietary Habits,Blood Pressure,...,Pulmonary Function,Cystic Fibrosis Diagnosis,Steroid Use History,Genetic Testing,Neurological Assessments,Liver Function Tests,Digestive Enzyme Levels,Urine Test,Birth Weight,Early Onset Symptoms
0,0.997118,-0.998344,-1.004812,1.002517,1.705261,0.569277,2.197644,-1.22848,-0.999429,0.634773,...,0.479322,-0.99615,-0.995951,0.99104,1.758207,0.999457,0.494018,-0.451586,-0.655702,-0.998316
1,0.997118,-0.998344,-1.004812,1.002517,-0.798037,-1.474156,-1.294096,-1.22848,-0.999429,-1.922277,...,-0.857855,1.003865,-0.995951,-1.009041,-1.182325,0.999457,-0.949955,-1.345512,-1.703567,1.001687
2,0.997118,1.001659,0.995211,1.002517,0.499969,0.189103,-0.130183,-1.22848,1.000572,0.484358,...,0.813616,1.003865,-0.995951,-1.009041,-1.182325,-1.000543,0.442447,-0.451586,0.735381,1.001687
3,-1.00289,1.001659,-1.004812,1.002517,-1.26161,-1.189026,-1.460369,-0.003535,1.000572,-0.568545,...,1.565777,1.003865,-0.995951,0.99104,0.287941,-1.000543,0.700299,-0.451586,0.62331,-0.998316
4,-1.00289,-0.998344,0.995211,1.002517,-0.427178,-1.046461,-1.294096,-1.22848,-0.999429,-0.41813,...,-2.445751,-0.99615,-0.995951,0.99104,-1.182325,0.999457,-1.156237,1.336266,-1.859066,-0.998316


In [6]:
# Build the model inputs and hold out a stratified test set
from sklearn.model_selection import GridSearchCV, train_test_split

## Fixed seed so the split is repeatable
SEED = 42

## Convert the feature frame and the target series to NumPy arrays
X = features_dff.to_numpy(copy=True)
y = target_dff.to_numpy(copy=True)

## Keep 30% of the rows for testing, stratified on the target labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=SEED,
)

In [7]:
# Import model
from sklearn.ensemble import RandomForestClassifier

## Seed the forest: its bootstrap samples and per-split feature sub-sampling
## are random, so without a seed the reported metrics change on every run
rf = RandomForestClassifier(n_estimators=300, random_state=SEED)

## Fitting the model
rf.fit(X_train, y_train)

In [8]:
## Predict the test set
# Class predictions from the fitted random forest `rf` for the held-out X_test
y_predict = rf.predict(X_test)

In [9]:
## Measure model quality
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy of the predictions, plus the per-class text report
accuracy, report = (
    accuracy_score(y_test, y_predict),
    classification_report(y_test, y_predict),
)

print('The model accuracy is', accuracy)
print(report)

The model accuracy is 0.9046666666666666
                                            precision    recall  f1-score   support

   Cystic Fibrosis-Related Diabetes (CFRD)       0.99      0.91      0.95      1639
                      Gestational Diabetes       0.89      0.93      0.91      1603
                                      LADA       0.96      0.92      0.94      1567
                                      MODY       0.96      0.84      0.90      1666
          Neonatal Diabetes Mellitus (NDM)       1.00      1.00      1.00      1622
                               Prediabetic       0.97      1.00      0.98      1613
                        Secondary Diabetes       0.81      0.79      0.80      1644
                  Steroid-Induced Diabetes       0.83      0.83      0.83      1583
                           Type 1 Diabetes       0.85      1.00      0.92      1634
                           Type 2 Diabetes       0.92      0.70      0.80      1619
Type 3c Diabetes (Pancreatogenic D

In [10]:
## Try other classifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

svc = SVC()
knn = KNeighborsClassifier()
## The tree randomly permutes candidate features at each split, so seed it
## with the notebook-wide SEED to make its results repeatable
dt = DecisionTreeClassifier(random_state=SEED)

In [11]:
## Make a dictionary of model
model_dict = {'svc':svc, 'knn':knn, 'decisiontree':dt}

## Test accuracy per model name, kept so the classifiers can still be
## compared after the loop (`accuracy` itself is overwritten each iteration)
model_accuracy = {}

for name, model in model_dict.items():
    # Fit on the training split, then evaluate on the held-out test split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Calculate the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    model_accuracy[name] = accuracy
    print('The accuracy of model '+name+' is', accuracy)
    print(report)

The accuracy of model svc is 0.7827142857142857
                                            precision    recall  f1-score   support

   Cystic Fibrosis-Related Diabetes (CFRD)       0.84      0.78      0.81      1639
                      Gestational Diabetes       0.71      0.73      0.72      1603
                                      LADA       0.84      0.80      0.82      1567
                                      MODY       0.78      0.78      0.78      1666
          Neonatal Diabetes Mellitus (NDM)       0.99      1.00      1.00      1622
                               Prediabetic       0.81      0.83      0.82      1613
                        Secondary Diabetes       0.68      0.61      0.64      1644
                  Steroid-Induced Diabetes       0.66      0.59      0.62      1583
                           Type 1 Diabetes       0.82      0.87      0.85      1634
                           Type 2 Diabetes       0.72      0.73      0.73      1619
Type 3c Diabetes (Pancreato