In [32]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report


In [27]:
df = pd.read_csv("diabetes_data_clean.csv")

df

X = df.drop("class", axis=1)
y = df['class']



In [28]:
# split data into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, stratify = y )

In [33]:
#begin our model training
#start with DummyClassifier to establish baseline
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

In [34]:
confusion_matrix(y_test, dummy_pred)

array([[ 0, 40],
       [ 0, 64]], dtype=int64)

In [31]:
#use a classification report
print(classification_report(y_test, dummy_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.62      1.00      0.76        64

    accuracy                           0.62       104
   macro avg       0.31      0.50      0.38       104
weighted avg       0.38      0.62      0.47       104



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
logr = LogisticRegression(max_iter=10000)
logr.fit(X_train, y_train)
logr_pred = logr.predict(X_test)

In [23]:
confusion_matrix(y_test, logr_pred)

array([[35,  5],
       [ 7, 57]], dtype=int64)

In [24]:
print(classification_report(y_test, logr_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.85        40
           1       0.92      0.89      0.90        64

    accuracy                           0.88       104
   macro avg       0.88      0.88      0.88       104
weighted avg       0.89      0.88      0.89       104



In [35]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)


In [36]:
confusion_matrix(y_test, tree_pred)

array([[39,  1],
       [ 2, 62]], dtype=int64)

In [37]:
print(classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96        40
           1       0.98      0.97      0.98        64

    accuracy                           0.97       104
   macro avg       0.97      0.97      0.97       104
weighted avg       0.97      0.97      0.97       104



In [38]:
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

In [39]:
confusion_matrix(y_test, forest_pred)

array([[39,  1],
       [ 0, 64]], dtype=int64)

In [41]:
print(classification_report(y_test, forest_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       0.98      1.00      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



In [43]:
forest.feature_importances_

array([0.10572268, 0.08866791, 0.25432557, 0.15827988, 0.06836319,
       0.01879451, 0.02818834, 0.02108447, 0.02396712, 0.02592524,
       0.04286024, 0.02944589, 0.06396284, 0.02172782, 0.03278603,
       0.01589828])

In [44]:
X.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [48]:
pd.DataFrame({'feature': X.columns, 'importance' : forest.feature_importances_}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
2,polyuria,0.254326
3,polydipsia,0.15828
0,age,0.105723
1,ismale,0.088668
4,sudden weight loss,0.068363
12,partial paresis,0.063963
10,irritability,0.04286
14,alopecia,0.032786
11,delayed healing,0.029446
6,polyphagia,0.028188


Summary:
1. Train a baseline model
2. Trained three different models - logistic regression, decision tree, random forest
3. Identified the important features in the best performing model