### Part III: Machine Learning Model Training

In [3]:
# import libraries
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report


In [6]:
# load diabetes_data_clean.csv, then separate the dependent variable (target)
# from the independent variables (features)
df = pd.read_csv("diabetes_data_clean.csv")

# target: the 'class' column; features: every remaining column
y = df['class']
x = df.drop(columns=['class'])
y


0      1
1      1
2      1
3      1
4      1
      ..
515    1
516    1
517    1
518    0
519    0
Name: class, Length: 520, dtype: int64

In [13]:
# split data into train (80%) and test (20%) sets; stratify=y keeps the class
# ratio the same in both parts, and the fixed random_state makes the split
# reproducible across kernel restarts
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,  # arbitrary fixed seed
)

In [14]:
# begin our model training
# start with a DummyClassifier to establish a baseline: with strategy="prior"
# it ignores the features and always predicts the most frequent class in y_train
# (the strategy is spelled out because the library default has changed between
# scikit-learn releases)
dummy = DummyClassifier(strategy="prior")
dummy.fit(x_train, y_train)
dummy_pred = dummy.predict(x_test)

In [15]:
# assess the DummyClassifier baseline (rows = actual class, columns = predicted class)
confusion_matrix(y_true=y_test, y_pred=dummy_pred)

array([[ 0, 40],
       [ 0, 64]], dtype=int64)

            prediction
             0     1
Actual   0   TN   FP
         1   FN   TP
         
         TN = true negative  --> does not have diabetes, predicted as not having diabetes
         TP = true positive  --> has diabetes, predicted as having diabetes
         FN = false negative --> has diabetes, predicted as not having diabetes
         FP = false positive --> does not have diabetes, predicted as having diabetes

In [16]:
# use a classification report (per-class precision, recall and F1)
# the baseline never predicts class 0, so that class's precision is undefined;
# zero_division=0 reports it as 0.00 without emitting a warning
print(classification_report(y_test, dummy_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.62      1.00      0.76        64

    accuracy                           0.62       104
   macro avg       0.31      0.50      0.38       104
weighted avg       0.38      0.62      0.47       104



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# next model: logistic regression, with a generous iteration cap for the solver
logr = LogisticRegression(max_iter=10000).fit(x_train, y_train)
logr_pred = logr.predict(x_test)

In [19]:
# confusion matrix for the logistic regression predictions on the test set
confusion_matrix(y_true=y_test, y_pred=logr_pred)

array([[38,  2],
       [ 5, 59]], dtype=int64)

In [21]:
# per-class precision, recall and F1 for the logistic regression predictions
print(classification_report(y_true=y_test, y_pred=logr_pred))

              precision    recall  f1-score   support

           0       0.88      0.95      0.92        40
           1       0.97      0.92      0.94        64

    accuracy                           0.93       104
   macro avg       0.93      0.94      0.93       104
weighted avg       0.94      0.93      0.93       104



In [22]:
# try a decision tree; random_state is fixed so the fitted tree (and therefore
# its scores) come out the same on every run
tree = DecisionTreeClassifier(random_state=42)  # arbitrary fixed seed
tree.fit(x_train, y_train)
tree_pred = tree.predict(x_test)

In [23]:
# confusion matrix for the decision tree predictions on the test set
confusion_matrix(y_true=y_test, y_pred=tree_pred)

array([[40,  0],
       [ 1, 63]], dtype=int64)

In [25]:
# per-class precision, recall and F1 for the decision tree predictions
print(classification_report(y_true=y_test, y_pred=tree_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        40
           1       1.00      0.98      0.99        64

    accuracy                           0.99       104
   macro avg       0.99      0.99      0.99       104
weighted avg       0.99      0.99      0.99       104



In [27]:
# try a random forest; random_state is fixed so the ensemble, its scores and
# its feature importances come out the same on every run
forest = RandomForestClassifier(random_state=42)  # arbitrary fixed seed
forest.fit(x_train, y_train)
forest_pred = forest.predict(x_test)

In [28]:
# confusion matrix for the random forest predictions on the test set
confusion_matrix(y_true=y_test, y_pred=forest_pred)

array([[40,  0],
       [ 0, 64]], dtype=int64)

In [29]:
# per-class precision, recall and F1 for the random forest predictions
print(classification_report(y_true=y_test, y_pred=forest_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       1.00      1.00      1.00        64

    accuracy                           1.00       104
   macro avg       1.00      1.00      1.00       104
weighted avg       1.00      1.00      1.00       104



In [30]:
# importance score the fitted random forest assigns to each input feature
# (one entry per column of x)
forest.feature_importances_

array([0.10191357, 0.09394726, 0.18453047, 0.23537194, 0.0616476 ,
       0.02390142, 0.02665853, 0.0216749 , 0.02806167, 0.032605  ,
       0.03573726, 0.03285456, 0.04456377, 0.02084537, 0.03545436,
       0.02023233])

In [31]:
# feature (column) names of x, to match against the importance scores
x.columns

Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [35]:
# pair each feature name with its random forest importance score and rank
# the features from most to least important
(
    pd.DataFrame(
        {
            'feature': x.columns,
            'importance': forest.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)

Unnamed: 0,feature,importance
3,polydipsia,0.235372
2,polyuria,0.18453
0,age,0.101914
1,ismale,0.093947
4,sudden weight loss,0.061648
12,partial paresis,0.044564
10,irritability,0.035737
14,alopecia,0.035454
11,delayed healing,0.032855
9,itching,0.032605


Summary
1. Trained a baseline model (DummyClassifier).
2. Trained three different models: logistic regression, decision tree, and random forest.
3. Identified the most important features in the best-performing model (random forest).