# Data Cleaning

In [61]:
import mlflow
import numpy as np
import pandas as pd

In [4]:
data = pd.read_csv("../data/diabetes_prediction_dataset.csv")

In [9]:
data.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


In [6]:
data.isnull().sum().sum()


0

In [8]:
data.dtypes

gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
diabetes                 int64
dtype: object

## One hot Encode categorical variables

In [13]:
data_encoded = pd.get_dummies(data, columns=['gender', 'smoking_history'], drop_first=True)

In [14]:
data_encoded.head(10)

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,False,False,False,False,False,True,False
1,54.0,0,0,27.32,6.6,80,0,False,False,False,False,False,False,False
2,28.0,0,0,27.32,5.7,158,0,True,False,False,False,False,True,False
3,36.0,0,0,23.45,5.0,155,0,False,False,True,False,False,False,False
4,76.0,1,1,20.14,4.8,155,0,True,False,True,False,False,False,False
5,20.0,0,0,27.32,6.6,85,0,False,False,False,False,False,True,False
6,44.0,0,0,19.31,6.5,200,1,False,False,False,False,False,True,False
7,79.0,0,0,23.86,5.7,85,0,False,False,False,False,False,False,False
8,42.0,0,0,33.64,4.8,145,0,True,False,False,False,False,True,False
9,32.0,0,0,27.32,5.0,100,0,False,False,False,False,False,True,False


In [21]:
from sklearn.model_selection import train_test_split

In [18]:
X = data_encoded.drop('diabetes', axis=1)
y = data_encoded['diabetes']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=39)

In [63]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('demo-experiment')

2024/09/02 11:36:50 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/09/02 11:36:50 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/Users/bwhom/Desktop/Labs/labs/mlruns/1', creation_time=1725302210897, experiment_id='1', last_update_time=1725302210897, lifecycle_stage='active', name='demo-experiment', tags={}>

In [64]:
X_train_file = "saved_data/X_train.csv"
X_test_file = "saved_data/X_test.csv"
y_train_file = "saved_data/y_train.csv"
y_test_file = "saved_data/y_test.csv"

In [65]:
pd.DataFrame(X_train).to_csv(X_train_file, index=False)
pd.DataFrame(X_test).to_csv(X_test_file, index=False)
pd.DataFrame(y_train).to_csv(y_train_file, index=False)
pd.DataFrame(y_test).to_csv(y_test_file, index=False)

## Logistic Regression

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [73]:
with mlflow.start_run():

    mlflow.log_artifact(X_train_file)
    mlflow.log_artifact(X_test_file)
    mlflow.log_artifact(y_train_file)
    mlflow.log_artifact(y_test_file)

    mlflow.set_tags({"Model":"Logistic Regression", "Train Data": "All Features"})

    model = LogisticRegression(max_iter=1000, random_state=39)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
mlflow.end_run()




## Decision Trees

In [24]:
from sklearn.tree import DecisionTreeClassifier

In [34]:
max_depths = [None, 10, 20, 30]
min_samples_splits = [2, 5, 10]
min_samples_leaves = [1, 2, 4]


In [74]:
for max_depth in max_depths:
    for min_samples_split in min_samples_splits:
        for min_samples_leaf in min_samples_leaves:
            with mlflow.start_run():

                mlflow.set_tags({"Model":"Decision Tree", "Train Data": "All Features"})

                dt = DecisionTreeClassifier(max_depth=max_depth,
                                            min_samples_split=min_samples_split,
                                            min_samples_leaf=min_samples_leaf,
                                            random_state=39)

                dt.fit(X_train, y_train)

                y_pred = dt.predict(X_test)

                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)

                mlflow.log_params({
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf
                })

                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric('precision', precision)
                mlflow.log_metric('recall', recall)

                mlflow.log_artifact(X_train_file)
                mlflow.log_artifact(X_test_file)
                mlflow.log_artifact(y_train_file)
                mlflow.log_artifact(y_test_file)

        mlflow.end_run()


## Random Forest 

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [70]:

n_estimators = [50, 100, 150]
max_depths = [10, 20]
min_samples_splits = [2, 5, 10]

In [75]:
for n_estimator in n_estimators:
    for max_depth in max_depths:
        for min_samples_split in min_samples_splits:
            with mlflow.start_run():

                mlflow.set_tags({"Model":"Random Forest", "Train Data": "All Features"})

                rf = RandomForestClassifier(n_estimators=n_estimator,
                                            max_depth=max_depth,
                                            min_samples_split=min_samples_split,
                                            random_state=39)
                rf.fit(X_train, y_train)

                y_pred = rf.predict(X_test)

                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)

                mlflow.log_params({
                    'n_estimators': n_estimator,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split
                })

                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric('precision', precision)
                mlflow.log_metric('recall', recall)

                mlflow.log_artifact(X_train_file)
                mlflow.log_artifact(X_test_file)
                mlflow.log_artifact(y_train_file)
                mlflow.log_artifact(y_test_file)

        mlflow.end_run()


# Feature Selection

In [41]:
model = RandomForestClassifier()
model.fit(X, y)

# Get feature importances
importances = model.feature_importances_

# Get the indices of features sorted by importance
indices = np.argsort(importances)[::-1]

In [43]:
indices

array([ 4,  5,  3,  0,  1,  2,  6, 11, 10,  8, 12,  9,  7])

In [45]:
top = indices[:6]

In [55]:
X_train_subset = X_train.iloc[:, top]
X_test_subset = X_test.iloc[:, top]

In [76]:
X_train_subset_file = "saved_data/X_train_subset.csv"
X_test_subset_file = "saved_data/X_test_subset.csv"

In [78]:
pd.DataFrame(X_train_subset).to_csv(X_train_subset_file, index=False)
pd.DataFrame(X_test_subset).to_csv(X_test_subset_file, index=False)

## Logistic Regression Subsetted Data

In [79]:
with mlflow.start_run():

    mlflow.log_artifact(X_train_subset_file)
    mlflow.log_artifact(X_test_subset_file)
    mlflow.log_artifact(y_train_file)
    mlflow.log_artifact(y_test_file)

    mlflow.set_tags({"Model":"Logistic Regression", "Train Data": "Subset Features"})

    model = LogisticRegression(max_iter=1000, random_state=39)
    model.fit(X_train_subset, y_train)

    y_pred = model.predict(X_test_subset)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
mlflow.end_run()


## Decision Tree Subsetted Data

In [80]:
for max_depth in max_depths:
    for min_samples_split in min_samples_splits:
        for min_samples_leaf in min_samples_leaves:
            with mlflow.start_run():

                mlflow.set_tags({"Model":"Decision Tree", "Train Data": "Subset Features"})

                dt = DecisionTreeClassifier(max_depth=max_depth,
                                            min_samples_split=min_samples_split,
                                            min_samples_leaf=min_samples_leaf,
                                            random_state=39)

                dt.fit(X_train_subset, y_train)

                y_pred = dt.predict(X_test_subset)

                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)

                mlflow.log_params({
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf
                })

                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric('precision', precision)
                mlflow.log_metric('recall', recall)

                mlflow.log_artifact(X_train_subset_file)
                mlflow.log_artifact(X_test_subset_file)
                mlflow.log_artifact(y_train_file)
                mlflow.log_artifact(y_test_file)

        mlflow.end_run()

## Random Forest Subsetted Data

In [81]:
for n_estimator in n_estimators:
    for max_depth in max_depths:
        for min_samples_split in min_samples_splits:
            with mlflow.start_run():

                mlflow.set_tags({"Model":"Random Forest", "Train Data": "Subset Features"})

                rf = RandomForestClassifier(n_estimators=n_estimator,
                                            max_depth=max_depth,
                                            min_samples_split=min_samples_split,
                                            random_state=39)
                rf.fit(X_train_subset, y_train)

                y_pred = rf.predict(X_test_subset)

                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred)
                recall = recall_score(y_test, y_pred)

                mlflow.log_params({
                    'n_estimators': n_estimator,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split
                })

                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric('precision', precision)
                mlflow.log_metric('recall', recall)

                mlflow.log_artifact(X_train_subset_file)
                mlflow.log_artifact(X_test_subset_file)
                mlflow.log_artifact(y_train_file)
                mlflow.log_artifact(y_test_file)

        mlflow.end_run()


## Three best model run IDS

* f283c99e6ad748448e9c39303253c00d
* a303e9b961204ee59179bddcbbb893c0
* d07256f6d2e749658aa5989099c4c432

In [87]:

best_model = DecisionTreeClassifier(max_depth=20,
                            min_samples_split=4,
                            min_samples_leaf=2,
                            random_state=39)

best_model.fit(X_train_subset, y_train)

y_pred = best_model.predict(X_test_subset)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

accuracy, precision, recall

(0.96515, 0.842847075405215, 0.7161676646706587)

In [88]:
mlflow.sklearn.save_model(best_model, "best_model")

In [89]:
mlflow.register_model(f"file://best_model", "best_classification_model")

Successfully registered model 'best_classification_model'.
Created version '1' of model 'best_classification_model'.


<ModelVersion: aliases=[], creation_timestamp=1725307767017, current_stage='None', description=None, last_updated_timestamp=1725307767017, name='best_classification_model', run_id=None, run_link=None, source='file://best_model', status='READY', status_message=None, tags={}, user_id=None, version=1>