In [107]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
import xgboost as xgb
from loguru import logger
import mlflow

## Multi Function ML End to End Project

In [87]:
def load_df(file_path):
    try:        
        df = pd.read_csv(file_path)
        logger.info(df.head(2))
        logger.info('Dataframe read!')
        return df
    
    except Exception as e:
        print(e)    
        
def detect_anomalies(data, threshold=3.5):

    mean = np.mean(data)
    std = np.std(data)
    z_scores = (data - mean) / std
    return np.abs(z_scores) > threshold


def clean_df(df):
    
    # remove duplicate
    logger.info(f'Number of duplicates before drop duplicate : {df.duplicated().shape[0]}')
    df.drop_duplicates().reset_index(inplace=True, drop=True)
    logger.info(f'Number of duplicates after drop duplicate : {df.duplicated().shape[0]}')
    
    # change the types
    df[['gender', 'smoking_history']] = df[['gender', 'smoking_history']].astype('category')
    df['age'] = df['age'].astype('int')
    
    # remove NaN
    logger.info(f'Number of NaN before remove: {df.isna().sum()}')
    df.dropna().reset_index(inplace=True, drop=True)
    logger.info(f'Number of NaN after remove: {df.isna().sum()}')
    
    # remove outlier
    mask_bmi = detect_anomalies(df['bmi'], threshold=3.5)
    df = df[~mask_bmi]

    mask_glucose_level = detect_anomalies(df['blood_glucose_level'], threshold=3.5)
    df = df[~mask_glucose_level]
    logger.info('outliers in bmi and blood glucose level is removed!')    
    
    return df
      
    
def preprocess_df(df, feature_selection, target_var):
    """
    Split dataframe into X and y, and train and test consecutively. Then impute and scale both train and test features.
    Returns the train and test sets.
    """
    X = df[feature_selection]
    y = df[target_var]
    
    num_features = list(X.select_dtypes(include='number').columns)
    cat_features = list(X.select_dtypes(exclude='number').columns)

    preprocessor = ColumnTransformer([
        ('cat', OneHotEncoder(), cat_features),
        ('num', StandardScaler(), num_features)
    ])

    # Split X, y


    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

    # Preprocess the features for both train and test sets
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test, y_train, y_test


def model_training(model, x_train, y_train):
                  
    return model.fit(X_train, y_train)
    

def predict_and_evaluate(model, x_test, y_test, return_data=True):
    
    y_pred = model.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    model_name = type(model).__name__
    
    metric_df = pd.DataFrame([{
            'model_name': model_name,
            'f1' : f1,
            'precision': precision,
            'recall': recall,
            'accuracy': accuracy    
        }])
    
    if return_data: 
        
        return metric_df.style.background_gradient(cmap='coolwarm')
    
    else:
        
        return metric_df.drop(columns=['model_name']).to_dict(orient='records')[0]
    
def train_classification_model(params_dict, metrics_dict, model, experiment_name='Default'):

    # Start an MLflow run
    mlflow.set_experiment(experiment_name)
    with mlflow.start_run():

        # Log metrics and parameters with MLflow
        mlflow.log_params(params_dict)
        mlflow.log_metrics(metrics_dict)

        # Save the model as an artifact
        mlflow.sklearn.log_model(model, type(model).__name__)
                

In [108]:
df = load_df('diabetes_prediction_dataset.csv')
df= clean_df(df)
X_train, X_test, y_train, y_test = preprocess_df(df, df.drop(columns=['diabetes']).columns, 'diabetes')
trained_model = model_training(KNeighborsClassifier(), X_train, y_train)
metrics = predict_and_evaluate(trained_model, X_test, y_test, return_data=False)
params = trained_model.get_params()
train_classification_model(params, metrics, trained_model, experiment_name='Default')

2023-08-01 16:25:38.608 | INFO     | __main__:load_df:4 -    gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2023-08-01 16:25:38.615 | INFO     | __main__:load_df:5 - Dataframe read!
2023-08-01 16:25:38.681 | INFO     | __main__:clean_df:15 - Number of duplicates before drop duplicate : 100000
2023-08-01 16:25:38.792 | INFO     | __main__:clean_df:17 - Number of duplicates after drop duplicate : 100000
2023-08-01 16:25:38.841 | INFO     | __main__:clean_df:24 - Number of NaN before remove: gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    

In [111]:
!mlflow ui

[2023-08-01 16:27:29 -0700] [16968] [INFO] Starting gunicorn 20.1.0
[2023-08-01 16:27:29 -0700] [16968] [INFO] Listening at: http://127.0.0.1:5000 (16968)
[2023-08-01 16:27:29 -0700] [16968] [INFO] Using worker: sync
[2023-08-01 16:27:29 -0700] [16969] [INFO] Booting worker with pid: 16969
[2023-08-01 16:27:29 -0700] [16970] [INFO] Booting worker with pid: 16970
[2023-08-01 16:27:29 -0700] [16971] [INFO] Booting worker with pid: 16971
[2023-08-01 16:27:29 -0700] [16972] [INFO] Booting worker with pid: 16972
^C
[2023-08-01 16:29:18 -0700] [16968] [INFO] Handling signal: int
[2023-08-01 16:29:18 -0700] [16970] [INFO] Worker exiting (pid: 16970)
[2023-08-01 16:29:18 -0700] [16969] [INFO] Worker exiting (pid: 16969)
[2023-08-01 16:29:18 -0700] [16971] [INFO] Worker exiting (pid: 16971)
[2023-08-01 16:29:18 -0700] [16972] [INFO] Worker exiting (pid: 16972)
