In [1]:
import pandas as pd
import numpy as np
from sklearn import set_config

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from datetime import datetime

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

import mlflow
import mlflow.sklearn

In [2]:
# Read the raw table from a path relative to the working directory, then
# preview its first rows.
DATA_PATH = 'article.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,ID,Gender,Ever_Married,Graduated,Spending_Score,Segmentation
0,462809,Male,No,No,Low,D
1,462643,Female,Yes,Yes,Average,A
2,466315,Female,Yes,Yes,Low,B
3,461735,Male,Yes,Yes,High,B
4,462669,Female,Yes,Yes,High,A


In [3]:
class DropFeatures(BaseEstimator, TransformerMixin):
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    variables : label or list of labels, default=None
        Column labels to remove. ``None`` means "drop nothing", so a
        default-constructed transformer passes the data through unchanged.
    """

    def __init__(self, variables=None):
        # Kept exactly as passed in (scikit-learn estimator convention).
        self.variables = variables

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a new DataFrame without the configured columns.

        The input frame is never modified. A configured label that is not
        present in ``X`` surfaces as the ``KeyError`` raised by
        ``DataFrame.drop``.
        """
        if self.variables is None:
            # DataFrame.drop(None, axis=1) raises ValueError, which made the
            # constructor's own default unusable; treat it as a no-op instead.
            return X.copy()

        # drop() already returns a new object, so no explicit copy is needed.
        return X.drop(columns=self.variables)

    
class Mapper(BaseEstimator, TransformerMixin):
    """Recode the values of selected columns through one shared lookup.

    Parameters
    ----------
    variables : iterable of column labels
        Columns whose values are recoded.
    mappings : mapping passed to ``Series.map``
        Applied identically to every column in ``variables``.
    """

    def __init__(self, variables, mappings):
        self.variables = variables
        self.mappings = mappings

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column recoded."""
        recoded = X.copy()
        lookup = self.mappings
        for column in self.variables:
            # With a dict lookup, Series.map turns unmatched values into NaN.
            recoded[column] = recoded[column].map(lookup)
        return recoded
    

In [4]:
# Columns expanded into indicator columns by pd.get_dummies.
categorical_columns = ['Gender', 'Graduated', 'Ever_Married', 'Spending_Score']
# Integer code assigned to each Segmentation label.
segment_codes = {"A": 0, "B": 1, "C": 2, "D": 3}

drop_id_step = DropFeatures(variables=['ID'])
encode_segment_step = Mapper(variables=['Segmentation'], mappings=segment_codes)
one_hot_step = FunctionTransformer(
    pd.get_dummies,
    kw_args={"drop_first": True, "columns": categorical_columns},
)

# Step order: drop the ID column, recode Segmentation, then one-hot encode.
preprocess = Pipeline(
    steps=[
        ("Drop_Columns", drop_id_step),
        ("mapper_segmentation", encode_segment_step),
        ("simple_one_hot_encode", one_hot_step),
    ]
)
processed_df = preprocess.fit_transform(df)
processed_df.head()

Unnamed: 0,Segmentation,Gender_Male,Graduated_Yes,Ever_Married_Yes,Spending_Score_High,Spending_Score_Low
0,3,1,0,0,0,1
1,0,0,1,1,0,0
2,1,0,1,1,0,1
3,1,1,1,1,1,0
4,0,0,1,1,1,0


In [5]:
# Switch scikit-learn's estimator display to the diagram form, then render
# the preprocessing pipeline with it.
set_config(display='diagram')
display(preprocess)

In [6]:
# Separate the encoded Segmentation target from the feature columns.
X = processed_df.drop(['Segmentation'], axis=1)
y = processed_df['Segmentation']

# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# split, and therefore every accuracy computed on X_test, repeat exactly on
# Restart & Run All instead of changing with each execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [7]:
# Seed shared by the three estimators so repeated runs fit identical models.
MODEL_SEED = 42

pipe_dt = Pipeline([('DT', DecisionTreeClassifier(random_state=MODEL_SEED))])
pipe_rf = Pipeline([('RF', RandomForestClassifier(random_state=MODEL_SEED))])
pipe_xgb = Pipeline([('XGB', XGBClassifier(random_state=MODEL_SEED))])

param_range = [1, 2, 3, 4, 5, 6]
n_estimators = [50, 100, 150]
learning_rates = [.1, .2, .3]

# min_samples_split uses param_range[1:] because scikit-learn requires an
# integer min_samples_split to be at least 2.
dt_param_grid = [{'DT__criterion': ['gini', 'entropy'],
                  'DT__max_depth': param_range,
                  'DT__min_samples_split': param_range[1:]}]
rf_param_grid = [{'RF__max_depth': param_range,
                  'RF__min_samples_split': param_range[1:]}]
xgb_param_grid = [{'XGB__learning_rate': learning_rates,
                   'XGB__max_depth': param_range,
                   'XGB__n_estimators': n_estimators}]


def _accuracy_grid_search(pipeline, param_grid):
    """Build the 3-fold, accuracy-scored GridSearchCV used for every model."""
    return GridSearchCV(estimator=pipeline,
                        param_grid=param_grid,
                        scoring='accuracy',
                        cv=3)


dt_grid_search = _accuracy_grid_search(pipe_dt, dt_param_grid)
rf_grid_search = _accuracy_grid_search(pipe_rf, rf_param_grid)
xgb_grid_search = _accuracy_grid_search(pipe_xgb, xgb_param_grid)

# Order matters: later cells look up display names by position in this list.
grids = [dt_grid_search, rf_grid_search, xgb_grid_search]

for grid_search in grids:
    grid_search.fit(X_train, y_train)

In [8]:
def logging(model, X_test, y_test, run_name, model_name, experiment_name,
            tracking_uri="http://127.0.0.1:5000"):
    """Score a fitted search on the test set and record it as an MLflow run.

    NOTE: the function name shadows the standard-library ``logging`` module
    in this namespace; it is kept unchanged so existing calls keep working.

    Parameters
    ----------
    model : fitted search object
        Must expose ``best_params_`` and ``predict``.
    X_test, y_test : test features and true labels
        Used to compute the accuracy that is printed and logged.
    run_name : str
        Base run name; the current timestamp (``dd/mm/YYYY HH:MM:SS``) is
        appended before the run is started.
    model_name : str
        Passed to ``mlflow.sklearn.log_model`` together with ``model``.
    experiment_name : str
        MLflow experiment that receives the run.
    tracking_uri : str, default "http://127.0.0.1:5000"
        MLflow tracking server to log to.

    Returns
    -------
    None
    """
    best_param = model.best_params_
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    dt = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    run_name = run_name + " - " + dt

    print('Run_Name: ',run_name)
    print("Accuracy: ",acc)
    print("Best_Params: ", best_param)
    print("--------------------------------------------------------------")

    # The tracking URI must be set BEFORE the experiment is selected:
    # set_experiment looks the experiment up (and creates it if missing) in
    # whichever store is active at that moment, so the previous order resolved
    # the experiment against the default store on the first call.
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)
    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(best_param)
        mlflow.log_metric("Accuracy", acc)
        mlflow.sklearn.log_model(model, model_name)

In [9]:
# The experiment name carries today's date (day/month/year).
date = datetime.now().strftime("%d/%m/%Y")
experiment_name = ' - '.join(['Tracking with sklearn pipelines', date])

# Display names, keyed by each search's position in `grids`.
grid_dict = {0:'Decision Trees', 1: 'Random Forest', 2: 'XGBoost'}

for position, fitted_search in enumerate(grids):
    label = grid_dict[position]
    # The label is passed as both the run name and the model name.
    logging(fitted_search, X_test, y_test, label, label, experiment_name)
print("Logging Complete")

2023/03/29 12:46:21 INFO mlflow.tracking.fluent: Experiment with name 'Tracking with sklearn pipelines - 29/03/2023' does not exist. Creating a new experiment.


Run_Name:  Decision Trees - 29/03/2023 12:46:21
Accuracy:  0.41843728581220013
Best_Params:  {'DT__criterion': 'gini', 'DT__max_depth': 5, 'DT__min_samples_split': 2}
--------------------------------------------------------------


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Run_Name:  Random Forest - 29/03/2023 12:46:49
Accuracy:  0.4132967786154901
Best_Params:  {'RF__max_depth': 3, 'RF__min_samples_split': 6}
--------------------------------------------------------------
Run_Name:  XGBoost - 29/03/2023 12:47:28
Accuracy:  0.41124057573680606
Best_Params:  {'XGB__learning_rate': 0.2, 'XGB__max_depth': 2, 'XGB__n_estimators': 100}
--------------------------------------------------------------
Logging Complete


In [11]:
# Query the experiment that the logging cell just wrote to. Reusing
# `experiment_name` (which embeds the run date) keeps this cell correct on
# any day; a hard-coded date would point at a stale or missing experiment.
mlflow.search_runs(experiment_names=[experiment_name])

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.Accuracy,params.XGB__max_depth,params.XGB__learning_rate,params.XGB__n_estimators,params.RF__max_depth,params.RF__min_samples_split,params.DT__max_depth,params.DT__criterion,params.DT__min_samples_split,tags.mlflow.log-model.history,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.source.type,tags.mlflow.source.name
0,00b217236d7d46dab155425aa50e3066,805060117467065374,FINISHED,file:///C:/Users/abdurrahman/Self-Learning/seg...,2023-03-29 07:17:31.128000+00:00,2023-03-29 07:18:06.473000+00:00,0.411241,2.0,0.2,100.0,,,,,,"[{""run_id"": ""00b217236d7d46dab155425aa50e3066""...",abdurrahman,XGBoost - 29/03/2023 12:47:28,LOCAL,C:\Users\abdurrahman\Anaconda3\lib\site-packag...
1,bde8c607add646b9b05505a64488c225,805060117467065374,FINISHED,file:///C:/Users/abdurrahman/Self-Learning/seg...,2023-03-29 07:16:49.360000+00:00,2023-03-29 07:17:24.672000+00:00,0.413297,,,,3.0,6.0,,,,"[{""run_id"": ""bde8c607add646b9b05505a64488c225""...",abdurrahman,Random Forest - 29/03/2023 12:46:49,LOCAL,C:\Users\abdurrahman\Anaconda3\lib\site-packag...
2,686234c1afea42ddbdf88c3880769e78,805060117467065374,FINISHED,file:///C:/Users/abdurrahman/Self-Learning/seg...,2023-03-29 07:16:21.496000+00:00,2023-03-29 07:16:49.159000+00:00,0.418437,,,,,,5.0,gini,2.0,"[{""run_id"": ""686234c1afea42ddbdf88c3880769e78""...",abdurrahman,Decision Trees - 29/03/2023 12:46:21,LOCAL,C:\Users\abdurrahman\Anaconda3\lib\site-packag...
