# Import Libraries


In [1]:
!pip install mlflow category_encoders ipynbname

Collecting mlflow
  Downloading mlflow-1.20.2-py3-none-any.whl (14.6 MB)
[K     |████████████████████████████████| 14.6 MB 18.5 MB/s eta 0:00:01�███           | 9.6 MB 18.5 MB/s eta 0:00:01 MB 18.5 MB/s eta 0:00:01
[?25hCollecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 74.4 MB/s eta 0:00:01
[?25hCollecting ipynbname
  Downloading ipynbname-2021.3.2-py3-none-any.whl (4.0 kB)
Collecting alembic<=1.4.1
  Downloading alembic-1.4.1.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 89.7 MB/s eta 0:00:01
Collecting gunicorn
  Downloading gunicorn-20.1.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 59.9 MB/s eta 0:00:01
Collecting querystring-parser
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting docker>=4.0.0
  Downloading docker-5.0.2-py2.py3-none-any.whl (145 kB)
[K     |████████████████████████████████| 145 kB 120.6 

In [25]:
import os
import pandas as pd
import numpy as np
import category_encoders as ce
import joblib
from sklearn.model_selection import train_test_split
import mlflow

import subprocess
import ipynbname

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
import io




# Define the MLflow environment

In [26]:
# MLflow tracking server and experiment identifiers used throughout the notebook.
HOST = "http://mlflow:5500"

PROJECT_NAME = "CustomerChurn"
EXPERIMENT_NAME = "DecisionTreeClassifierv7"

# Artifact-store settings. setdefault() keeps any value already provided by the
# environment (e.g. injected by the platform or a secrets manager) and only
# falls back to the workshop defaults below when the variable is unset, so real
# credentials never have to be written into the notebook.
_ARTIFACT_STORE_DEFAULTS = {
    'MLFLOW_S3_ENDPOINT_URL': 'http://minio-ml-workshop:9000',
    'AWS_ACCESS_KEY_ID': 'minio',
    'AWS_SECRET_ACCESS_KEY': 'minio123',
    'AWS_REGION': 'us-east-1',
    'AWS_BUCKET_NAME': 'mlflow',
}
for _env_name, _env_default in _ARTIFACT_STORE_DEFAULTS.items():
    os.environ.setdefault(_env_name, _env_default)

# Read Data

In [27]:
# Location of the raw CSV, relative to the notebook's working directory.
RAW_DATA_PATH = '../../data/raw/data.csv'
data = pd.read_csv(RAW_DATA_PATH)


## Check for missing values

In [28]:
# Encode the target as 0/1. The identity entries (1 -> 1, 0 -> 0) make this
# cell safe to re-run: without them a second execution would map the already
# numeric labels to NaN.
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

# Single-space strings are treated as missing values.
data.replace(" ", np.nan, inplace=True)

data['TotalCharges'] = pd.to_numeric(data['TotalCharges'])

# Impute TotalCharges with its own mean. The fill is restricted to that column
# so the TotalCharges mean is never written into unrelated columns (including
# the target) that happen to contain NaN.
mean = data['TotalCharges'].mean()
data['TotalCharges'] = data['TotalCharges'].fillna(mean)

## Do simple encoding
Convert the binary variables to numeric values so that plotting is easier; we also need numeric values later to take the mean.

### Apply ordinal encoding to the features whose order has a meaning

In [29]:

# Columns passed to the ordinal encoder in this step.
names = [
    'Partner',
    'Dependents',
    'PhoneService',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
]

# Feature frame: the target and the customer identifier are left out.
data_enc = data.drop(columns=['Churn', 'customerID'])

# Fit the encoder on the feature frame, then encode that same frame.
enc = ce.ordinal.OrdinalEncoder(cols=names)
enc.fit(data_enc)
labelled_set = enc.transform(data_enc)
labelled_set.head(5)


  elif pd.api.types.is_categorical(cols):


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Male,0,1,1,1,1,No,DSL,No,No,No,No,1,1,Month-to-month,1,Electronic check,45.65,45.65
1,Male,0,2,2,4,1,Yes,Fiber optic,No,No,Yes,No,2,2,Month-to-month,2,Electronic check,101.15,385.9
2,Female,1,1,1,17,1,No,No,No internet service,No internet service,No internet service,No internet service,3,3,One year,2,Mailed check,20.65,330.6
3,Male,0,1,1,22,2,No phone service,DSL,No,Yes,Yes,No,1,2,One year,1,Bank transfer (automatic),43.75,903.6
4,Female,0,2,2,70,1,No,DSL,Yes,Yes,Yes,Yes,1,2,One year,2,Credit card (automatic),74.1,5222.3


### Apply one-hot encoding to the features whose order has no meaning

In [30]:
# Columns passed to the one-hot encoder in this step.
names = [
    'gender',
    'MultipleLines',
    'InternetService',
    'Contract',
    'PaymentMethod',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
]

# The encoder is fitted on the un-encoded feature frame ...
data_ohe = data.drop(columns=['Churn', 'customerID'])
ohe = ce.OneHotEncoder(cols=names)
ohe.fit(data_ohe)

# ... and then applied to the ordinal-encoded frame from the previous step.
final_set = ohe.transform(labelled_set)

final_set.head(5)

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,gender_1,gender_2,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines_1,MultipleLines_2,MultipleLines_3,...,Contract_1,Contract_2,Contract_3,PaperlessBilling,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3,PaymentMethod_4,MonthlyCharges,TotalCharges
0,1,0,0,1,1,1,1,1,0,0,...,1,0,0,1,1,0,0,0,45.65,45.65
1,1,0,0,2,2,4,1,0,1,0,...,1,0,0,2,1,0,0,0,101.15,385.9
2,0,1,1,1,1,17,1,1,0,0,...,0,1,0,2,0,1,0,0,20.65,330.6
3,1,0,0,1,1,22,2,0,0,1,...,0,1,0,1,0,0,1,0,43.75,903.6
4,0,1,0,2,2,70,1,1,0,0,...,0,1,0,2,0,0,0,1,74.1,5222.3


## Split data into train and test sets

In [31]:
labels = data['Churn']

# A fixed random_state makes the split (and every metric derived from it)
# reproducible across kernel restarts; stratify keeps the churn / no-churn
# ratio the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    final_set,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
print ('Training Data Shape',X_train.shape, y_train.shape)
print ('Testing Data Shape',X_test.shape, y_test.shape)

# Full (unsplit) target and feature matrix.
Y = data['Churn']
X = final_set

Training Data Shape (5634, 37) (5634,)
Testing Data Shape (1409, 37) (1409,)


# Load mlflow to track the model

In [32]:
# Configure MLflow tracking for this notebook. The calls below are order
# dependent: the tracking URI is set before the experiment is selected.

# Point the MLflow client at the tracking server given by HOST.
mlflow.set_tracking_uri(HOST)

# Select the experiment that subsequent runs are recorded under.
mlflow.set_experiment(EXPERIMENT_NAME)

# Enable scikit-learn autologging; log_input_examples=True asks MLflow to log
# input examples from the training data together with the model artifacts.
mlflow.sklearn.autolog(log_input_examples=True)

In [33]:
def fetch_logged_data(run_id):
    """Collect what the MLflow tracking client reports for one run.

    :param run_id: identifier of the run to look up
    :return: tuple ``(params, metrics, tags, artifacts)`` where ``tags`` omits
        every key starting with ``"mlflow."`` and ``artifacts`` holds the paths
        listed under the ``"model"`` artifact directory
    """
    tracking_client = mlflow.tracking.MlflowClient()
    run_data = tracking_client.get_run(run_id).data

    # Keep only the tags that are not in the "mlflow." namespace.
    user_tags = {}
    for key, value in run_data.tags.items():
        if key.startswith("mlflow."):
            continue
        user_tags[key] = value

    model_artifacts = []
    for entry in tracking_client.list_artifacts(run_id, "model"):
        model_artifacts.append(entry.path)

    return run_data.params, run_data.metrics, user_tags, model_artifacts

In [34]:
## TODO move it to a library
import subprocess
import ipynbname

def get_git_revision_hash():
    """Return the full commit hash of the current ``HEAD`` as text.

    ``subprocess.check_output`` yields raw bytes ending in a newline; the value
    is decoded and stripped so it can be used directly as a tag value.

    :return: commit hash as ``str`` without surrounding whitespace
    :raises subprocess.CalledProcessError: if the git command exits non-zero
    """
    raw = subprocess.check_output(['git', 'rev-parse', 'HEAD'])
    return raw.decode('utf-8').strip()

def get_git_revision_short_hash():
    """Return the abbreviated commit hash of the current ``HEAD`` as text.

    The raw command output (bytes with a trailing newline) is decoded and
    stripped before being returned.

    :return: short commit hash as ``str`` without surrounding whitespace
    :raises subprocess.CalledProcessError: if the git command exits non-zero
    """
    raw = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'])
    return raw.decode('utf-8').strip()

def get_git_remote():
    """Return the URL configured for the ``origin`` remote as text.

    The raw command output (bytes with a trailing newline) is decoded and
    stripped before being returned.

    :return: value of ``remote.origin.url`` as ``str``
    :raises subprocess.CalledProcessError: if the git command exits non-zero
        (for example when the setting is not defined)
    """
    raw = subprocess.check_output(['git', 'config', '--get', 'remote.origin.url'])
    return raw.decode('utf-8').strip()

def get_git_user():
    """Return the configured git ``user.name`` as text.

    The raw command output (bytes with a trailing newline) is decoded and
    stripped before being returned.

    :return: value of ``user.name`` as ``str``
    :raises subprocess.CalledProcessError: if the git command exits non-zero
        (for example when the setting is not defined)
    """
    raw = subprocess.check_output(['git', 'config', 'user.name'])
    return raw.decode('utf-8').strip()

def get_git_branch():
    """Return the name of the currently checked-out branch as text.

    The raw command output (bytes with a trailing newline) is decoded and
    stripped before being returned.

    :return: output of ``git branch --show-current`` as ``str``
    :raises subprocess.CalledProcessError: if the git command exits non-zero
    """
    raw = subprocess.check_output(['git', 'branch', '--show-current'])
    return raw.decode('utf-8').strip()

def get_pip_freeze():
    """Return the ``pip freeze`` output of the running interpreter.

    pip is invoked as ``<sys.executable> -m pip`` so the listing describes the
    environment this kernel actually runs in, rather than whichever ``pip``
    executable happens to be first on ``PATH``.

    :return: list of ``bytes``, one entry per output line
    :raises subprocess.CalledProcessError: if pip exits non-zero
    """
    import sys

    return subprocess.check_output([sys.executable, '-m', 'pip', 'freeze']).splitlines()


def record_details(mlflow):
    """
    Log environment details through the given MLflow handle.

    This method is the anchor point; more activities will go in it. For now it
    writes the lines returned by ``get_pip_freeze()`` to a file named
    ``pip_freeze.txt`` and passes that file to ``mlflow.log_artifact``.

    :param mlflow: object exposing ``log_artifact(path)`` (the ``mlflow`` module)
    :return: None
    """
    import tempfile

    # Stage the file in a throw-away directory instead of the working
    # directory: an existing ./pip_freeze.txt is never overwritten or deleted,
    # and the staged copy is removed even when log_artifact raises.
    with tempfile.TemporaryDirectory() as staging_dir:
        freeze_path = os.path.join(staging_dir, "pip_freeze.txt")
        with open(freeze_path, "wb") as file:
            for line in get_pip_freeze():
                file.write(line)
                file.write(b"\n")
        mlflow.log_artifact(freeze_path)


def mlflow_grid_search(methodtoexecute, methodarguments):
    """Execute a callable inside a new MLflow run tagged with git provenance.

    The run is started with tags describing the git commit, user, remote and
    branch, plus the ``JUPYTER_IMAGE`` environment variable (``"LOCAL"`` when
    unset). After the callable returns, ``record_details`` is invoked while the
    run is still open.

    :param methodtoexecute: callable invoked as ``methodtoexecute(**methodarguments)``
    :param methodarguments: dict of keyword arguments for ``methodtoexecute``
    :return: the run object yielded by ``mlflow.start_run``
    """
    # The remote URL is stored under two tag keys; query git for it only once.
    remote_url = get_git_remote()
    run_tags = {
        "mlflow.source.git.commit": get_git_revision_hash(),
        "mlflow.user": get_git_user(),
        "mlflow.source.git.repoURL": remote_url,
        "git_remote": remote_url,
        "mlflow.source.git.branch": get_git_branch(),
        "mlflow.docker.image.name": os.getenv("JUPYTER_IMAGE", "LOCAL"),
        "mlflow.source.type": "NOTEBOOK",
    }

    with mlflow.start_run(tags=run_tags) as run:
        methodtoexecute(**methodarguments)
        record_details(mlflow)

    return run

# Modeling part

## Define the model

In [35]:
# A fixed random_state makes tree construction deterministic, so repeated runs
# of the grid search are comparable.
model = DecisionTreeClassifier(random_state=42)

## Encoding the output

In [36]:
# Fit a label encoder on the training target, then keep its integer-coded form.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(y_train)
training_scores_encoded = lab_enc.transform(y_train)


## Define the grid search

In [37]:

# Hyper-parameter candidates explored by the grid search.
grid = dict(
    max_depth=[3, 5, 7, 10],
    criterion=['gini'],
    min_samples_leaf=[3, 5],
    min_samples_split=[5, 10],
)

grid_search = GridSearchCV(estimator=model, param_grid=grid)

In [39]:
# Fit the grid search inside a tagged MLflow run. The run/tag boilerplate lives
# in mlflow_grid_search, so it is called here instead of repeating its body;
# the lambda keeps the positional fit(X_train, y_train) call unchanged.
run = mlflow_grid_search(lambda: grid_search.fit(X_train, y_train), {})

2021/09/30 01:49:57 INFO mlflow.sklearn.utils: Logging the 5 best runs, 11 runs will be omitted.
