# Import Libreies


In [36]:
!pip install mlflow category_encoders ipynbname openshift minio

You should consider upgrading via the '/opt/app-root/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [80]:
import os
import pandas as pd
import numpy as np
import category_encoders as ce
import joblib
from sklearn.model_selection import train_test_split
import mlflow

import subprocess
import ipynbname

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
import io




# Define the mlflow environments 

In [81]:
HOST = "http://mlflow:5500"

PROJECT_NAME = "CustomerChurn"
EXPERIMENT_NAME = "DecisionTreeClassifierv7"

os.environ['MLFLOW_S3_ENDPOINT_URL']='http://minio-ml-workshop:9000'
os.environ['AWS_ACCESS_KEY_ID']='minio'
os.environ['AWS_SECRET_ACCESS_KEY']='minio123'
os.environ['AWS_REGION']='us-east-1'
os.environ['AWS_BUCKET_NAME']='mlflow'

# Read Data

In [82]:
data = pd.read_csv('../../data/raw/data.csv')


## Check for missing value

In [83]:
# data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

data.replace(" ", np.nan, inplace=True)

data['TotalCharges'] = pd.to_numeric(data['TotalCharges'])

mean = data['TotalCharges'].mean()
data.fillna(mean, inplace=True)

## Do simple encoding 
Convert sting variable into numeric so plotting is easier. We need to later take mean

## Encoding the output

In [85]:

lab_enc = preprocessing.LabelEncoder()
data['Churn'] = lab_enc.fit_transform(data['Churn'])S

0       1
1       1
2       0
3       1
4       0
       ..
7038    1
7039    1
7040    0
7041    0
7042    0
Name: Churn, Length: 7043, dtype: int64

### Apply ordinal encoding for the feature which the order has a meaning

In [69]:

names = [ 'Partner', 'Dependents', 'PhoneService', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling']
# for column in names:
#     labelencoder(column)
data_enc = data
data_enc = data_enc.drop(['Churn', 'customerID'], axis=1)
final_set = data_enc
enc = ce.ordinal.OrdinalEncoder(cols=names)
enc.fit(data_enc)
labelled_set = enc.transform(data_enc)
labelled_set.head(5)
joblib.dump(enc, 'ordinalencoder.pkl')  


  elif pd.api.types.is_categorical(cols):


['ordinal_encoder.pkl']

### Apply one hot encoding for the feature which the order has no-meaning

In [70]:
names = ['gender','MultipleLines', 'InternetService', 'Contract', 'PaymentMethod', 'OnlineSecurity', 'OnlineBackup',
         'DeviceProtection', 'TechSupport']

ohe = ce.OneHotEncoder(cols=names)
data_ohe = data
data_ohe = data_ohe.drop(['Churn', 'customerID'], axis=1)
ohe.fit(data_ohe)
final_set = ohe.transform(labelled_set)

final_set.head(5)

Unnamed: 0,gender_1,gender_2,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines_1,MultipleLines_2,MultipleLines_3,...,Contract_1,Contract_2,Contract_3,PaperlessBilling,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3,PaymentMethod_4,MonthlyCharges,TotalCharges
0,1,0,0,1,1,1,1,1,0,0,...,1,0,0,1,1,0,0,0,45.65,45.65
1,1,0,0,2,2,4,1,0,1,0,...,1,0,0,2,1,0,0,0,101.15,385.9
2,0,1,1,1,1,17,1,1,0,0,...,0,1,0,2,0,1,0,0,20.65,330.6
3,1,0,0,1,1,22,2,0,0,1,...,0,1,0,1,0,0,1,0,43.75,903.6
4,0,1,0,2,2,70,1,1,0,0,...,0,1,0,2,0,0,0,1,74.1,5222.3


## Split data to train and test

In [71]:
labels = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(final_set, labels, test_size=0.2)
print ('Training Data Shape',X_train.shape, y_train.shape)
print ('Testing Data Shape',X_test.shape, y_test.shape)

Y = data['Churn']
X = final_set

Training Data Shape (5634, 37) (5634,)
Testing Data Shape (1409, 37) (1409,)


# Load mlflow to track the model

In [72]:
# from verta.utils import ModelAPI

# Connect to local MLflow tracking server
mlflow.set_tracking_uri(HOST)

# Set the experiment name...
mlflow.set_experiment(EXPERIMENT_NAME)

mlflow.sklearn.autolog(log_input_examples=True)

In [74]:

def get_git_revision_hash():
    return subprocess.check_output(['git', 'rev-parse', 'HEAD'])

def get_git_revision_short_hash():
    return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'])

def get_git_remote():
    return subprocess.check_output(['git', 'config', '--get', 'remote.origin.url'])

def get_git_user():
    return subprocess.check_output(['git', 'config', 'user.name'])

def get_git_branch():
    return subprocess.check_output(['git', 'branch', '--show-current'])

def get_pip_freeze():
    return subprocess.check_output(['pip', 'freeze']).splitlines()


def record_details(mlflow):
    """
    This method is the anchor poijt and more activiteis will go in it
    :param mlflow:
    :return:
    """
    with open("pip_freeze.txt", "wb") as file:
        for line in get_pip_freeze():
            file.write(line)
            file.write(bytes("\n", "UTF-8"))
    mlflow.log_artifact("pip_freeze.txt")

    file.close()
    os.remove("pip_freeze.txt")
    # save the encoders if there is any
    joblib.dump(enc, 'ordinalencoder.pkl')  
    mlflow.log_artifact("ordinalencoder.pkl", artifact_path="model")
    os.remove("ordinalencoder.pkl")
    joblib.dump(ohe, 'onehotencoder.pkl')  
    mlflow.log_artifact("onehotencoder.pkl", artifact_path="model")
    os.remove("onehotencoder.pkl")


def mlflow_grid_search(methodtoexecute, methodarguments):
    with mlflow.start_run(tags= {
        "mlflow.source.git.commit" : get_git_revision_hash() ,
        "mlflow.user": get_git_user(),
        "mlflow.source.git.repoURL": get_git_remote(),
        "git_remote": get_git_remote(),
        "mlflow.source.git.branch": get_git_branch(),
        "mlflow.docker.image.name": os.getenv("JUPYTER_IMAGE", "LOCAL"),
        "mlflow.source.type": "NOTEBOOK",
#         "mlflow.source.name": ipynbname.name()
    }) as run:
        methodtoexecute(**methodarguments)
        record_details(mlflow)

    return run

In [73]:
def fetch_logged_data(run_id):
    client = mlflow.tracking.MlflowClient()
    data = client.get_run(run_id).data
    tags = {k: v for k, v in data.tags.items() if not k.startswith("mlflow.")}
    artifacts = [f.path for f in client.list_artifacts(run_id, "model")]
    return data.params, data.metrics, tags, artifacts

# Modeling part

## Define the model

In [75]:
model = DecisionTreeClassifier()

## Define the grid search

In [77]:

grid = {
    'max_depth': [3,5,7,10],
    'criterion': ['gini'],
    'min_samples_leaf': [3, 5] ,
    'min_samples_split':[5,10]
}

grid_search = GridSearchCV(model, grid)

# Train the model

In [78]:
with mlflow.start_run(tags= {
        "mlflow.source.git.commit" : get_git_revision_hash() ,
        "mlflow.user": get_git_user(),
        "mlflow.source.git.repoURL": get_git_remote(),
        "git_remote": get_git_remote(),
        "mlflow.source.git.branch": get_git_branch(),
        "mlflow.docker.image.name": os.getenv("JUPYTER_IMAGE", "LOCAL"),
        "mlflow.source.type": "NOTEBOOK",
#         "mlflow.source.name": ipynbname.name()
    }) as run:
        grid_search.fit(X_train,y_train)
        record_details(mlflow)

2021/10/04 02:24:04 INFO mlflow.sklearn.utils: Logging the 5 best runs, 11 runs will be omitted.


In [18]:
# !pip install mlflow
# !pip install minio
# !pip install boto3
# !pip install scikit-learn==0.24.2
# !pip install openshift-client==1.0.13
# !pip show mlflow
# !pip show minio
# !pip show boto3
# !pip show scikit-learn
# !pip show openshift-client

import os
import mlflow
from minio import Minio
import openshift as oc
from jinja2 import Template

os.environ['MLFLOW_S3_ENDPOINT_URL']='http://minio-ml-workshop:9000'
os.environ['AWS_ACCESS_KEY_ID']='minio'
os.environ['AWS_SECRET_ACCESS_KEY']='minio123'
os.environ['AWS_REGION']='us-east-1'
os.environ['AWS_BUCKET_NAME']='mlflow'
# os.environ['MODEL_NAME'] = 'rossdemo'
# os.environ['MODEL_VERSION'] = '1'
# os.environ['OPENSHIFT_CLIENT_PYTHON_DEFAULT_OC_PATH'] = '/tmp/oc'

HOST = "http://mlflow:5500"

model_name = 'sd9'
model_version = '1'
build_name = f"seldon-model-{model_name}-v{model_version}"

def get_s3_server():
    minioClient = Minio('minio-ml-workshop:9000',
                    access_key='minio',
                    secret_key='minio123',
                    secure=False)

    return minioClient


def init():
    mlflow.set_tracking_uri(HOST)
    print(HOST)
    # Set the experiment name...
    #mlflow_client = mlflow.tracking.MlflowClient(HOST)

    
def download_artifacts():
    print("retrieving model metadata from mlflow...")
    model = mlflow.pyfunc.load_model(
        model_uri=f"models:/{model_name}/{model_version}"
    )
    print(model)
    
    run_id = model.metadata.run_id
    experiment_id = mlflow.get_run(run_id).info.experiment_id
    
    print("initializing connection to s3 server...")
    minioClient = get_s3_server()

#     artifact_location = mlflow.get_experiment_by_name('rossdemo').artifact_location
#     print("downloading artifacts from s3 bucket " + artifact_location)

    data_file_model = minioClient.fget_object("mlflow", f"/{experiment_id}/{run_id}/artifacts/model/model.pkl", "model.pkl")
    data_file_ordinalencoder = minioClient.fget_object("mlflow", f"/{experiment_id}/{run_id}/artifacts/model/ordinalencoder.pkl", "ordinalencoder.pkl")
    data_file_onehotencoder = minioClient.fget_object("mlflow", f"/{experiment_id}/{run_id}/artifacts/model/onehotencoder.pkl", "onehotencoder.pkl")

    data_file_requirements = minioClient.fget_object("mlflow", f"/{experiment_id}/{run_id}/artifacts/model/model.pkl", "requirements.txt")
    #Using boto3 Download the files from mlflow, the file path is in the model meta
    #write the files to the file system
    print("download successful")
    
    return run_id
    
        
init()
run_id = download_artifacts()

http://mlflow:5500
retrieving model metadata from mlflow...
mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.sklearn
  run_id: c030ef299d4244e0ad9e1c1697d6d62d

initializing connection to s3 server...
download successful


In [19]:
import joblib
import numpy as np
import json

class Predictor(object):

    def __init__(self):
        self.model = joblib.load('model.pkl')


    def predict(self, X,features_names):
        # data = request.get("data", {}).get("ndarray")
        # mult_types_array = np.array(data, dtype=object)
        print(X)
        result = self.model.predict(X)
        print(result)

        return json.dumps(result, cls=JsonSerializer)

class JsonSerializer(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (
        np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64)):
            return int(obj)
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

In [20]:
import pandas as pd

class Transformer(object):
    def __init__(self):
        self.ordinalencoder = joblib.load('ordinalencoder.pkl')
        self.onehotencoder = joblib.load('onehotencoder.pkl')
        
    def transform_input(self, request):
        X = request.get("data", {}).get("ndarray")
        feature_names = request.get("data", {}).get("names")
        
        df = pd.DataFrame(X, columns=feature_names)
        df = ordinalencoder.transform(df)
        df = onehotencoder.transform(df)

        #df = df.drop(['customerID'], axis=1)
        return df.to_numpy()


In [21]:
model = joblib.load('model.pkl')
ordinalencoder = joblib.load('ordinalencoder.pkl')
onehotencoder = joblib.load('onehotencoder.pkl')

In [31]:
sample_data = {"data":
  {


        "names":
            [
              "gender","SeniorCitizen","Partner","Dependents","tenure","PhoneService",
              "MultipleLines","InternetService","OnlineSecurity",
              "OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies",
              "Contract",
              "PaperlessBilling","PaymentMethod","MonthlyCharges","TotalCharges"
            ],
    # "ndarray": [["Male", 0, "No", "No", 1, "Yes", "No", "DSL", "No", "No", "No","No", "No", "No", "Month-to-month", "Yes", "Electronic check",45.65, 45.65]]
      # "ndarray": [["Male", 0, "Yes", "Yes", 4, "Yes", "Yes", "Fiber optic", "No", "No", "Yes","No", "Yes", "Yes", "Month-to-month", "No", "Electronic check",101.15, 385.9]]
      "ndarray": [["Male", 0, "No", "No", 32, "Yes", "Yes", "Fiber optic", "Yes", "No", "No","Yes", "No", "No", "Month-to-month", "No", "Bank transfer (automatic)",87.65, 2766.4]]

  }
}

In [32]:
ready_data = Transformer().transform_input(sample_data)

In [33]:
model.predict(ready_data)

array([0])

In [30]:
data.iloc[5]

customerID                               1088
gender                                   Male
SeniorCitizen                               0
Partner                                    No
Dependents                                 No
tenure                                     32
PhoneService                              Yes
MultipleLines                             Yes
InternetService                   Fiber optic
OnlineSecurity                            Yes
OnlineBackup                               No
DeviceProtection                           No
TechSupport                               Yes
StreamingTV                                No
StreamingMovies                            No
Contract                       Month-to-month
PaperlessBilling                           No
PaymentMethod       Bank transfer (automatic)
MonthlyCharges                          87.65
TotalCharges                           2766.4
Churn                                       0
Name: 5, dtype: object