## AutoML for Modelling

- Katonic Python SDK for Complete ML Model Life Cycle.
- The Auto ML component in the Katonic SDK can be used to train Machine Learning models with just one or two lines of code.
- All classification metrics are catalogued automatically by the SDK.

## Imports

In [1]:
import os
import subprocess
import sys

# Install the pinned Katonic SDK into the environment of the running kernel.
# `sys.executable -m pip` targets this interpreter rather than whichever `pip`
# happens to be first on PATH, the argument list avoids the shell (so the
# brackets in `katonic[ml]` cannot be glob-expanded), and check_call raises
# CalledProcessError on failure instead of silently returning a non-zero status.
subprocess.check_call([sys.executable, "-m", "pip", "install", "katonic[ml]==1.6.2"])







0

In [2]:
import warnings
# Silence every warning for the rest of the session. This keeps cell output short,
# but it also hides deprecation warnings and pandas copy warnings.
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Katonic classification wrapper used below to launch the individual model runs.
from katonic.ml.classification import Classifier

# Used near the end of the notebook to load the registered model from its artifact location.
from katonic.log.client import load_model

# Show up to 100 columns when a DataFrame is displayed so wide tables are not elided.
pd.set_option('display.max_columns', 100)

In [4]:
# define experiment name
# This value is passed to the Classifier constructor further down and is the
# experiment name shown in the run listings.
exp_name = "teleco_customer_churn3"

### Loading the pre-processed file

In [5]:
# Read the already-preprocessed churn table; the path is relative to the working
# directory. At this point the frame still contains 'customerID' and the 'Churn' label.
X_df = pd.read_csv("preprocessed_customer_churn.csv")

In [6]:
# Separate the label from the features.
# The guard makes the cell safe to re-run: once 'Churn' has been removed from X_df,
# a second execution keeps the existing Y_df instead of raising KeyError.
if 'Churn' in X_df.columns:
    # .copy() gives Y_df its own data, independent of X_df.
    Y_df = X_df[['customerID', 'Churn']].copy()
    # drop() returns a new frame, so the frame loaded from disk is not mutated in place.
    X_df = X_df.drop(columns='Churn')
Y_df.head()

Unnamed: 0,customerID,Churn
0,5375,0
1,3962,0
2,2564,1
3,5535,0
4,6511,1


In [7]:
# Plain aliases (no copy is made): X is the feature frame, y the customerID/Churn frame.
X = X_df
y = Y_df

In [8]:
# List the columns of the feature frame (the 'Churn' label is no longer among them).
X.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges'],
      dtype='object')

## Data Splitting

In [9]:
# Model inputs are every column except the customer identifier; the target is the Churn label.
X_train, y_train = X.drop(columns=['customerID']), y['Churn']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.20,
)

In [10]:
# Persist the held-out split as CSV files in the working directory. X_test still has
# all 19 feature columns here (feature selection happens later), and the row index
# is written as the first column of each file.
X_test.to_csv("x_test.csv")
y_test.to_csv("y_test.csv")

In [11]:
# Confirm that 'customerID' is no longer part of the training features.
X_train.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

## Feature Selection

In [12]:
# The ten features kept for modelling. The order given here becomes the column
# order of the train and test frames once they are restricted to this list.
top = [
    "tenure",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "TechSupport",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "MonthlyCharges",
    "TotalCharges",
]

In [13]:
# Preview the training rows restricted to the selected features (display only; nothing is assigned).
X_train[top]

Unnamed: 0,tenure,InternetService,OnlineSecurity,OnlineBackup,TechSupport,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
2142,21,0,2,0,0,1,0,3,64.85,610
1623,54,1,0,2,0,2,1,0,97.20,4319
6074,1,0,0,0,0,0,1,2,23.45,1940
1362,4,1,0,0,0,0,1,2,70.20,2012
6754,0,0,2,2,2,2,1,0,61.90,0
...,...,...,...,...,...,...,...,...,...,...
3772,1,1,2,0,0,0,1,2,95.00,6440
5191,23,0,2,2,2,2,1,1,91.10,1819
5226,12,2,1,1,1,0,1,2,21.15,2659
5390,12,1,0,0,0,0,1,2,99.45,370


In [14]:
# Restrict both splits to the selected feature columns, in the order given by `top`.
X_train = X_train[top]
X_test = X_test[top]


## Modelling

In [15]:
# Names of the selected feature columns; they are handed to the Classifier and show up
# in the `tags.features` column of the runs table.
features = list(X_train.columns)
# Create the Katonic Classifier for this experiment from the train/test splits.
# NOTE(review): source_name looks like this notebook's file name, presumably recorded
# for provenance — confirm against the SDK documentation.
clf = Classifier(X_train, X_test, y_train, y_test, exp_name, source_name='2. modelling.ipynb', features=features)

2023/11/03 21:32:18 INFO mlflow.tracking.fluent: Experiment with name 'teleco_customer_churn3' does not exist. Creating a new experiment.


In [16]:
# Keep the experiment id; it is needed later to query the runs of this experiment.
exp_id = clf.id

# Label/value pairs describing the experiment the Classifier is attached to.
experiment_summary = (
    ("experiment name : ", clf.name),
    ("experiment location : ", clf.location),
    ("experiment id : ", clf.id),
    ("experiment status : ", clf.stage),
)
for label, value in experiment_summary:
    print(label, value)

experiment name :  teleco_customer_churn3
experiment location :  s3://models/16
experiment id :  16
experiment status :  active


## Logistic Regression



In [17]:
# Launch the logistic-regression run; no arguments are passed, so the SDK's own defaults apply.
clf.LogisticRegression()



## GradientBoostingClassifier

In [18]:
# Launch the gradient-boosting run with the SDK's defaults.
# NOTE(review): no random_state is passed here, unlike the RandomForest, AdaBoost and
# CatBoost cells — confirm whether the SDK accepts one if repeatable results matter.
clf.GradientBoostingClassifier()

## RandomForestClassifier

In [19]:
# Launch the random-forest run; random_state is pinned to 42 (the same seed used for the split).
clf.RandomForestClassifier(random_state = 42)

## AdaBoostClassifier

In [20]:
# Launch the AdaBoost run; random_state is pinned to 42 (the same seed used for the split).
clf.AdaBoostClassifier(random_state = 42)

## LightGBMClassifier

In [21]:
# Launch the LightGBM run with the SDK's defaults.
# NOTE(review): no random_state is passed — confirm whether one is needed for repeatability.
clf.LGBMClassifier()

### XGBClassifier

In [22]:
# Launch the XGBoost run with the SDK's defaults.
# NOTE(review): no random_state is passed — confirm whether one is needed for repeatability.
clf.XGBClassifier()



### DecisionTreeClassifier

In [23]:
# Launch the decision-tree run with the SDK's defaults.
# NOTE(review): no random_state is passed — confirm whether one is needed for repeatability.
clf.DecisionTreeClassifier()

### SupportVectorClassifier

In [24]:
# Launch the support-vector run with the SDK's defaults.
# In the recorded runs table this run (svm_classifier) has precision, recall and F1 of 0.0
# and a ROC AUC of 0.5.
clf.SupportVectorClassifier()

### RidgeClassifier

In [25]:
# Launch the ridge-classifier run with the SDK's defaults.
clf.RidgeClassifier()

### KNeighborsClassifier

In [26]:
# Launch the k-nearest-neighbours run with the SDK's defaults.
clf.KNeighborsClassifier()

### GaussianNB

In [27]:
# Launch the Gaussian naive-Bayes run with the SDK's defaults.
clf.GaussianNB()

## CatBoostClassifier

In [28]:
# Launch the CatBoost run with random_state pinned to 42.
# The call prints one progress line per boosting iteration (see the cell output).
# NOTE(review): check whether the SDK forwards a verbosity option that would silence it.
clf.CatBoostClassifier(random_state=42)

Learning rate set to 0.021554
0:	learn: 0.6786690	total: 56.7ms	remaining: 56.7s
1:	learn: 0.6652349	total: 59ms	remaining: 29.5s
2:	learn: 0.6523005	total: 61.2ms	remaining: 20.3s
3:	learn: 0.6400920	total: 63.3ms	remaining: 15.8s
4:	learn: 0.6282613	total: 65.3ms	remaining: 13s
5:	learn: 0.6168214	total: 67.4ms	remaining: 11.2s
6:	learn: 0.6063354	total: 69.5ms	remaining: 9.86s
7:	learn: 0.5967113	total: 71.5ms	remaining: 8.87s
8:	learn: 0.5876191	total: 73.6ms	remaining: 8.11s
9:	learn: 0.5787825	total: 75.7ms	remaining: 7.49s
10:	learn: 0.5708681	total: 77.8ms	remaining: 6.99s
11:	learn: 0.5629734	total: 79.9ms	remaining: 6.58s
12:	learn: 0.5555358	total: 81.9ms	remaining: 6.22s
13:	learn: 0.5484440	total: 83.9ms	remaining: 5.91s
14:	learn: 0.5417762	total: 86ms	remaining: 5.65s
15:	learn: 0.5354177	total: 123ms	remaining: 7.54s
16:	learn: 0.5299474	total: 125ms	remaining: 7.23s
17:	learn: 0.5244813	total: 127ms	remaining: 6.94s
18:	learn: 0.5189369	total: 129ms	remaining: 6.67s
19

## Get runs

In [29]:
# Select the run of the experiment
df_runs = clf.search_runs(exp_id)
print("Number of runs done : ", len(df_runs))
df_runs.head()

Number of runs done :  12


Unnamed: 0,artifact_uri,end_time,experiment_id,metrics.accuracy_score,metrics.f1_score,metrics.log_loss,metrics.precision_score,metrics.recall,metrics.roc_auc_score,run_id,run_name,start_time,status,tags.data_path,tags.experiment_id,tags.experiment_name,tags.features,tags.mlflow.log-model.history,tags.run_id,tags.version.mlflow
0,s3://models/16/4ddca2e3bf5c4ee8a757c599ae73c92...,2023-11-03 21:32:57.353000+00:00,16,0.806246,0.586989,6.692094,0.673611,0.520107,0.714687,4ddca2e3bf5c4ee8a757c599ae73c920,teleco_customer_churn3_16_cat_boost_classifier,2023-11-03 21:32:49.359000+00:00,FINISHED,-,16,teleco_customer_churn3,"['tenure', 'InternetService', 'OnlineSecurity'...","[{""run_id"": ""4ddca2e3bf5c4ee8a757c599ae73c920""...",4ddca2e3bf5c4ee8a757c599ae73c920,2.0.1
1,s3://models/16/98509b57f43a4efabe844a926311b79...,2023-11-03 21:32:49.332000+00:00,16,0.757984,0.626506,8.359067,0.52963,0.766756,0.760791,98509b57f43a4efabe844a926311b794,teleco_customer_churn3_16_gaussian_NB_classifier,2023-11-03 21:32:47.406000+00:00,FINISHED,-,16,teleco_customer_churn3,"['tenure', 'InternetService', 'OnlineSecurity'...","[{""run_id"": ""98509b57f43a4efabe844a926311b794""...",98509b57f43a4efabe844a926311b794,2.0.1
2,s3://models/16/01927a4ac1b742258b38d2834c60243...,2023-11-03 21:32:47.378000+00:00,16,0.756565,0.474732,8.40802,0.553571,0.41555,0.647447,01927a4ac1b742258b38d2834c60243d,teleco_customer_churn3_16_k_neighbors_classifier,2023-11-03 21:32:45.440000+00:00,FINISHED,-,16,teleco_customer_churn3,"['tenure', 'InternetService', 'OnlineSecurity'...","[{""run_id"": ""01927a4ac1b742258b38d2834c60243d""...",01927a4ac1b742258b38d2834c60243d,2.0.1
3,s3://models/16/d753ba59cbf54d2e83802f051b27ba8...,2023-11-03 21:32:45.414000+00:00,16,0.810504,0.583463,6.545009,0.697761,0.50134,0.711578,d753ba59cbf54d2e83802f051b27ba8a,teleco_customer_churn3_16_ridge_classifier,2023-11-03 21:32:43.345000+00:00,FINISHED,-,16,teleco_customer_churn3,"['tenure', 'InternetService', 'OnlineSecurity'...","[{""run_id"": ""d753ba59cbf54d2e83802f051b27ba8a""...",d753ba59cbf54d2e83802f051b27ba8a,2.0.1
4,s3://models/16/fc33176299f0497287b8a2bb3cf6d52...,2023-11-03 21:32:43.315000+00:00,16,0.735273,0.0,9.143338,0.0,0.0,0.5,fc33176299f0497287b8a2bb3cf6d52c,teleco_customer_churn3_16_svm_classifier,2023-11-03 21:32:40.509000+00:00,FINISHED,-,16,teleco_customer_churn3,"['tenure', 'InternetService', 'OnlineSecurity'...","[{""run_id"": ""fc33176299f0497287b8a2bb3cf6d52c""...",fc33176299f0497287b8a2bb3cf6d52c,2.0.1


In [30]:
# (number of runs, number of metadata/metric/tag columns)
df_runs.shape

(12, 20)

## Evaluating Models

In [31]:
# Rank the runs from highest to lowest ROC AUC and display the five best.
top_runs = df_runs.sort_values(by='metrics.roc_auc_score', ascending=False)
top_runs.head(5)

Unnamed: 0,artifact_uri,end_time,experiment_id,metrics.accuracy_score,metrics.f1_score,metrics.log_loss,metrics.precision_score,metrics.recall,metrics.roc_auc_score,run_id,run_name,start_time,status,tags.data_path,tags.experiment_id,tags.experiment_name,tags.features,tags.mlflow.log-model.history,tags.run_id,tags.version.mlflow
1,s3://models/16/98509b57f43a4efabe844a926311b79...,2023-11-03 21:32:49.332000+00:00,16,0.757984,0.626506,8.359067,0.52963,0.766756,0.760791,98509b57f43a4efabe844a926311b794,teleco_customer_churn3_16_gaussian_NB_classifier,2023-11-03 21:32:47.406000+00:00,FINISHED,-,16,teleco_customer_churn3,"['tenure', 'InternetService', 'OnlineSecurity'...","[{""run_id"": ""98509b57f43a4efabe844a926311b794""...",98509b57f43a4efabe844a926311b794,2.0.1
11,s3://models/16/15dee6d1aea545aca0bfc2d944369b0...,2023-11-03 21:32:21.914000+00:00,16,0.806955,0.62117,6.667597,0.646377,0.597855,0.740047,15dee6d1aea545aca0bfc2d944369b0b,teleco_customer_churn3_16_logistic_regression,2023-11-03 21:32:18.548000+00:00,FINISHED,-,16,teleco_customer_churn3,"['tenure', 'InternetService', 'OnlineSecurity'...","[{""run_id"": ""15dee6d1aea545aca0bfc2d944369b0b""...",15dee6d1aea545aca0bfc2d944369b0b,2.0.1
8,s3://models/16/24c44ee0e1774639a21ee97db2e7300...,2023-11-03 21:32:29.571000+00:00,16,0.808375,0.610951,6.618564,0.660436,0.568365,0.731576,24c44ee0e1774639a21ee97db2e73004,teleco_customer_churn3_16_ada_boost_classifier,2023-11-03 21:32:27.320000+00:00,FINISHED,-,16,teleco_customer_churn3,"['tenure', 'InternetService', 'OnlineSecurity'...","[{""run_id"": ""24c44ee0e1774639a21ee97db2e73004""...",24c44ee0e1774639a21ee97db2e73004,2.0.1
7,s3://models/16/97aeaf6a07d64cfb8e361ae34b9ed61...,2023-11-03 21:32:34.282000+00:00,16,0.811923,0.602699,6.49599,0.683673,0.538874,0.724553,97aeaf6a07d64cfb8e361ae34b9ed61e,teleco_customer_churn3_16_lgbm_classifier,2023-11-03 21:32:29.600000+00:00,FINISHED,-,16,teleco_customer_churn3,"['tenure', 'InternetService', 'OnlineSecurity'...","[{""run_id"": ""97aeaf6a07d64cfb8e361ae34b9ed61e""...",97aeaf6a07d64cfb8e361ae34b9ed61e,2.0.1
10,s3://models/16/d5fb9dba482843469f45782006fbe98...,2023-11-03 21:32:24.521000+00:00,16,0.811923,0.600302,6.495989,0.686207,0.533512,0.722837,d5fb9dba482843469f45782006fbe984,teleco_customer_churn3_16_gradient_boosting_cl...,2023-11-03 21:32:21.942000+00:00,FINISHED,-,16,teleco_customer_churn3,"['tenure', 'InternetService', 'OnlineSecurity'...","[{""run_id"": ""d5fb9dba482843469f45782006fbe984""...",d5fb9dba482843469f45782006fbe984,2.0.1


## Selecting Best Model

In [32]:
# The first row of the ranked table is the best run; pull out the fields needed later.
best_run = top_runs.iloc[0]
artifacts, run_id, model_name = (
    best_run["artifact_uri"],
    best_run["run_id"],
    best_run["run_name"],
)

# Separator line printed between the individual fields.
divider = "=" * 100

print('Best model_artifacts :', artifacts)
print(divider)
print('Best model run_id :', run_id)
print(divider)
print('Best model :', model_name)
print(divider)
print("Best model experiment id :", exp_id)

Best model_artifacts : s3://models/16/98509b57f43a4efabe844a926311b794/artifacts
Best model run_id : 98509b57f43a4efabe844a926311b794
Best model : teleco_customer_churn3_16_gaussian_NB_classifier
Best model experiment id : 16


## Registering the Best Model

In [33]:
# Register the best run's model under the run's name. The return value is kept so the
# registration details (name, version, status, ...) can be inspected afterwards.
result = clf.register_model(
    run_id=run_id,
    model_name=model_name
)

2023/11/03 21:32:57 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: teleco_customer_churn3_16_gaussian_NB_classifier, version 1


In [34]:
print('Registered model information :')
print('=='*50)
# Bare expression: the notebook displays the object returned by register_model.
result

Registered model information :


name: "teleco_customer_churn3_16_gaussian_NB_classifier"
version: "1"
creation_timestamp: 1699047177529
last_updated_timestamp: 1699047177529
user_id: ""
current_stage: "None"
description: ""
source: "s3://models/16/98509b57f43a4efabe844a926311b794/artifacts/teleco_customer_churn3_16_gaussian_NB_classifier"
run_id: "98509b57f43a4efabe844a926311b794"
status: READY
run_link: ""

In [35]:
# Move the version that was just registered to the Production stage.
# The version comes from the registration result rather than a hard-coded 1:
# registering the same model name again yields a higher version number, and
# ver_list=[1] would then promote a stale version. `result.version` is shown
# as the string "1" in the registration output, so it is converted to int to
# match the integer the original call passed.
clf.change_stage(
    model_name=model_name,
    ver_list=[int(result.version)],
    stage='Production'
)

## Fetching the Model

In [36]:
# Location of the logged model: the best run's artifact URI followed by the model name
# (this matches the `source` field shown in the registration output).
location = f"{artifacts}/{model_name}"

In [37]:
# Load the model object from that artifact location using the Katonic log client.
model = load_model(location)

## Predict

In [38]:
# Predict on the held-out rows; X_test holds only the selected feature columns at this point.
y_pred = model.predict(X_test)

In [39]:
# Prepare variable as DataFrame in pandas
df = pd.DataFrame(X_test)

# Add the target variable to df
df["y_pred"] = y_pred

In [40]:
# Display the test features alongside the predicted label.
df

Unnamed: 0,tenure,InternetService,OnlineSecurity,OnlineBackup,TechSupport,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,y_pred
185,1,0,0,0,0,0,1,2,24.80,2044,1
2715,41,2,1,1,1,0,1,0,25.25,6522,0
3825,52,2,1,1,1,2,0,3,19.35,67,0
1807,1,1,0,0,0,0,0,2,76.35,5822,1
132,67,0,0,0,2,2,0,0,50.55,2837,0
...,...,...,...,...,...,...,...,...,...,...,...
6366,64,0,0,2,2,2,1,3,68.30,3716,0
315,51,1,2,2,2,1,0,1,110.05,4697,0
2439,17,2,1,1,1,1,0,0,19.90,2856,0
5002,69,0,2,0,0,2,1,1,43.95,2556,0
