In [23]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

In [11]:
import warnings

# Blanket suppression: every warning category from every module is silenced
# for the rest of the kernel session.
warnings.simplefilter('ignore')

### 1. Set Default Data Repo and load data

In [12]:
# Set the default data repo to "test" (presumably the repo that the
# load_dataset calls below resolve file names against — confirm).
# NOTE(review): `a360ai` is not imported in the visible cells; presumably the
# platform kernel injects it — confirm.
a360ai.set_default_datarepo("test")

In [13]:
# Dataset file names, kept in one place so they are easy to swap.
FEATURES_FILE = "X.parquet"
TARGET_FILE = "y.parquet"

# Feature matrix and target, loaded separately from the default data repo.
X = a360ai.load_dataset(FEATURES_FILE)
y = a360ai.load_dataset(TARGET_FILE)

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [15]:
# Feature scaling: fit the scaler on the training split only, then apply the
# same statistics to the test split so no test information leaks into training.
# StandardScaler is already imported in the notebook's import cell.
# Both results are NumPy arrays, so column names are no longer attached to
# X_train / X_test after this cell.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [17]:
# Fetch the model entry for this demo, creating it if it does not exist yet.
# (Plain string: the original f-prefix had no placeholders to interpolate.)
my_model = a360ai.get_or_create_model(model_name="churn_pred_demo_xgboost_e2e")

In [18]:
# Fetch (or create) the experiment under which the training runs below are logged.
# The name is a plain string: the original f-prefix had no placeholders.
experiment = my_model.get_or_create_experiment(
    experiment_name="churn_pred_demo_exp_xgboost_e2e",
    model_flavor="xgboost",
    enable_drift_monitoring=True,
    # Scaled training split and its target, passed together with drift monitoring enabled.
    train_features=X_train,
    train_target=y_train,
    # X_train is a bare ndarray after scaling, so the column names are taken
    # from the original feature frame instead.
    feature_names=list(X.columns),
    # Notebooks recorded alongside the experiment for lineage.
    data_exploration_file="/home/jovyan/01_exploratory-data-analysis.ipynb",
    data_preparation_file="/home/jovyan/02_data-preprocessing.ipynb",
    model_training_file="/home/jovyan/03_model-training.ipynb",
)


//-- Experiment Loaded --//
Model Name: churn_pred_demo_xgboost_e2e
Experiment Name: churn_pred_demo_exp_xgboost_e2e
Final Run Id: None
Model Flavor: xgboost
Input Signature: ndarray: float64 (26,)
Output Signature: Churn big_integer
Data Exploration File: /home/jovyan/01_exploratory-data-analysis.ipynb
Data Preparation File: /home/jovyan/02_data-preprocessing.ipynb
Model Training File: /home/jovyan/03_model-training.ipynb
Drift Monitoring Enabled: True



In [19]:
# List the experiments registered under this model and display the result.
experiments = my_model.list_experiments()
experiments

Unnamed: 0,id,bestRun,model_id,experiment_name,best_run_id,model_flavor,input_signature,output_signature,data_exploration_file,data_preparation_file,model_training_file,baseline,train_shape,model_name,updated_at
0,da050819-6515-457e-9c3f-6c2a7dfde964,,94b142f9-1aca-4ff9-878b-562d2159fe61,churn_pred_demo_exp_xgboost_e2e,,xgboost,"ndarray: float64 (26,)",Churn big_integer,/home/jovyan/01_exploratory-data-analysis.ipynb,/home/jovyan/02_data-preprocessing.ipynb,/home/jovyan/03_model-training.ipynb,"{'gender': {'mean': -7.200166389035681e-17, 's...",[26],churn_pred_demo_xgboost_e2e,2022-06-09 07:03


### 2. Model creation

In [20]:
# Sanity check: y_train is a DataFrame rather than a Series/1-D array,
# so `y_train.values` is a 2-D array.
type(y_train)

pandas.core.frame.DataFrame

In [25]:
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter search space: 2 x 2 = 4 candidate combinations.
params = {
    'min_child_weight': [1, 5],
    'max_depth': [1, 3],
}

# Run the search three times and log each result as its own experiment run.
# NOTE(review): the space holds only 4 candidates, fewer than
# RandomizedSearchCV's default n_iter=10, so every search evaluates all four
# and the logged runs come out with identical metrics. Widen the space or vary
# a seed per iteration if distinct runs are intended.
for _ in range(3):

    model = XGBClassifier()

    random_search = RandomizedSearchCV(
        model,
        param_distributions=params,
        scoring='accuracy',
        verbose=3,
        cv=3,
    )
    # y_train is a DataFrame; flatten its 2-D values to the 1-D label vector
    # that fit() expects.
    random_search.fit(X_train, y_train.values.ravel())

    best_model = random_search.best_estimator_

    # predict() already returns class labels, so the per-element 0.5 threshold
    # (and the reshape feeding it) was redundant.
    metrics = {
        'test_score': accuracy_score(y_test, best_model.predict(X_test)),
        'train_score': accuracy_score(y_train, best_model.predict(X_train)),
    }

    # Each pass through the context manager records one run: metrics + model.
    with experiment.run_experiment() as run:
        run.log_metrics(metrics)
        run.log_model(best_model)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END ...max_depth=1, min_child_weight=1;, score=0.809 total time=   0.9s
[CV 2/3] END ...max_depth=1, min_child_weight=1;, score=0.810 total time=   0.3s
[CV 3/3] END ...max_depth=1, min_child_weight=1;, score=0.805 total time=   0.4s
[CV 1/3] END ...max_depth=1, min_child_weight=5;, score=0.808 total time=   0.3s
[CV 2/3] END ...max_depth=1, min_child_weight=5;, score=0.809 total time=   0.4s
[CV 3/3] END ...max_depth=1, min_child_weight=5;, score=0.805 total time=   0.3s
[CV 1/3] END ...max_depth=3, min_child_weight=1;, score=0.795 total time=   0.7s
[CV 2/3] END ...max_depth=3, min_child_weight=1;, score=0.799 total time=   0.7s
[CV 3/3] END ...max_depth=3, min_child_weight=1;, score=0.798 total time=   0.6s
[CV 1/3] END ...max_depth=3, min_child_weight=5;, score=0.796 total time=   0.8s
[CV 2/3] END ...max_depth=3, min_child_weight=5;, score=0.798 total time=   0.7s
[CV 3/3] END ...max_depth=3, min_child_weight=5;,

### 3. Model export and final-run selection

### Save model binary

In [11]:
# NOTE(review): disabled PyTorch tensor conversion; nothing in the visible
# cells uses torch. Presumably left over from another model flavor — confirm
# and delete this cell.
# import torch
# X_train = torch.Tensor((X_train))

In [21]:
# Persist the tuned estimator from the most recent search via XGBoost's own
# save_model; the .json suffix selects the JSON model format.
best_model = random_search.best_estimator_
best_model.save_model('model_xg_hyper_test.json')

In [68]:
# NOTE(review): this converts a TensorFlow SavedModel at ./output/saved_model to
# ONNX, but none of the visible cells writes that directory (the model trained
# here is XGBoost and is saved as JSON). Looks like a leftover from a
# TensorFlow workflow — confirm before relying on it.
!python3 -m tf2onnx.convert \
        --saved-model ./output/saved_model \
        --output ./output/model.onnx \
        --opset 7

["2022-06-01 13:29:43.646123: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory",
 '2022-06-01 13:29:43.646149: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.',
 '2022-06-01 13:29:45.267067: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected',
 '2022-06-01 13:29:45.267091: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (GS-5460): /proc/driver/nvidia/version does not exist',
 '2022-06-01 13:29:45.267322: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operat

In [26]:
# Fetch the runs logged under this experiment and display them; the table
# carries one row per run with its metric_* columns.
runs = experiment.list_runs()
runs

Unnamed: 0,id,dataset,artifact_paths,metric_test_score,metric_train_score,metadata_run_time
0,c6366f0b-7d73-464a-bb2c-eee83ec3c97c,d5775d14-11be-40aa-bd56-856ebbb0068f,[{'id': '1d30e5ff-e9f9-4660-bfab-cc4acbf5526e'...,0.79602,0.811378,1.4e-05
1,12bc80f9-39fe-4892-bae0-769b5e84ed45,d5775d14-11be-40aa-bd56-856ebbb0068f,[{'id': '4ccb4f14-d0f2-4b14-ad22-c3ed11fb7c22'...,0.79602,0.811378,1.3e-05
2,515da186-9db6-4b84-b534-77b2d6e78318,d5775d14-11be-40aa-bd56-856ebbb0068f,[{'id': '366e52d1-3197-4990-85c1-e5af9bdd3214'...,0.79602,0.811378,1.3e-05


In [27]:
# Show the runs ranked by test accuracy, best first (display only; `runs` is not modified).
runs.sort_values(by='metric_test_score', ascending=False)

Unnamed: 0,id,dataset,artifact_paths,metric_test_score,metric_train_score,metadata_run_time
0,c6366f0b-7d73-464a-bb2c-eee83ec3c97c,d5775d14-11be-40aa-bd56-856ebbb0068f,[{'id': '1d30e5ff-e9f9-4660-bfab-cc4acbf5526e'...,0.79602,0.811378,1.4e-05
1,12bc80f9-39fe-4892-bae0-769b5e84ed45,d5775d14-11be-40aa-bd56-856ebbb0068f,[{'id': '4ccb4f14-d0f2-4b14-ad22-c3ed11fb7c22'...,0.79602,0.811378,1.3e-05
2,515da186-9db6-4b84-b534-77b2d6e78318,d5775d14-11be-40aa-bd56-856ebbb0068f,[{'id': '366e52d1-3197-4990-85c1-e5af9bdd3214'...,0.79602,0.811378,1.3e-05


In [28]:
# Pick the id of the run with the highest test accuracy.
# A stable sort keeps the original row order among tied scores (the runs above
# are all tied), so the selected run does not depend on the sort algorithm.
best_run_id = (
    runs.sort_values("metric_test_score", ascending=False, kind="mergesort")["id"].iloc[0]
)
best_run_id

'c6366f0b-7d73-464a-bb2c-eee83ec3c97c'

In [29]:
# Register the selected run as this experiment's final run
# (the experiment summary printed earlier showed "Final Run Id: None").
my_model.set_final_run(experiment, best_run_id)