In [1]:
# Make sure the working directory is the project's source folder before running
# the rest of the notebook. NOTE: the absolute path below is specific to the
# original author's machine — replace it with the location of your own checkout.
%cd /Users/arunprakash/Downloads/Python/practise/MLops/Churn-Prediction-Challenge

/Users/arunprakash/Downloads/Python/practise/MLops/Churn-Prediction-Challenge


In [3]:
!pwd

/Users/arunprakash/Downloads/Python/practise/MLops/Churn-Prediction-Challenge


## Loading Data 

### 1.1 Importing packages

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Imported Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scripts import utils
from pycaret.classification import *
# Other Libraries
import mlflow

In [None]:
# Setting up all directories.
# Every path is built from root_folder and keeps a trailing slash, because
# later cells append file names to these strings by plain concatenation.
root_folder = "/home/"
data_directory = root_folder + "data/raw/"
# root_folder already ends with "/", so the sub-path must not start with one
# (the previous value expanded to "/home//data/profile_report/").
data_profile_path = root_folder + "data/profile_report/"
intermediate_data_path = root_folder + "data/interim/"
database_path = root_folder + "database/"
print("directory loaded")

 ### 1.2 Reading the merged data

In [None]:
%%time
interim_data = "final_train_data_interim_1660280485.csv" # set the data recieved from the previous notebook
dataset = utils.load_data( [f"{intermediate_data_path}{interim_data}",
                            ]
                         )[0] #since we are only loading single data, we can access it with index 0, since it return multiple dfs in list
dataset.shape

In [None]:
dataset.head()

 ### 1.3 Splitting the data to seen and unseen

In [None]:
# this function is also available in utils.py 
# def get_validation_unseen_set(dataframe, validation_frac=0.05, sample=False, sample_frac=0.1):
#     if not sample:
#         dataset = dataframe.copy()
#     else:
#         dataset = dataframe.sample(frac=sample_frac)
#     data = dataset.sample(frac=(1-validation_frac), random_state=786)
#     data_unseen = dataset.drop(data.index)
#     data.reset_index(inplace=True, drop=True)
#     data_unseen.reset_index(inplace=True, drop=True)
#     return data, data_unseen

In [None]:
# Hold out 5% of the rows as an unseen set. sample=False is passed, so
# sample_frac is presumably unused (see the reference implementation in the
# commented cell above — confirm against utils.py).
data_for_model, data_unseen = utils.get_validation_unseen_set(
    dataset,
    validation_frac=0.05,
    sample=False,
    sample_frac=0.1,
)
print(f'Data for Modeling: {data_for_model.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

In [None]:
data_for_model.head()

### 2 Setting up the sqlite database

In [None]:
# this function is also available in utils.py 

#create a sqlite db for storing all the model artifacts etc
# import sqlite3
# from sqlite3 import Error

# def create_sqlit_connection(db_path,db_file):
#     """ create a database connection to a SQLite database """
#     conn = None
#     # opening the conncetion for creating the sqlite db
#     try:
#         conn = sqlite3.connect(db_path+db_file)
#         print(sqlite3.version)
#     # return an error if connection not established
#     except Error as e:
#         print(e)
#     # closing the connection once the database is created
#     finally:
#         if conn:
#             conn.close()

In [None]:
# Create the SQLite file mlflow_v01.db inside database_path; the
# `mlflow server --backend-store-uri` command shown below points at this file.
utils.create_sqlit_connection(database_path,r"mlflow_v01.db")

In [None]:
# Point the MLflow client at the tracking server started with the
# `mlflow server ... --port=6006 --host=0.0.0.0` command shown below.
# 0.0.0.0 is a "listen on all interfaces" address for the server side; as a
# client destination it is not portable, so the loopback address is used here.
mlflow.set_tracking_uri("http://127.0.0.1:6006")

In [None]:
# Do not go ahead unless you have executed this step and mlflow is installed.
 
# Create an `mlruns` folder in the project root folder.
# Run the command below in a terminal opened at the project root folder.
# Make sure the database URI points to the correct location. Assuming you have the same folder structure, you can use it as is:
#mlflow server --backend-store-uri='sqlite:///database/mlflow_v01.db' --default-artifact-root="mlruns/" --port=6006 --host=0.0.0.0

In [None]:
#pip install pycaret==2.3.8

### 3 Setting up Environment: 

The `setup()` function initializes the environment in pycaret and creates the transformation pipeline to prepare the data for modeling and deployment. `setup()` must be called before executing any other function in pycaret. 
* It takes two mandatory parameters: a pandas dataframe and the name of the target column. 
* All other parameters are optional and are used to customize the pre-processing pipeline (we will see them in later tutorials).

When `setup()` is executed, PyCaret's inference algorithm will automatically infer the data types for all features based on certain properties. The data types should be inferred correctly, but this is not always the case. To account for this, PyCaret displays a table containing the features and their inferred data types after `setup()` is executed. If all of the data types are correctly identified, press Enter to continue; otherwise type `quit` to end the experiment. (In this notebook `silent=True` is passed to `setup()`, so this confirmation prompt is skipped.) Ensuring that the data types are correct is of fundamental importance in PyCaret, as it automatically performs a few pre-processing tasks which are imperative to any machine learning experiment. These tasks are performed differently for each data type, which means it is very important for them to be correctly configured.

In [None]:
# No Pre-Processing 
Baseline_model_exp01 = setup(data = data_for_model, target = 'is_churn', 
                   session_id = 42,fix_imbalance=False,ignore_features=['msno'],
                   date_features=['registration_init_time','transaction_date','membership_expire_date'],
                   n_jobs=-1,use_gpu=True,
                   log_experiment=True,experiment_name='Baseline_model_exp01',
                   log_plots=True, log_data=True,
                   silent=True, verbose=True,
                   log_profile=False)

In [None]:
# this function is also available in utils.py 
# def get_train_test_set_from_setup():
#     return get_config(variable="X_train"),\
#             get_config(variable="y_train"),\
#             get_config(variable="X_test"),\
#             get_config(variable="y_test")

# def get_x_y_from_setup():
#     return get_config(variable="X"),\
#             get_config(variable="y")

# def get_transformation_pipeline_from_setup():
#     return get_config(variable="prep_pipe")

In [None]:
# Fetch the train/test split produced by setup(). Per the reference helpers in
# the commented cell above, these are read back with get_config (confirm
# against utils.py).
X_train, y_train, X_test, y_test = utils.get_train_test_set_from_setup()
#you can also get X,y
# X,y = utils.get_x_y_from_setup()
X_train.head()

In [None]:
pipeline = utils.get_transformation_pipeline_from_setup()
pipeline

In [None]:
# p = get_config(variable="prep_pipe")
# p.fit_transform(get_config(variable="data_before_preprocess"))

In [None]:
# models(internal=True)[['Name', 'GPU Enabled']]

### 4 Compare models: 

In [None]:
# Cross-validate the available classifiers with 5 folds; the result of the
# comparison is kept in best_model.
best_model = compare_models(fold = 5) #exclude=['xgboost']

* Two simple words of code (not even a line) have trained over 15 models using stratified cross validation (5 folds here, because `fold = 5` is passed; the default is 10) and evaluated the 6 most commonly used classification metrics (Accuracy, AUC, Recall, Precision, F1, Kappa). 

* The score grid printed above highlights the highest performing metric for comparison purposes only. The grid by default is sorted using 'Accuracy' (highest to lowest) which can be changed by passing the sort parameter. For example compare_models(sort = 'Recall') will sort the grid by Recall instead of Accuracy. 

* If you want to change the fold parameter from the default value of 10 to a different value then you can use the fold parameter. For example compare_models(fold = 5) will compare all models on 5 fold cross validation. Reducing the number of folds will improve the training time.

In [None]:
# Train LightGBM with 5-fold cross validation. NOTE(review): the estimator is
# hard-coded as 'lightgbm' rather than taken from best_model — presumably it
# topped the compare_models grid above; confirm before relying on it.
lgbm  = create_model('lightgbm', fold = 5) 

In [None]:
lgbm

### 5 Analyzing the model performance

5.1 Learning Curve

In [None]:
%matplotlib inline

In [None]:
plot_model(lgbm, plot = 'learning')

5.2 ROC Curve

In [None]:
plot_model(lgbm, plot = 'auc')

5.3 Precision-recall Curve

In [None]:
plot_model(lgbm, plot = 'pr')

5.4 Confusion Matrix

In [None]:
plot_model(lgbm, plot = 'confusion_matrix', plot_kwargs = {'percent' : True})

5.5 Feature Importance

In [None]:
#top 10 features
plot_model(lgbm, plot='feature') #feature_all -> to check for all features 

5.6 Prediction class distribution

In [None]:
plot_model(lgbm, plot='error')

5.7 Model Interpretability

In [None]:
#pip install shap

In [None]:
# interpret model
interpret_model(lgbm)

In [None]:
interpret_model(lgbm,plot='correlation',feature='is_cancel')

In [None]:
interpret_model(lgbm,plot='reason',observation=0) # index of observation in test data

In [None]:
#pip install interpret

In [None]:
interpret_model(lgbm,plot='msa')

5.8 Model Evaluation

In [None]:
# Score the held-out data_unseen split with the trained model. The trailing
# ";" keeps the call's return value from being echoed as the cell output.
predict_model(lgbm, data_unseen);