# Predict Customer Churn
The notebook ingests, prepares and then trains a model based on dataset that tracks customer activity in an ecommerce store. The goal is to for a given customer, predict whether there will be a churn or not. The model then will be converted to ONNX format and tracked by MLFlow.
We will later use the ONNX model for inferencing in Azure Synapse SQL Pool using the new model scoring wizard.
## Note:
**Please note that this notebook was tested with Scikit-learn version 1.0.2 and Scikit-learn version 0.22.2.post1.**  
If you need to downgrade your scikit-learn, uncomment the line in the below cell and run it. Once it completes running, restart the kernel.


## Importing libraries and setting up workspace

In [1]:
import azureml.core
from azureml.core import Experiment, Workspace, Dataset, Datastore
from azureml.train.automl import AutoMLConfig
from azureml.data.dataset_factory import TabularDatasetFactory

In [2]:
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np
from azure.storage.blob import ContainerClient, BlobClient
import pandas as pd
from io import BytesIO
from copy import deepcopy
import GlobalVariables as gv

In [3]:
from azureml.core import LinkedService
import datetime  
from azureml.core import Workspace, LinkedService, SynapseWorkspaceLinkedServiceConfiguration

# Azure Machine Learning workspace
ws = Workspace.from_config()

linked_service = LinkedService.get(ws, 'synapselink1')

## Load data from datastore

In [5]:
dataset = Dataset.get_by_name(workspace=ws, name='customer_churn_dataset', version='latest')

df = dataset.to_pandas_dataframe()

## Pre-Processing

In [6]:
df["Price"] = pd.to_numeric(df["Price"])
df["Quantity"] = pd.to_numeric(df["Quantity"])
df["Segment"] = pd.to_numeric(df["Segment"])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354345 entries, 0 to 354344
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Segment    354345 non-null  int64  
 1   Quantity   354345 non-null  int64  
 2   Price      354345 non-null  float64
 3   StockCode  354345 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 10.8+ MB


In [8]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.3, random_state=123)

x_train = pd.DataFrame(train_df.drop(['Segment'], axis = 1))
y_train = pd.DataFrame(train_df.iloc[:,train_df.columns.tolist().index('Segment')])

x_test = pd.DataFrame(test_df.drop(['Segment'], axis = 1))
y_test = pd.DataFrame(test_df.iloc[:,test_df.columns.tolist().index('Segment')])

## Save test dataset to be used in synapse

In [9]:
test_df.to_csv('customer_churn_test_data.csv', index=False)

## Train Logistic Regression model

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

float_features = ['Price']
float_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

integer_features = ['Quantity']
integer_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['StockCode']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('float', float_transformer, float_features),
        ('integer', integer_transformer, integer_features),
        ('cat', categorical_transformer, categorical_features)
    ])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])

# Train the model
clf.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('float',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                           

In [11]:
# Evalute the model
score = clf.score(x_test, y_test)
print(score)

0.5542971101745936


## Convert the model to ONNX format
Currently, T-SQL scoring only supports ONNX model format (https://onnx.ai/).

In [12]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType, DoubleTensorType, StringTensorType

def convert_dataframe_schema(df, drop=None):
    inputs = []
    for k, v in zip(df.columns, df.dtypes):
        if drop is not None and k in drop:
            continue
        if v == 'int64':
            t = Int64TensorType([1, 1])
        elif v == 'float32':
            t = FloatTensorType([1, 1])
        elif v == 'float64':
            t = FloatTensorType([1, 1])
        else:
            t = StringTensorType([1, 1])
        inputs.append((k, t))
    return inputs

model_inputs = convert_dataframe_schema(x_train)
onnx_model = convert_sklearn(clf, "customer_churn_predict", model_inputs)

The maximum opset needed by this model is only 11.
The maximum opset needed by this model is only 1.


In [None]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

## Register the model with MLFlow

In [None]:
import mlflow
import mlflow.onnx

from mlflow.models.signature import infer_signature

experiment_name = 'customer_churn_predict_exp'
artifact_path = 'ncustomer_churn_predict_artifact'

mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(experiment_name)

with mlflow.start_run() as run:
    # Infer signature
    input_sample = x_train.head(1)
    output_sample = pd.DataFrame(columns=['output_label'], data=[1])
    signature = infer_signature(input_sample, output_sample)

    # Save the model to the outputs directory for capture
    mlflow.onnx.log_model(onnx_model, artifact_path, signature=signature, input_example=input_sample)

    # Register the model to AML model registry
    mlflow.register_model('runs:/' + run.info.run_id + '/' + artifact_path, 'customer_churn_predict')
