**This notebook predicts churn for CAB**

**Note:**
- For registered users only
- The prediction outputs a Spark DataFrame with the churn probability for each currently active company

Select companies for prediction

In [0]:
from datetime import datetime

# End date for the prediction, typically in sync with the latest data.
# Evaluated at run time, so it is the current local date and time.
pred_datetime = datetime.today()

# Start date for license activation.
# NOTE(review): the name is misspelled ("dateime"). It is left as-is because the
# notebook run in the next cell presumably reads it by this exact name — confirm
# before renaming.
start_dateime = datetime(2000, 1, 1)

# DON'T CHANGE!!! This notebook is for registered users only.
registered_only = True

Create the input dataframe using the criteria above

In [0]:
%run ./CAB_Churn_Input

In [0]:
# Accounts to score. The explicit "still active" filter is currently disabled,
# so every row of billable_account_churn_cache is kept:
#   active_account = billable_account_churn_cache.filter(col('Deactivation').isNull())
# NOTE(review): presumably the input notebook already restricts the cache to
# active accounts — confirm.
unwanted_columns = ("CompanyName", "City", "RegionName", "PostalCode", "CountryCode", "Culture")

# Remove the company name / address style columns one at a time, resolving each
# column reference against the original frame.
active_account = billable_account_churn_cache
for unwanted in [getattr(billable_account_churn_cache, name) for name in unwanted_columns]:
    active_account = active_account.drop(unwanted)

In [0]:
# Show the accounts that will be scored, after the columns above were dropped.
display(active_account)

CompanyId,InstanceId,Tier,IsSetupComplete,UserIntegrationType,AccountId,AccountName,Activation,Expiration,Deactivation,Duration,StandaloneLicenses,AddonLicenses,Churn,Incidents,Trips,Forms,Timekeeping,Orders
872,50,30,True,0,744,TECS ELECTRIC LTD,2016-01-27T00:00:00.000+0000,,,1813,28,15,0,0,0,5,4,690
965,53,10,True,0,21488,Southwest Center,2018-02-26T00:00:00.000+0000,,,1052,17,0,0,10,19994,1,0,0
2917,50,10,True,0,2778,"LADC COMPANIES, INC.",2016-03-10T00:00:00.000+0000,,,1770,10,9,0,0,1709,0,69,2
3354,53,10,True,0,23396,BAYFIELD ENTERPRISES INC,2019-10-02T00:00:00.000+0000,,,469,11,5,0,0,4116,0,0,0
3549,53,30,True,0,23575,Robi Security Service,2020-01-21T00:00:00.000+0000,,,358,5,2,0,0,0,5,11,1
8245,50,20,True,0,6598,Discount Fence USA,2017-06-27T00:00:00.000+0000,,,1296,21,11,0,5,15817,1261,428,40
9872,50,10,True,0,7910,Interstate PDX Shuttles,2017-09-28T00:00:00.000+0000,,,1203,25,0,0,0,36069,0,0,2825
11476,50,30,True,0,9228,Surfrider Foundation,2018-01-30T00:00:00.000+0000,,,1079,19,0,0,0,0,0,0,0
12142,50,10,True,0,10773,Eldredge Tracking,2018-03-08T00:00:00.000+0000,,,1042,10,0,0,1,5138,0,0,0
36507,50,30,True,0,18826,ARMSTRONG MASONRY REPAIR INC,2020-01-31T00:00:00.000+0000,,,348,6,0,0,0,0,303,2,32


Convert the Spark DataFrame to pandas for prediction

In [0]:
# Collect the Spark DataFrame to the driver as a pandas DataFrame; the feature
# selection below works on this pandas copy.
df = active_account.toPandas()

In [0]:
# Number of rows per InstanceId value, as a quick look at the data to be scored.
df['InstanceId'].value_counts()

Features to include in the final dataframe as input

In [0]:
# Columns passed to the model, split into a categorical and a numerical group.
categorical_features = ["Tier", "UserIntegrationType"]
numerical_features = [
    "StandaloneLicenses",
    "AddonLicenses",
    "Duration",
    "Incidents",
    "Trips",
    "Forms",
    "Timekeeping",
    "Orders",
]

# Model input frame: categorical columns first, then the numerical ones.
model_input_columns = [*categorical_features, *numerical_features]
df_final = df[model_input_columns]

**Churn prediction**

In [0]:
import mlflow
import numpy as np
from mlflow.tracking import MlflowClient

# Registered model to score with. It is loaded by stage, so whichever version
# currently holds that stage in the model registry is the one used here.
model_name = 'CAB-Churn-Prediction-RegisteredOnly'
# Alternative registered model: 'CAB-Churn-Prediction-RegisterOnly-RF'
stage = 'Production'
loaded_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{stage}")

# Ask the registry which version sits in that stage, so the version label
# written to the output always matches the loaded model instead of relying on a
# hand-maintained string that goes stale after a promotion.
stage_versions = MlflowClient().get_latest_versions(model_name, stages=[stage])
if not stage_versions:
    raise RuntimeError(f"No version of model '{model_name}' is in stage '{stage}'")
model_version = f"Version {stage_versions[0].version}"

# Predict: one churn score per row of df_final.
pred = loaded_model.predict(df_final)

# Scores at or above this threshold are counted as churn.
churn_threshold = 0.5

noChurn = np.extract(pred < churn_threshold, pred)
print('no churn count is ', len(noChurn))

churn = np.extract(pred >= churn_threshold, pred)
print('churn count is ', len(churn))

# Guard against an empty prediction set, which would otherwise raise
# ZeroDivisionError here.
churn_share = len(churn) / len(pred) if len(pred) else float('nan')
print('proporation of churn data ', churn_share)

**Display churn probability for each company**

In [0]:
import pandas as pd
# Import the functions module under an alias: importing `round` by name would
# shadow Python's builtin `round` for the rest of the notebook.
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, LongType


def with_column_index(sdf):
    """Return `sdf` with an extra non-nullable LongType column `ColumnIndex`.

    The value is the position assigned by `RDD.zipWithIndex`, so it reflects the
    row order of `sdf` at the time this function is evaluated.
    """
    new_schema = StructType(sdf.schema.fields + [StructField("ColumnIndex", LongType(), False)])
    return (
        sdf.rdd.zipWithIndex()
        .map(lambda indexed: indexed[0] + (indexed[1],))
        .toDF(schema=new_schema)
    )


pred_pd = pd.DataFrame(pred, columns=['Churn'])
# Predictions were computed from `df`, so there must be exactly one per row.
if len(pred_pd) != len(df):
    raise ValueError(f"Got {len(pred_pd)} predictions for {len(df)} input rows")
pred_spark = spark.createDataFrame(pred_pd, ['Churn'])

# Pair each prediction with its account by row position.
# NOTE(review): this assumes active_account yields its rows in the same order
# here as it did when it was converted to pandas for prediction. If the accounts
# have a unique key, joining on that key would be safer — confirm.
indexed_predictions = with_column_index(pred_spark)
# Drop the account-side Churn and IsSetupComplete columns before joining so the
# only Churn column in the result is the predicted probability.
indexed_accounts = with_column_index(active_account).drop('Churn', 'IsSetupComplete')

df_spark = (
    indexed_predictions
    .join(indexed_accounts, on='ColumnIndex', how='inner')
    .drop('ColumnIndex')
)

# An inner join silently drops unmatched rows; fail loudly instead of
# publishing an incomplete result.
joined_rows = df_spark.count()
if joined_rows != len(pred_pd):
    raise ValueError(f"Joined {joined_rows} rows but expected {len(pred_pd)}")

# Add model and version info to the output.
df_spark = df_spark.withColumn('ModelVersion', F.lit(model_name + ': ' + model_version))
# Round the churn probability to three decimals.
df_spark = df_spark.withColumn('Churn', F.round(F.col('Churn'), 3))

display(df_spark)

Churn,CompanyId,InstanceId,Tier,UserIntegrationType,AccountId,AccountName,Activation,Expiration,Deactivation,Duration,StandaloneLicenses,AddonLicenses,Incidents,Trips,Forms,Timekeeping,Orders,ModelVersion
0.072,872,50,30,0,744,TECS ELECTRIC LTD,2016-01-27T00:00:00.000+0000,,,1813,28,15,0,0,5,4,690,CAB-Churn-Prediction-RegisteredOnly: Version 10
0.174,11476,50,30,0,9228,Surfrider Foundation,2018-01-30T00:00:00.000+0000,,,1079,19,0,0,0,0,0,0,CAB-Churn-Prediction-RegisteredOnly: Version 10
0.055,9872,50,10,0,7910,Interstate PDX Shuttles,2017-09-28T00:00:00.000+0000,,,1203,25,0,0,36069,0,0,2825,CAB-Churn-Prediction-RegisteredOnly: Version 10
0.594,36507,50,30,0,18826,ARMSTRONG MASONRY REPAIR INC,2020-01-31T00:00:00.000+0000,,,348,6,0,0,0,303,2,32,CAB-Churn-Prediction-RegisteredOnly: Version 10
0.072,8245,50,20,0,6598,Discount Fence USA,2017-06-27T00:00:00.000+0000,,,1296,21,11,5,15817,1261,428,40,CAB-Churn-Prediction-RegisteredOnly: Version 10
0.093,965,53,10,0,21488,Southwest Center,2018-02-26T00:00:00.000+0000,,,1052,17,0,10,19994,1,0,0,CAB-Churn-Prediction-RegisteredOnly: Version 10
0.371,3354,53,10,0,23396,BAYFIELD ENTERPRISES INC,2019-10-02T00:00:00.000+0000,,,469,11,5,0,4116,0,0,0,CAB-Churn-Prediction-RegisteredOnly: Version 10
0.111,12142,50,10,0,10773,Eldredge Tracking,2018-03-08T00:00:00.000+0000,,,1042,10,0,1,5138,0,0,0,CAB-Churn-Prediction-RegisteredOnly: Version 10
0.067,2917,50,10,0,2778,"LADC COMPANIES, INC.",2016-03-10T00:00:00.000+0000,,,1770,10,9,0,1709,0,69,2,CAB-Churn-Prediction-RegisteredOnly: Version 10
0.857,3549,53,30,0,23575,Robi Security Service,2020-01-21T00:00:00.000+0000,,,358,5,2,0,0,5,11,1,CAB-Churn-Prediction-RegisteredOnly: Version 10
