This module is part of churn prediction, and creates Pandas dataframe for Churn.

In [2]:
%run /Common/config_sandbox

In [3]:
Companies = spark.read.format('delta').load("/mnt/sandboxes/SampleDataSets/All/Companies_All")
Accounts = spark.read.format('delta').load("/mnt/sandboxes/SampleDataSets/All/Accounts_All")
PaymentInfo = spark.read.format('delta').load("/mnt/sandboxes/SampleDataSets/All/PaymentInfo_All")
Licenses = spark.read.format('delta').load("/mnt/sandboxes/SampleDataSets/All/Licenses_All")
Trips = spark.read.format('delta').load("/mnt/sandboxes/SampleDataSets/ByCompany/Trips_SampledByCompany_All")
FormHeader6Month = spark.read.format('delta').load("/mnt/sandboxes/SampleDataSets/All/FormHeader6Month")

**Create the churn dataframe for modeling:**
- Find billable Companies.
- Aggregate Licenses to company level.
- Get the churn dataframe by joining billable companies with aggregated licenses.
- TODO: aggregate trips and form submissions to the churn dataframe.

In [5]:
from pyspark.sql.functions import sum, min, max, col, current_date, udf
from pyspark.sql.types import *

BillableCompanies = Companies.join(Accounts, Companies.AccountId == Accounts.AccountId) \
                             .join(PaymentInfo, Companies.AccountId == PaymentInfo.AccountId) \
                             .filter(PaymentInfo.Billable == True) \
                             .filter(Accounts.IsTest == False)

In [6]:
from datetime import datetime

LicensesByCompany = Licenses.groupBy('CompanyId').agg(sum('Count').alias('Licenses'), min('CreatedOn').alias('Activation'), max('ExpirationDate').alias('Expiration'), max('DeactDate').alias('Deactivation')) \
                    .filter('Activation is not null')

# We need a date to determine a company churn or not.
# In theory, it should be today, but sandbox data are too
# old to have no-churn. So we define a pred_datetime from the past.
pred_datetime = datetime(2019, 1, 1)

@udf(returnType=IntegerType())
def isChurned(colDeactivation):
  if colDeactivation is not None and colDeactivation < pred_datetime:
     return 1
  return 0

#isChurned_udf = udf(isChurned, BooleanType())

@udf(returnType=IntegerType())
def licenseDuration(colActivation, colDeactivation):
  if colActivation is not None and colActivation < pred_datetime:
    if colDeactivation is not None and colDeactivation < pred_datetime:
      delta = colDeactivation - colActivation
      return delta.days
    else:
      delta = pred_datetime - colActivation
      return delta.days 
  return -1

#ChurnByCompany = LicensesByCompany.withColumn('Churn', col('Deactivation').isNotNull() & (col('Deactivation') < current_date()))
ChurnByCompany = LicensesByCompany.withColumn('Duration', licenseDuration(col('Activation'), col('Deactivation'))).withColumn('Churn', isChurned(col('Deactivation')))
ChurnByCompanyX = ChurnByCompany.filter(ChurnByCompany.Duration != -1)

In [7]:
BillableCompaniesWithLicenses = BillableCompanies.select(col('CompanyId'), col('CompanyName'), col('Tier'), col('SetupCompletionTime').alias('IsSetupComplete'), \
                                                         col('UserIntegrationType'), col('City'), col('RegionName'), col('PostalCode'), col('CountryCode'), \
                                                         col('CreationDate'), col('ModificationDate'), col('CarrierId'), col('Culture'), col('DynamicsGuid')) \
                                                 .join(ChurnByCompanyX, ChurnByCompanyX.CompanyId == BillableCompanies.CompanyId).drop(ChurnByCompanyX.CompanyId)

BillableCompaniesWithLicenses = BillableCompaniesWithLicenses.withColumn('IsSetupComplete', BillableCompaniesWithLicenses.IsSetupComplete.isNotNull())
display(BillableCompaniesWithLicenses)

CompanyId,CompanyName,Tier,IsSetupComplete,UserIntegrationType,City,RegionName,PostalCode,CountryCode,CreationDate,ModificationDate,CarrierId,Culture,DynamicsGuid,Licenses,Activation,Expiration,Deactivation,Duration,Churn
817,Culinary Stainless Fabricators,20,True,0,Tracy,CA,95376,US,2017-08-08T01:51:16.340+0000,2016-01-20T19:32:23.320+0000,2.0,en-US,83daecde-7477-e611-80f2-00155d002f2e,28,2016-01-20T00:00:00.000+0000,2019-09-09,2019-09-12T00:13:54.490+0000,1077,0
1146,Spicers Lawncare,20,True,0,Preston,Connecticut,06365,US,2017-08-08T01:51:16.340+0000,2016-03-10T21:35:43.440+0000,2.0,en-US,ecf9315d-7577-e611-80f2-00155d002f2e,6,2016-03-03T00:00:00.000+0000,2016-04-04,2016-09-10T13:07:17.520+0000,191,1
1894,Home Media Innovations,30,True,0,Hudson,Minnesota,54016,US,2017-08-08T01:51:16.340+0000,2016-05-26T23:25:47.970+0000,2.0,en-US,a8687760-7677-e611-80f2-00155d002f2e,13,2016-05-12T00:00:00.000+0000,2016-12-14,2016-12-17T00:12:35.040+0000,219,1
2112,Donnamac Enterprises Inc,10,True,0,Largo,Florida,33773,US,2017-08-08T01:51:16.340+0000,2018-12-07T00:46:50.550+0000,2.0,en-US,b634cf9c-7677-e611-80f2-00155d002f2e,7,2016-06-10T00:00:00.000+0000,2019-04-21,2019-04-21T00:14:51.670+0000,935,0
2162,Walker Sand & Stone INC.,30,True,0,King George,Virginia,22485,US,2017-08-08T01:51:16.340+0000,2016-06-16T16:00:55.010+0000,2.0,en-US,142fdaae-7677-e611-80f2-00155d002f2e,10,2016-06-10T00:00:00.000+0000,2016-07-08,2016-09-10T13:12:11.820+0000,92,1
2472,All States Landscaping,20,True,0,Draper,Utah,84020,US,2017-08-08T01:51:16.340+0000,2017-10-10T14:35:57.970+0000,2.0,en-US,e7811bfd-7677-e611-80f2-00155d002f2e,30,2016-07-13T00:00:00.000+0000,2017-01-28,2017-01-31T00:01:59.210+0000,202,1
2648,Finca González,30,True,0,Ensenada,Florida,00647,US,2017-08-08T01:51:16.340+0000,2016-08-10T16:12:41.750+0000,2.0,en-US,0b86e03f-7777-e611-80f2-00155d002f2e,11,2016-08-10T00:00:00.000+0000,2016-11-14,2016-11-17T00:01:13.980+0000,99,1
2657,Critical Intervention Patrol Inc,30,True,0,Napa,California,94559,US,2017-08-08T01:51:16.340+0000,2016-08-11T20:37:44.460+0000,2.0,en-US,97b5e345-7777-e611-80f2-00155d002f2e,6,2016-08-11T00:00:00.000+0000,2016-12-14,2016-12-17T00:12:38.550+0000,128,1
2657,Critical Intervention Patrol Inc,30,True,0,Napa,California,94559,US,2017-08-08T01:51:16.340+0000,2016-08-11T20:37:44.460+0000,2.0,en-US,97b5e345-7777-e611-80f2-00155d002f2e,6,2016-08-11T00:00:00.000+0000,2016-12-14,2016-12-17T00:12:38.550+0000,128,1
2907,Synthecorp LLC,10,True,0,Ironton,,45638,US,2017-08-08T01:51:16.340+0000,2016-12-08T15:21:29.830+0000,2.0,en-US,0072c89e-3177-e611-80f2-00155d002f2e,4,2016-03-03T00:00:00.000+0000,2019-07-13,2019-07-13T00:16:50.210+0000,1034,0


**Define features and target**

In [9]:
categorical_features = ["Tier", "IsSetupComplete", "UserIntegrationType"]
numerical_features = ["Licenses", "Duration"]
target="Churn"

**Churn dataframe in Pandas**

In [11]:
import platform
import pandas as pd
import sklearn
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

# save dataframe as csv
#BillableCompaniesWithLicenses.write.format("csv").save("/mnt/sandboxes/BillX/billable_companies_with_licenses.csv")

# get pandas dataframe
df = BillableCompaniesWithLicenses.toPandas()
