Churn prediction
-------------------
Company churn occurs when a company cancels its subscription or lets it expire without renewal.  
This notebook uses a decision tree (scikit-learn `DecisionTreeClassifier`) to predict churn.

In [2]:
%run /Common/config_sandbox

In [3]:
def _load_delta(path):
    """Read the Delta table stored at `path` into a Spark DataFrame."""
    return spark.read.format('delta').load(path)


# Source tables from the sandbox sample data sets.
Companies = _load_delta("/mnt/sandboxes/SampleDataSets/All/Companies_All")
Accounts = _load_delta("/mnt/sandboxes/SampleDataSets/All/Accounts_All")
PaymentInfo = _load_delta("/mnt/sandboxes/SampleDataSets/All/PaymentInfo_All")
Licenses = _load_delta("/mnt/sandboxes/SampleDataSets/All/Licenses_All")
Trips = _load_delta("/mnt/sandboxes/SampleDataSets/ByCompany/Trips_SampledByCompany_All")
FormHeader6Month = _load_delta("/mnt/sandboxes/SampleDataSets/All/FormHeader6Month")

**Create the churn dataframe for modeling:**
1. Find billable Companies.
2. Aggregate Licenses to company level.
3. Get the churn dataframe by joining billable companies with aggregated licenses.
4. TODO: aggregate trips and form submissions to the churn dataframe.

In [5]:
from pyspark.sql.functions import sum, min, max, col, current_date, udf
from pyspark.sql.types import *

# Companies joined to their account and payment info on AccountId, keeping only
# rows where PaymentInfo.Billable is true and Accounts.IsTest is false.
BillableCompanies = (
    Companies
    .join(Accounts, Companies.AccountId == Accounts.AccountId)
    .join(PaymentInfo, Companies.AccountId == PaymentInfo.AccountId)
    .where(PaymentInfo.Billable == True)
    .where(Accounts.IsTest == False)
)

In [6]:
from datetime import datetime

# Roll licenses up to one row per company. `sum`, `min` and `max` are the
# pyspark.sql.functions aggregates imported earlier, not the Python builtins.
LicensesByCompany = (
    Licenses
    .groupBy('CompanyId')
    .agg(
        sum('Count').alias('Licenses'),
        min('CreatedOn').alias('Activation'),
        max('ExpirationDate').alias('Expiration'),
        max('DeactDate').alias('Deactivation'),
    )
    .filter('Activation is not null')
)

# Reference date used to decide whether a company has churned.
# Ideally this would be today, but the sandbox data is so old that every
# company would then count as churned, so a fixed date in the past is used.
pred_datetime = datetime(year=2019, month=1, day=1)

@udf(returnType=BooleanType())
def isChurned(colDeactivation):
  """Return True when the deactivation value is set and earlier than `pred_datetime`.

  A missing (None) deactivation yields False.
  NOTE(review): the comparison assumes the value arrives as a `datetime`
  (i.e. a timestamp column) -- confirm against the Licenses schema.
  """
  return colDeactivation is not None and colDeactivation < pred_datetime

#isChurned_udf = udf(isChurned, BooleanType())

@udf(returnType=IntegerType())
def licenseDuration(colActivation, colDeactivation):
  """Return the license duration in whole days, measured up to `pred_datetime`.

  Returns -1 when the activation value is missing or not earlier than
  `pred_datetime`. Otherwise the duration runs from activation to the
  deactivation value if that is set and earlier than `pred_datetime`,
  and from activation to `pred_datetime` if not.
  """
  # Guard clause: no usable activation before the reference date.
  if colActivation is None or not (colActivation < pred_datetime):
    return -1

  deactivated_before_cutoff = colDeactivation is not None and colDeactivation < pred_datetime
  period_end = colDeactivation if deactivated_before_cutoff else pred_datetime
  return (period_end - colActivation).days

# Alternative churn flag based on the current date, kept for reference:
#ChurnByCompany = LicensesByCompany.withColumn('Churn', col('Deactivation').isNotNull() & (col('Deactivation') < current_date()))
ChurnByCompany = (
    LicensesByCompany
    .withColumn('Duration', licenseDuration(col('Activation'), col('Deactivation')))
    .withColumn('Churn', isChurned(col('Deactivation')))
)
# Drop rows where licenseDuration returned its -1 sentinel.
ChurnByCompanyX = ChurnByCompany.where(ChurnByCompany.Duration != -1)


In [7]:
# Company attributes joined with the per-company license/churn aggregates.
# The duplicate CompanyId coming from ChurnByCompanyX is dropped after the join.
BillableCompaniesWithLicenses = (
    BillableCompanies
    .select(
        'CompanyId', 'CompanyName', 'Tier',
        col('SetupCompletionTime').alias('IsSetupComplete'),
        'UserIntegrationType', 'City', 'RegionName', 'PostalCode', 'CountryCode',
        'CreationDate', 'ModificationDate', 'CarrierId', 'Culture', 'DynamicsGuid',
    )
    .join(ChurnByCompanyX, ChurnByCompanyX.CompanyId == BillableCompanies.CompanyId)
    .drop(ChurnByCompanyX.CompanyId)
)

# Turn the setup completion time into a flag: True when a value is present.
BillableCompaniesWithLicenses = BillableCompaniesWithLicenses.withColumn(
    'IsSetupComplete',
    BillableCompaniesWithLicenses.IsSetupComplete.isNotNull(),
)

# Reorder columns by moving the last column (Churn) to the front.
# NOTE: as written, the slice `originalColumns[:total-2]` stops before index
# total-2, so re-enabling this code would also drop the second-to-last column;
# `[:total-1]` would keep every remaining column.
#originalColumns = list(BillableCompaniesWithLicenses.columns)
#total = len(originalColumns)
#newColumns = [originalColumns[total-1]] + originalColumns[:total-2]
#BillableCompaniesWithLicenses = BillableCompaniesWithLicenses[newColumns]

#display(BillableCompaniesWithLicenses.filter('Churn = false'))
# Render the joined modeling dataframe.
display(BillableCompaniesWithLicenses)

In [8]:
# Model inputs, grouped by how they are preprocessed, plus the label column.
categorical_features = [
    "Tier",
    "IsSetupComplete",
    "UserIntegrationType",
]
numerical_features = [
    "Licenses",
    "Duration",
]
target = "Churn"

#display(BillableCompaniesWithLicenses[numerical_features].describe())

In [9]:
import platform
import pandas as pd
import sklearn
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Optional: persist the Spark dataframe as CSV.
#BillableCompaniesWithLicenses.write.format("csv").save("/mnt/sandboxes/BillX/billable_companies_with_licenses.csv")

# Convert the Spark dataframe to pandas, then report its shape and column
# names, one per output line.
df = BillableCompaniesWithLicenses.toPandas()
print(df.shape, list(df.columns), sep='\n')
    

In [10]:
# Summary statistics for every column, not just the numeric ones.
df.describe(include="all")

In [11]:
# One bar chart per categorical feature showing how often each value occurs.
fig, ax = plt.subplots(1, 3, figsize=(14, 4))
for i, categorical_feature in enumerate(categorical_features):
    value_counts = df[categorical_feature].value_counts()
    panel = value_counts.plot(kind='bar', ax=ax[i], rot=0)
    panel.set_title(categorical_feature)

fig.tight_layout(pad=3.0)

In [12]:
# Distribution of each numerical feature, split by churn status.
# Rows: one feature per row. Columns: not churned (blue, left) vs. churned (red, right).
# Every panel uses the same bin count so the left and right histograms of a
# feature can be compared directly (the first panel previously used 3 bins
# while the other three used 30).
HIST_BINS = 30

fig, ax = plt.subplots(2, 2, figsize=(14, 6))

histogram_rows = [("Licenses", "Licenses"), ("Duration", "Duration(days)")]
for row, (column, title) in enumerate(histogram_rows):
    df[df.Churn == False][column].hist(bins=HIST_BINS, color="blue", alpha=0.5, ax=ax[row, 0]).set_title(title)
    df[df.Churn == True][column].hist(bins=HIST_BINS, color="red", alpha=0.5, ax=ax[row, 1]).set_title(title)

fig.tight_layout(pad=3.0)

#display(fig)

In [13]:
# Value counts of one feature, shown separately for not-churned and churned companies.
feature = 'IsSetupComplete'
fig, ax = plt.subplots(1, 2, figsize=(14, 4))

not_churned_counts = df[df.Churn == False][feature].value_counts()
churned_counts = df[df.Churn == True][feature].value_counts()

not_churned_counts.plot(kind='bar', ax=ax[0], rot=0).set_title('not churned')
churned_counts.plot(kind='bar', ax=ax[1], rot=0).set_title('churned')

**LabelEncoder** is applied to the categorical features here only for EDA. Later on, in the pipeline, we will use the **OneHot encoder.**    
**LabelEncoder** maps categories to plain integers, which can mislead a model into treating them as ordered values (see [the article here](https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621)).

In [15]:
from sklearn.preprocessing import LabelEncoder

# Label-encode each categorical feature and the target in place on `df`,
# keeping the fitted encoder per column. For the features (not the target),
# also collect "<column>_<class>" names, one per encoded class.
categorical_feature_names = []
label_encoders = {}
for column in categorical_features + [target]:
    encoder = LabelEncoder()
    label_encoders[column] = encoder
    df[column] = encoder.fit_transform(df[column])
    names = encoder.classes_.tolist()
    print('Label encoder %s - values: %s' % (column, names))
    if column != target:
        categorical_feature_names.extend([column + '_' + str(name) for name in names])
    

In [16]:
import seaborn as sb

# corr() only works on numerical columns, so the categorical features and the
# target must already have been converted to numbers with LabelEncoder.
corr_columns = [*categorical_features, target, *numerical_features]
df_corr = df[corr_columns]

# Show the correlation matrix.
df_corr.corr()


In [17]:
# Draw the correlation matrix as a heatmap.
corr_matrix = df_corr.corr()
sb.heatmap(corr_matrix, square=True, cmap='RdYlGn')

In [18]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects part of its input via `df[key]`.

    Used as the first step of each sub-pipeline so that later steps only
    receive the columns named in `key`.
    """
    def __init__(self, key):
        """Store `key`, the index passed to `df[...]` in `transform`."""
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return `self`; both `x` and `y` are ignored."""
        return self

    def transform(self, df):
        """Return `df[self.key]`."""
        return df[self.key]

# Modeling pipeline:
#   1. "union" builds the feature matrix from two branches side by side:
#        - categorical_features: select the categorical columns, one-hot encode them
#        - numerical_features:   select the numerical columns, standardize them
#   2. "classifier" fits a depth-limited decision tree on that matrix.
#
# The one-hot encoder uses handle_unknown="ignore" so that a category value
# that appears only in the data passed to predict()/score() (for example a rare
# value that landed entirely in the test split) is encoded as all zeros
# instead of raising an error, which is what the default does.
pipeline = Pipeline(
    [
        (
            "union",
            FeatureUnion(
                transformer_list=[
                    (
                        "categorical_features",
                        Pipeline(
                            [
                                ("selector", ItemSelector(key=categorical_features)),
                                ("onehot", OneHotEncoder(handle_unknown="ignore")),
                            ]
                        ),
                    ),
                    (
                        "numerical_features",
                        Pipeline(
                            [
                                ("selector", ItemSelector(key=numerical_features)),
                                ("scaler", StandardScaler()),
                            ]
                        ),
                    ),
                ]
            ),
        ),
        ("classifier", tree.DecisionTreeClassifier(max_depth=5, random_state=42)),
    ]
)

In [19]:
# Spark-side 10% sampling alternative, kept for reference:
#BillableCompaniesWithLicenses_SampledDF = BillableCompaniesWithLicenses.sample(False, 0.1, 42)

# frac=1 without replacement keeps every row and only shuffles their order;
# the fixed seed makes the shuffle repeatable.
df_sampled = df.sample(frac=1, replace=False, random_state=42)

df_sampled.shape
#display(df_sampled)

In [20]:
# training the model
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(df_sampled, test_size=0.25, random_state=42)

# The whole frame is passed in; the pipeline's selectors pick out the feature
# columns, so the target column is not used as an input.
clf = pipeline.fit(df_train, df_train[target])
pred = pipeline.predict(df_test)

# The pipeline's final step is a DecisionTreeClassifier, so report it as such
# (the message previously said "logistic regression").
print('Accuracy of decision tree classifier on test set: {:.2f}'.format(pipeline.score(df_test, df_test[target])))

In [21]:
from sklearn.metrics import classification_report

# Per-class metrics for the test-set predictions.
report = classification_report(df_test[target], pred)
print(report)

In [22]:
import graphviz

# Export the fitted decision tree to Graphviz DOT text.
# Feature names are the label-encoder class names of the categorical features
# followed by the numerical feature names.
# NOTE(review): this presumes the one-hot columns line up one-to-one with
# `categorical_feature_names` -- confirm if a category can be absent from the
# training split.
fitted_tree = pipeline.named_steps['classifier']
tree_feature_names = categorical_feature_names + numerical_features
tree_class_names = list(map(str, fitted_tree.classes_))

dot_data = tree.export_graphviz(
    fitted_tree,
    out_file=None,
    feature_names=tree_feature_names,
    class_names=tree_class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = graphviz.Source(dot_data)
# Print the DOT source as well, because rendering the graph below fails in Databricks.
print(graph.source)

# Evaluating `graph` is supposed to render the image in a Jupyter notebook, but it
# does not work in Databricks. Calling
# graph.render(filename='/mnt/sandboxes/BillX/churn_tree.gv', view=True) shows that it
# tries to save a copy of the file and then view it; in Databricks a permission
# error occurs when the file is saved for display.

graph
