<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-data" data-toc-modified-id="Load-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Modelling" data-toc-modified-id="Modelling-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Modelling</a></span></li><li><span><a href="#Model-evaluation" data-toc-modified-id="Model-evaluation-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Model evaluation</a></span></li><li><span><a href="#Pedict-on-new-data" data-toc-modified-id="Pedict-on-new-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Pedict on new data</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('logreg').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


In [None]:
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.feature import OneHotEncoder,OneHotEncoderEstimator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline

In [3]:
# classifiers
from pyspark.ml.classification import LogisticRegression

In [4]:
# cross validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel

In [5]:
# model evaluation
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Load data

In [2]:
!head -2 ../data/customer_churn.csv

Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn
Cameron Williams,42.0,11066.8,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1


In [3]:
data = spark.read.csv('../data/customer_churn.csv', header=True, inferSchema=True)
print(data.count())
data.show()

900
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
| 

In [4]:
data.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [6]:
print(data.columns)

['Names', 'Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites', 'Onboard_date', 'Location', 'Company', 'Churn']


In [13]:
inputCols = ['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites']

assembler = VectorAssembler(inputCols=inputCols,outputCol='features')

In [14]:
output = assembler.transform(data)

In [15]:
final_data = output.select('features','churn')

In [16]:
train, test = final_data.randomSplit([0.7, 0.3])

# Modelling

In [17]:
lr = LogisticRegression(labelCol='churn')
lr_model = lr.fit(train)

In [18]:
train_summary = lr_model.summary
train_summary.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                629|                629|
|   mean|0.16534181240063592|0.12400635930047695|
| stddev| 0.3717844118596229|0.32985111477949913|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



# Model evaluation

In [19]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [20]:
pred_and_labels = lr_model.evaluate(test)

In [21]:
pred_and_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8939.61,0.0...|    0|[6.48038626958281...|[0.99846912883041...|       0.0|
|[28.0,8670.98,0.0...|    0|[7.85875373729749...|[0.99961379414121...|       0.0|
|[28.0,11245.38,0....|    0|[3.82379142207965...|[0.97862217402491...|       0.0|
|[29.0,10203.18,1....|    0|[3.74079377484389...|[0.97681504560612...|       0.0|
|[29.0,11274.46,1....|    0|[4.46811092982229...|[0.98866108450863...|       0.0|
|[30.0,6744.87,0.0...|    0|[3.60100686844134...|[0.97342906141636...|       0.0|
|[30.0,8677.28,1.0...|    0|[4.10335561897855...|[0.98375122652384...|       0.0|
|[30.0,8874.83,0.0...|    0|[3.26331655575499...|[0.96314868717816...|       0.0|
|[31.0,5304.6,0.0,...|    0|[3.51942263672633...|[0.97123537853759...|       0.0|
|[31.0,10182.6,1

In [22]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='churn')

In [24]:
auc = my_eval.evaluate(pred_and_labels.predictions)
auc

0.7734782608695652

# Pedict on new data

In [26]:
lr = LogisticRegression(labelCol='churn')
lr_model = lr.fit(final_data)

In [27]:
new_customers = spark.read.csv('../data/new_customers.csv', header=True, inferSchema=True)
print(new_customers.count())
new_customers.show()

6
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|
|  Jeremy Chang|65.0|         100.0|              1|  1.0|     15.0|2006-12-11 07:48:13|085 Austin Views ...|Barron-Robertson|
|Megan Ferguson|32.0|        6487.5|              0|  9.4|     14.0|2016-10-28 05:32:13|922 Wright Branch...|   Sexton-Golden|
|  Taylor Young|32.0|      13147.71|              1| 10.0|      8.0|2012-03-20 00:36:46|Unit 0789 Box 073...|

In [28]:
new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [30]:
test_new_customers = assembler.transform(new_customers) #  use same assembler from before
test_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [31]:
results = lr_model.transform(test_new_customers)

In [33]:
results.select('Company', 'prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

