### Use logistic regression to analyse customer churn problem

In [3]:
import findspark
findspark.init('/home/bowen/spark-2.4.4-bin-hadoop2.7/')
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('customer_churn').getOrCreate()

In [5]:
fn = './customer_churn.csv'

In [6]:
df = spark.read.csv(fn, inferSchema=True, header=True)

In [10]:
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [11]:
df.head(1)

[Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.8, Account_Manager=0, Years=7.22, Num_Sites=8.0, Onboard_date=datetime.datetime(2013, 8, 30, 7, 0, 40), Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1)]

In [12]:
df.describe().show()
#all counts are 900, no missing data

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [23]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

In [15]:
vc_assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Years', 'Num_Sites'], outputCol='features')
features_df = vc_assembler.transform(df).select('features', 'churn')
train_set, test_set = features_df.randomSplit([0.7,0.3])

In [50]:
reg_churn = LogisticRegression(labelCol='churn', maxIter=100)

In [51]:
churn_model = reg_churn.fit(train_set)

In [52]:
churn_model.summary.areaUnderROC

0.9069892676290537

In [53]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
results = churn_model.evaluate(test_set)

In [54]:
results.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,4....|    0|[4.35763709722609...|[0.98735336842757...|       0.0|
|[28.0,9090.43,5.7...|    0|[1.17364659021995...|[0.76380352143067...|       0.0|
|[28.0,11245.38,6....|    0|[3.14576167023117...|[0.95874139460064...|       0.0|
|[29.0,9378.24,4.9...|    0|[4.11074465931319...|[0.98386891727647...|       0.0|
|[29.0,12711.15,5....|    0|[4.95467824799308...|[0.99299901103274...|       0.0|
|[30.0,12788.37,4....|    0|[2.00313515427649...|[0.88112585628238...|       0.0|
|[30.0,13473.35,3....|    0|[2.28725205543223...|[0.90781574226732...|       0.0|
|[31.0,5387.75,6.8...|    0|[1.62529269308329...|[0.83552376405049...|       0.0|
|[32.0,5756.12,5.9...|    0|[3.38466741699203...|[0.96722190268630...|       0.0|
|[32.0,6367.22,2

In [55]:
churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='churn')

In [56]:
auc = churn_eval.evaluate(results.predictions, {churn_eval.metricName: "areaUnderROC"})
print (auc)

0.7773613870326892


In [57]:
new_customer_df = spark.read.csv('new_customers.csv', inferSchema=True, header=True)

In [59]:
test_new_customer_features = vc_assembler.transform(new_customer_df)

In [60]:
test_new_customer_features.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [62]:
predicted_results = churn_model.transform(test_new_customer_features)

In [71]:
predicted_results.select('Names', 'Company', 'prediction').show()

+--------------+----------------+----------+
|         Names|         Company|prediction|
+--------------+----------------+----------+
| Andrew Mccall|        King Ltd|       0.0|
|Michele Wright|   Cannon-Benson|       1.0|
|  Jeremy Chang|Barron-Robertson|       1.0|
|Megan Ferguson|   Sexton-Golden|       1.0|
|  Taylor Young|        Wood LLC|       0.0|
| Jessica Drake|   Parks-Robbins|       1.0|
+--------------+----------------+----------+

