In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
df_churn = spark.read.csv(path='data/customer_churn.csv', inferSchema=True, header=True)

In [5]:
df_churn.show()

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|      Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|     

In [7]:
df_churn.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [8]:
df_churn.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'], 
                            outputCol='features')

In [11]:
df_churn_vec = assembler.transform(df_churn)

In [12]:
df_churn_vec.show()

+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+
|              Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|            features|
+-------------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+--------------------+
|   Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|[42.0,11066.8,0.0...|
|      Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|[41.0,11916.22,0....|
|        Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|[38.0,12884.75,0....|
|      Phillip White|4

In [13]:
df_churn_final = df_churn_vec.select(['features', 'churn'])
df_churn_final.show(2)

+--------------------+-----+
|            features|churn|
+--------------------+-----+
|[42.0,11066.8,0.0...|    1|
|[41.0,11916.22,0....|    1|
+--------------------+-----+
only showing top 2 rows



In [14]:
df_churn_final_train, df_churn_final_test = df_churn_final.randomSplit([0.7, 0.3])

In [15]:
from pyspark.ml.classification import LogisticRegression

In [16]:
log_regressor = LogisticRegression(featuresCol='features', labelCol='churn')

In [17]:
mod_lr = log_regressor.fit(df_churn_final_train)

In [18]:
mod_lr_summary = mod_lr.summary

In [19]:
mod_lr_summary.predictions.show()



+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[22.0,11254.38,1....|  0.0|[4.78685456652763...|[0.99173031315399...|       0.0|
|[25.0,9672.03,0.0...|  0.0|[4.88455957706289...|[0.99249430774345...|       0.0|
|[26.0,8787.39,1.0...|  1.0|[0.50296970433128...|[0.62315696852276...|       0.0|
|[27.0,8628.8,1.0,...|  0.0|[5.44320168321928...|[0.99569301849240...|       0.0|
|[28.0,8670.98,0.0...|  0.0|[8.1586143187073,...|[0.99971382313709...|       0.0|
|[28.0,11128.95,1....|  0.0|[4.29558459950131...|[0.98655463937594...|       0.0|
|[28.0,11245.38,0....|  0.0|[3.87901296853242...|[0.97974742709614...|       0.0|
|[29.0,5900.78,1.0...|  0.0|[3.88125950912930...|[0.97979195585662...|       0.0|
|[29.0,8688.17,1.0...|  1.0|[2.58412391255652...|[0.92983280606077...|       0.0|
|[29.0,9378.24,0

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [21]:
eval_mod_lr = mod_lr.evaluate(df_churn_final_test)

In [22]:
eval_mod_lr.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8939.61,0.0...|    0|[6.68611172524689...|[0.99875342946903...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.39376514084221...|[0.80119264620938...|       0.0|
|[28.0,11204.23,0....|    0|[2.24507300447382...|[0.90422469445262...|       0.0|
|[29.0,10203.18,1....|    0|[3.74970546961188...|[0.97701601712736...|       0.0|
|[29.0,12711.15,0....|    0|[5.72332583049312...|[0.99674182911982...|       0.0|
|[29.0,13255.05,1....|    0|[4.40902347515770...|[0.98797920371604...|       0.0|
|[30.0,7960.64,1.0...|    1|[3.29446242173928...|[0.96423834989080...|       0.0|
|[30.0,8677.28,1.0...|    0|[3.89527023312936...|[0.98006750687014...|       0.0|
|[30.0,8874.83,0.0...|    0|[3.27973347571876...|[0.96372696780771...|       0.0|
|[30.0,10183.98,



In [23]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='churn')

In [24]:
auc_metric = evaluator.evaluate(eval_mod_lr.predictions)



In [25]:
auc_metric

0.735542040119505

### Predict on New Data

In [27]:
mod_lr_full = log_regressor.fit(df_churn_final)

In [30]:
df_new_customers = spark.read.csv('data/new_customers.csv', inferSchema=True, header=True)

In [31]:
df_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [32]:
df_new_customers_vec = assembler.transform(df_new_customers)

In [33]:
df_new_customers_vec.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [34]:
df_new_customers_results = mod_lr_full.transform(df_new_customers_vec)

In [35]:
df_new_customers_results.show()

+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+--------------------+--------------------+----------+
|         Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|         Company|            features|       rawPrediction|         probability|prediction|
+--------------+----+--------------+---------------+-----+---------+-------------------+--------------------+----------------+--------------------+--------------------+--------------------+----------+
| Andrew Mccall|37.0|       9935.53|              1| 7.71|      8.0|2011-08-29 18:37:54|38612 Johnny Stra...|        King Ltd|[37.0,9935.53,1.0...|[2.22168680572547...|[0.90218015921764...|       0.0|
|Michele Wright|23.0|       7526.94|              1| 9.28|     15.0|2013-07-22 18:19:54|21083 Nicole Junc...|   Cannon-Benson|[23.0,7526.94,1.0...|[-6.2207539991844...|[0.00198380259784...|       

In [36]:
df_new_customers_results.select('Company', 'prediction').show()

+----------------+----------+
|         Company|prediction|
+----------------+----------+
|        King Ltd|       0.0|
|   Cannon-Benson|       1.0|
|Barron-Robertson|       1.0|
|   Sexton-Golden|       1.0|
|        Wood LLC|       0.0|
|   Parks-Robbins|       1.0|
+----------------+----------+

