## Logistic Regression with PySpark: Customer Churn

In [1]:
from pyspark.sql import SparkSession

In [11]:
spark = SparkSession.builder \
                    .appName("churn") \
                    .getOrCreate()

In [12]:
df = spark.read.csv("customer_churn.csv", inferSchema=True, header=True)

In [13]:
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [14]:
df.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         NULL|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                NULL|                NULL|0.16666666666666666|
| stddev|         NULL|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [7]:
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [8]:
from pyspark.ml.feature import VectorAssembler

In [15]:
assembler = VectorAssembler(inputCols=[
    'Total_Purchase',
    'Account_Manager',
    'Years',
    'Num_Sites' 
],
outputCol="features")

In [16]:
output = assembler.transform(df)

In [17]:
df_final = output.select("features", "churn")

In [18]:
df_final.show()

+--------------------+-----+
|            features|churn|
+--------------------+-----+
|[11066.8,0.0,7.22...|    1|
|[11916.22,0.0,6.5...|    1|
|[12884.75,0.0,6.6...|    1|
|[8010.76,0.0,6.71...|    1|
|[9191.58,0.0,5.56...|    1|
|[10356.02,0.0,5.1...|    1|
|[11331.58,1.0,5.2...|    1|
|[9885.12,1.0,6.92...|    1|
|[14062.6,1.0,5.46...|    1|
|[8066.94,1.0,7.11...|    1|
|[11575.37,1.0,5.2...|    1|
|[8771.02,1.0,6.64...|    1|
|[8988.67,1.0,4.84...|    1|
|[8283.32,1.0,5.1,...|    1|
|[6569.87,1.0,4.3,...|    1|
|[10494.82,1.0,6.8...|    1|
|[8213.41,1.0,7.35...|    1|
|[11226.88,0.0,8.0...|    1|
|[5515.09,0.0,6.85...|    1|
|[8046.4,1.0,5.69,...|    1|
+--------------------+-----+
only showing top 20 rows



In [19]:
train, test = df_final.randomSplit([0.7, 0.3], seed=42)

In [20]:
from pyspark.ml.classification import LogisticRegression

In [21]:
lr = LogisticRegression(labelCol="churn")

In [22]:
lrm = lr.fit(train)

25/07/15 00:55:28 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [24]:
lrm.summary

<pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary at 0x73e5339ff340>

In [25]:
lrm_summary = lrm.summary

In [27]:
lrm_summary.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[100.0,0.0,5.27,5.0]|  0.0|[7.68039716361023...|[0.99953842169732...|       0.0|
|[3263.0,1.0,2.77,...|  0.0|[3.79790878847895...|[0.97807392717046...|       0.0|
|[3689.95,1.0,5.01...|  0.0|[-0.0852827645067...|[0.47869222186350...|       1.0|
|[3825.7,0.0,4.28,...|  0.0|[4.56197343219237...|[0.98966646378940...|       0.0|
|[4111.4,0.0,3.93,...|  0.0|[4.77939035707564...|[0.99166887171148...|       0.0|
|[4492.44,1.0,6.43...|  0.0|[1.48647065390089...|[0.81554794599693...|       0.0|
|[4711.89,0.0,4.97...|  0.0|[5.35606473980586...|[0.99530272666451...|       0.0|
|[4762.81,1.0,5.12...|  0.0|[1.07167532017754...|[0.74491538482445...|       0.0|
|[4771.65,0.0,3.77...|  1.0|[-0.0689054937107...|[0.48278043919219...|       1.0|
|[5002.58,0.0,4.

In [28]:
lrm_summary.predictions.describe().show()

+-------+------------------+-------------------+
|summary|             churn|         prediction|
+-------+------------------+-------------------+
|  count|               667|                667|
|   mean|0.1634182908545727|0.12893553223388307|
| stddev|0.3700243606477148| 0.3353800771333429|
|    min|               0.0|                0.0|
|    max|               1.0|                1.0|
+-------+------------------+-------------------+



In [30]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [31]:
pred_labels = lrm.evaluate(test)

In [33]:
pred_labels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[3676.68,1.0,3.52...|    0|[3.32280060410704...|[0.96520277626050...|       0.0|
|[4316.73,1.0,4.79...|    0|[2.51882703498393...|[0.92545117082477...|       0.0|
|[4523.91,0.0,3.98...|    0|[4.74425895305274...|[0.9913735552338,...|       0.0|
|[4690.57,1.0,5.41...|    0|[3.36151699302179...|[0.96647995669972...|       0.0|
|[4863.73,1.0,5.17...|    0|[3.51080721628381...|[0.97099370799721...|       0.0|
|[4992.6,0.0,3.85,...|    0|[4.82175711380701...|[0.99201170145350...|       0.0|
|[4994.48,0.0,5.02...|    0|[4.08636803116322...|[0.98347744084253...|       0.0|
|[5191.08,0.0,6.29...|    0|[4.52210988299157...|[0.98925072904386...|       0.0|
|[5200.06,1.0,6.33...|    0|[2.77869927377672...|[0.94151386042336...|       0.0|
|[5304.6,0.0,5.2

In [35]:
evalu = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="churn")

In [36]:
auc = evalu.evaluate(pred_labels.predictions)

In [37]:
auc

0.779662093495935

In [45]:
customers = spark.read.csv("customer_churn.csv", inferSchema=True, header=True).drop('Churn')

In [46]:
final_model = lr.fit(df_final)

In [47]:
customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [48]:
customers_valid = assembler.transform(customers)

In [49]:
customers_valid.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [50]:
res = final_model.transform(customers_valid)

In [51]:
res.select("Company", "prediction").show()

+--------------------+----------+
|             Company|prediction|
+--------------------+----------+
|          Harvey LLC|       0.0|
|          Wilson PLC|       1.0|
|Miller, Johnson a...|       1.0|
|           Smith Inc|       0.0|
|          Love-Jones|       0.0|
|        Kelly-Warren|       0.0|
|   Reynolds-Sheppard|       1.0|
|          Singh-Cole|       0.0|
|           Lopez PLC|       1.0|
|       Reed-Martinez|       1.0|
|Briggs, Lamb and ...|       0.0|
|    Figueroa-Maynard|       1.0|
|     Abbott-Thompson|       1.0|
|Smith, Kim and Ma...|       1.0|
|Snyder, Lee and M...|       0.0|
|      Sanders-Pierce|       1.0|
|Andrews, Adams an...|       1.0|
|Morgan, Phillips ...|       1.0|
|      Villanueva LLC|       0.0|
|Berry, Orr and Ca...|       0.0|
+--------------------+----------+
only showing top 20 rows



root
 |-- features: vector (nullable = true)
 |-- churn: integer (nullable = true)

