In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('CC').getOrCreate()

In [2]:
df = spark.read.csv('../data/customer_churn.csv',header=True,inferSchema=True)

In [3]:
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [17]:
df.head(5)

[Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.8, Account_Manager=0, Years=7.22, Num_Sites=8.0, Onboard_date='2013-08-30 07:00:40', Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1),
 Row(Names='Kevin Mueller', Age=41.0, Total_Purchase=11916.22, Account_Manager=0, Years=6.5, Num_Sites=11.0, Onboard_date='2013-08-13 00:38:46', Location='6157 Frank Gardens Suite 019 Carloshaven, RI 17756', Company='Wilson PLC', Churn=1),
 Row(Names='Eric Lozano', Age=38.0, Total_Purchase=12884.75, Account_Manager=0, Years=6.67, Num_Sites=12.0, Onboard_date='2016-06-29 06:20:07', Location='1331 Keith Court Alyssahaven, DE 90114', Company='Miller, Johnson and Wallace', Churn=1),
 Row(Names='Phillip White', Age=42.0, Total_Purchase=8010.76, Account_Manager=0, Years=6.71, Num_Sites=10.0, Onboard_date='2014-04-22 12:43:12', Location='13120 Daniel Mount Angelabury, WY 30645-4695', Company='Smith Inc', Churn=1),
 Row(Names='Cynthia Norton', Age=37.0, Total_P

In [19]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
date_indexer=StringIndexer(inputCol='Onboard_date',outputCol='date_index')

In [64]:
df_ind=date_indexer.fit(df).transform(df)
assembler = VectorAssembler(inputCols=['Age','Total_Purchase','Account_Manager','Years','Num_Sites','date_index'],outputCol='features')

In [65]:
features=assembler.transform(df_ind)
data=features.select(['features','Churn'])
data.show(5)

+--------------------+-----+
|            features|Churn|
+--------------------+-----+
|[42.0,11066.8,0.0...|    1|
|[41.0,11916.22,0....|    1|
|[38.0,12884.75,0....|    1|
|[42.0,8010.76,0.0...|    1|
|[37.0,9191.58,0.0...|    1|
+--------------------+-----+
only showing top 5 rows



In [66]:
train,test=data.randomSplit(seed=0,weights=[0.7,0.3])

In [67]:
from pyspark.ml.classification import LogisticRegression 
logR=LogisticRegression(labelCol='Churn')

In [68]:
log_model=logR.fit(train)

In [69]:
evaluation=log_model.evaluate(test)

In [70]:
print("Area under ROC is: ",evaluation.areaUnderROC)
print("Accuracy is: ",evaluation.accuracy)

Area under ROC is:  0.918825812712276
Accuracy is:  0.8978102189781022


In [80]:
predictionAndLabels = log_model.evaluate(test).predictions.select('Churn', 'prediction')

In [84]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
evalBC = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn',metricName="areaUnderROC")
evalMC=MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Churn', metricName='accuracy')
print("Area under ROC is: ",evalBC.evaluate(predictionAndLabels))
print("Accuracy is: ",evalMC.evaluate(predictionAndLabels))

Area under ROC is:  0.8049490538573507
Accuracy is:  0.8978102189781022


## Without "Account_Manager"

In [71]:
df_ind=date_indexer.fit(df).transform(df)
assembler2 = VectorAssembler(inputCols=['Age','Total_Purchase','Years','Num_Sites','date_index'],outputCol='features')
features2=assembler2.transform(df_ind)
data2=features2.select(['features','Churn'])
train2,test2=data2.randomSplit(seed=0,weights=[0.7,0.3])
log_model2=logR.fit(train2)
evaluation2=log_model2.evaluate(test2)
print("Area under ROC is: ",evaluation2.areaUnderROC)
print("Accuracy is: ",evaluation2.accuracy)

Area under ROC is:  0.9161086851043186
Accuracy is:  0.9087591240875912


In [85]:
predictionAndLabels = log_model2.evaluate(test2).predictions.select('Churn', 'prediction')
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
evalBC = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Churn',metricName="areaUnderROC")
evalMC=MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Churn', metricName='accuracy')
print("Area under ROC is: ",evalBC.evaluate(predictionAndLabels))
print("Accuracy is: ",evalMC.evaluate(predictionAndLabels))

Area under ROC is:  0.811499272197962
Accuracy is:  0.9087591240875912


Predictions are better without the "Account_Manager" variable.