In [1]:
sc

In [2]:
spark

### 1.Read the DataSet

In [3]:
churn_data = spark.read.csv("file:///home/hadoop/Downloads/Telco_Customer_Churn.csv", inferSchema = True,header = True)

In [5]:
churn_data.show(5)

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|      Contract|PaperlessBilling|       PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+--------------------+--------------+------------+-----+
|7590-VHVEG|Female|            0|    Yes|        No|     1|          No|No phone service|            DSL|            No|         Yes|              No|         No|    

### 2.Data Exploration
#### a)How many customer records are in the dataset ?

In [6]:
churn_data.count()

7043

In [8]:
len(churn_data.columns)

21

In [10]:
churn_data.columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

#### b)What is the distribution of gender among clusters?

In [11]:
churn_data.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: string (nullable = true)
 |-- Churn: string (nullable = true)



In [12]:
churn_data.groupBy(['gender']).count().show()

+------+-----+
|gender|count|
+------+-----+
|Female| 3488|
|  Male| 3555|
+------+-----+



#### c)What is the distribution of contract types among customers?

In [13]:
churn_data.groupBy(['Contract']).count().show()

+--------------+-----+
|      Contract|count|
+--------------+-----+
|Month-to-month| 3875|
|      One year| 1473|
|      Two year| 1695|
+--------------+-----+



#### d)What is the percentage of customers who has churned?

In [17]:
yes_count = churn_data.select(['Churn']).where("Churn ='Yes'").count()
yes_count

1869

In [18]:
yes_count/churn_data.count()*100

26.536987079369588

### 3.Data Prepreprocessing
* Check for missing values and handle missing data

In [19]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [21]:
churn_data.select([count(when(isnull(col),col)).alias(col) for col in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [24]:
churn_data = churn_data.withColumn('TotalCharges',when(col('TotalCharges')==" ",None)\
.otherwise(col('TotalCharges')))

In [25]:
churn_data.select([count(when(isnull(col),col)).alias(col) for col in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [26]:
churn_data1 = churn_data.na.drop()

In [30]:
from pyspark.sql.types import FloatType
churn_data1 = churn_data1.withColumn("TotalCharges",col("TotalCharges").cast(FloatType()))

In [31]:
churn_data1

DataFrame[customerID: string, gender: string, SeniorCitizen: int, Partner: string, Dependents: string, tenure: int, PhoneService: string, MultipleLines: string, InternetService: string, OnlineSecurity: string, OnlineBackup: string, DeviceProtection: string, TechSupport: string, StreamingTV: string, StreamingMovies: string, Contract: string, PaperlessBilling: string, PaymentMethod: string, MonthlyCharges: double, TotalCharges: float, Churn: string]

### 4.Import Mllib
* Convert categorical variables into numerical formats using one-hot encoding or label encoding

In [51]:
churn_data1 = churn_data1.drop(col('customerID'))

In [32]:
from pyspark.ml.feature import StringIndexer,OneHotEncoderEstimator,VectorAssembler
from pyspark.ml import Pipeline

In [52]:
categorical_cols = [field.name for field in churn_data1.schema.fields if isinstance(field.dataType,StringType)]

In [53]:
categorical_cols

['gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [64]:
stages = []
for catcols in categorical_cols[:-1]:
    stringindexer = StringIndexer(inputCol = catcols,outputCol = catcols + "Index")
    onehotencoder = OneHotEncoderEstimator(inputCols = [stringindexer.getOutputCol()],
                                           outputCols = [catcols + "classVec"])
    stages+=[stringindexer,onehotencoder]


In [65]:
numericalCols = [field.name for field in churn_data1.schema.fields if not isinstance(field.dataType,StringType)]

In [66]:
numericalCols

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [67]:
assemblerInputs = [c+ "classVec" for c in categorical_cols[:-1]] + numericalCols

In [68]:
assembler = VectorAssembler(inputCols = assemblerInputs,outputCol="features")
stages+=[assembler]

In [69]:
label_stringIdx = StringIndexer(inputCol='Churn',outputCol='Label')

In [70]:
stages+=[label_stringIdx]

In [71]:
pipeline = Pipeline(stages = stages)

In [72]:
preprocessing = pipeline.fit(churn_data1)

In [73]:
churn_df = preprocessing.transform(churn_data1)

In [75]:
churn_df.select(['gender','genderIndex','genderclassVec']).show()

+------+-----------+--------------+
|gender|genderIndex|genderclassVec|
+------+-----------+--------------+
|Female|        1.0|     (1,[],[])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|Female|        1.0|     (1,[],[])|
|Female|        1.0|     (1,[],[])|
|  Male|        0.0| (1,[0],[1.0])|
|Female|        1.0|     (1,[],[])|
|Female|        1.0|     (1,[],[])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|  Male|        0.0| (1,[0],[1.0])|
|Female|        1.0|     (1,[],[])|
|Female|        1.0|     (1,[],[])|
|  Male|        0.0| (1,[0],[1.0])|
|Female|        1.0|     (1,[],[])|
|Female|        1.0|     (1,[],[])|
+------+-----------+--------------+
only showing top 20 rows



In [76]:
churn_df1 = churn_df.select(['features','Label'])

In [77]:
churn_df1.show(truncate = False)

+------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                        |Label|
+------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|(30,[2,7,8,11,12,14,16,18,20,22,23,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.85,29.850000381469727])                        |0.0  |
|(30,[0,1,2,3,4,7,9,10,13,14,16,18,24,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,34.0,56.95,1889.5])                         |0.0  |
|(30,[0,1,2,3,4,7,9,11,12,14,16,18,20,22,24,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,53.85,108.1500015258789]) |1.0  |
|(30,[0,1,2,7,9,10,13,15,16,18,25,27,28,29],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1

### Split Dataset into Train and Test

In [115]:
train ,test = churn_df1.randomSplit([0.8,0.2],seed = 20)

In [116]:
train.select(['features','label'])

DataFrame[features: vector, label: double]

### Build Decision Tree ML Model

In [117]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
tree = DecisionTreeClassifier(featuresCol = 'features',labelCol = 'Label')
decision_model = tree.fit(train)

### Evaluate the Model

In [118]:
predictions = decision_model.transform(test)

In [119]:
predictions.show()

+--------------------+-----+-------------+--------------------+----------+
|            features|Label|rawPrediction|         probability|prediction|
+--------------------+-----+-------------+--------------------+----------+
|(30,[0,1,2,3,4,6,...|  1.0| [26.0,158.0]|[0.14130434782608...|       1.0|
|(30,[0,1,2,3,4,6,...|  1.0|[225.0,391.0]|[0.36525974025974...|       1.0|
|(30,[0,1,2,3,4,6,...|  0.0|[225.0,391.0]|[0.36525974025974...|       1.0|
|(30,[0,1,2,3,4,6,...|  0.0|[279.0,138.0]|[0.66906474820143...|       0.0|
|(30,[0,1,2,3,4,6,...|  1.0| [26.0,158.0]|[0.14130434782608...|       1.0|
|(30,[0,1,2,3,4,6,...|  1.0| [26.0,158.0]|[0.14130434782608...|       1.0|
|(30,[0,1,2,3,4,6,...|  0.0|[214.0,222.0]|[0.49082568807339...|       1.0|
|(30,[0,1,2,3,4,6,...|  0.0|[225.0,391.0]|[0.36525974025974...|       1.0|
|(30,[0,1,2,3,4,6,...|  0.0|[279.0,138.0]|[0.66906474820143...|       0.0|
|(30,[0,1,2,3,4,6,...|  0.0|[279.0,138.0]|[0.66906474820143...|       0.0|
|(30,[0,1,2,3,4,6,...|  1

In [120]:
predictions.select(['Label','probability','prediction']).show(truncate=False)

+-----+----------------------------------------+----------+
|Label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1.0  |[0.14130434782608695,0.8586956521739131]|1.0       |
|1.0  |[0.3652597402597403,0.6347402597402597] |1.0       |
|0.0  |[0.3652597402597403,0.6347402597402597] |1.0       |
|0.0  |[0.6690647482014388,0.33093525179856115]|0.0       |
|1.0  |[0.14130434782608695,0.8586956521739131]|1.0       |
|1.0  |[0.14130434782608695,0.8586956521739131]|1.0       |
|0.0  |[0.4908256880733945,0.5091743119266054] |1.0       |
|0.0  |[0.3652597402597403,0.6347402597402597] |1.0       |
|0.0  |[0.6690647482014388,0.33093525179856115]|0.0       |
|0.0  |[0.6690647482014388,0.33093525179856115]|0.0       |
|1.0  |[0.14130434782608695,0.8586956521739131]|1.0       |
|1.0  |[0.3652597402597403,0.6347402597402597] |1.0       |
|1.0  |[0.3652597402597403,0.6347402597402597] |1.0       |
|0.0  |[0.6690647482014388,0.33093525179

In [121]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='Label',predictionCol='prediction',metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
accuracy

0.8085714285714286

### Build Random Forest Model

In [122]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
tree = RandomForestClassifier(featuresCol = 'features',labelCol = 'Label')
rabdom_forest_model = tree.fit(train)

In [126]:
predictions = rabdom_forest_model.transform(test)

In [127]:
predictions.select(['Label','probability','prediction']).show(truncate=False)

+-----+----------------------------------------+----------+
|Label|probability                             |prediction|
+-----+----------------------------------------+----------+
|1.0  |[0.3303275799665036,0.6696724200334964] |1.0       |
|1.0  |[0.355683591997274,0.6443164080027259]  |1.0       |
|0.0  |[0.4409886512569914,0.5590113487430086] |1.0       |
|0.0  |[0.6048014388148135,0.3951985611851864] |0.0       |
|1.0  |[0.3687427161910374,0.6312572838089626] |1.0       |
|1.0  |[0.35988373071672364,0.6401162692832763]|1.0       |
|0.0  |[0.5050190284201042,0.4949809715798959] |0.0       |
|0.0  |[0.5329808253129225,0.4670191746870776] |0.0       |
|0.0  |[0.629261060471018,0.37073893952898196] |0.0       |
|0.0  |[0.6880915860367176,0.31190841396328234]|0.0       |
|1.0  |[0.30747113040835805,0.692528869591642] |1.0       |
|1.0  |[0.3328271424391285,0.6671728575608715] |1.0       |
|1.0  |[0.38968149267222585,0.6103185073277742]|1.0       |
|0.0  |[0.6346179949310309,0.36538200506

In [128]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='Label',predictionCol='prediction',metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
accuracy

0.7935714285714286

### Logistic Regression ML Model
* Most suitable for binary classification

In [129]:
from pyspark.ml.classification import LogisticRegression
logistic = LogisticRegression(featuresCol = 'features',labelCol = 'Label')
logistic_model = logistic.fit(train)

In [130]:
predictions = logistic_model.transform(test)

In [131]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='Label',predictionCol='prediction',metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
accuracy

0.8071428571428572