In [1]:
sc

In [2]:
spark

#### 1. Read the Dataset

In [6]:
churn_data = spark.read.csv('file:///home/hadoop/Downloads/Telco_Customer_Churn.csv',header=True,inferSchema=True)
churn_data.show(3)

+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+----------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|   MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|      Contract|PaperlessBilling|   PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+----------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------------+----------------+----------------+--------------+------------+-----+
|7590-VHVEG|Female|            0|    Yes|        No|     1|          No|No phone service|            DSL|            No|         Yes|              No|         No|         No|    

#### 2. Data Exploration
        a) How many customers are in dataset?

In [7]:
churn_data.count()

7043

In [None]:
len(churn_data.columns)

In [9]:
churn_data.columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

        b) What is the distribution of gender among customers?

In [10]:
churn_data.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: string (nullable = true)
 |-- Churn: string (nullable = true)



In [13]:
churn_data.groupBy('gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|Female| 3488|
|  Male| 3555|
+------+-----+



        c) What is the distribution of contract types among customers?

In [15]:
churn_data.groupBy('contract').count().show()

+--------------+-----+
|      contract|count|
+--------------+-----+
|Month-to-month| 3875|
|      One year| 1473|
|      Two year| 1695|
+--------------+-----+



        d) What is the percentage of Customers who churned?

In [19]:
churn_data.select(['Churn']).where("Churn = 'Yes'").count() / churn_data.count() * 100

26.536987079369588

#### 3. Data Preprocessing
        * Check for missing values and Handle missing data.

In [20]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [25]:
churn_data.select([count(when(isnull(column),column)).alias(column) for column in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [101]:
churn_data = churn_data.withColumn('TotalCharges', when(col('TotalCharges') == " ", None).otherwise(col('TotalCharges')))

In [102]:
churn_data.select([count(when(isnull(column),column)).alias(column) for column in churn_data.columns]).show()

+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|customerID|gender|SeniorCitizen|Partner|Dependents|tenure|PhoneService|MultipleLines|InternetService|OnlineSecurity|OnlineBackup|DeviceProtection|TechSupport|StreamingTV|StreamingMovies|Contract|PaperlessBilling|PaymentMethod|MonthlyCharges|TotalCharges|Churn|
+----------+------+-------------+-------+----------+------+------------+-------------+---------------+--------------+------------+----------------+-----------+-----------+---------------+--------+----------------+-------------+--------------+------------+-----+
|         0|     0|            0|      0|         0|     0|           0|            0|              0|             0|           0|               0|          0|          0|              0|       0|               0| 

In [103]:
churn_df = churn_data.na.drop()

In [104]:
churn_df = churn_df.withColumn('TotalCharges',col('TotalCharges').cast(FloatType()))
churn_df = churn_df.drop(col('cutomerID'))

In [105]:
churn_df.printSchema()

root
 |-- customerID: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- SeniorCitizen: integer (nullable = true)
 |-- Partner: string (nullable = true)
 |-- Dependents: string (nullable = true)
 |-- tenure: integer (nullable = true)
 |-- PhoneService: string (nullable = true)
 |-- MultipleLines: string (nullable = true)
 |-- InternetService: string (nullable = true)
 |-- OnlineSecurity: string (nullable = true)
 |-- OnlineBackup: string (nullable = true)
 |-- DeviceProtection: string (nullable = true)
 |-- TechSupport: string (nullable = true)
 |-- StreamingTV: string (nullable = true)
 |-- StreamingMovies: string (nullable = true)
 |-- Contract: string (nullable = true)
 |-- PaperlessBilling: string (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- MonthlyCharges: double (nullable = true)
 |-- TotalCharges: float (nullable = true)
 |-- Churn: string (nullable = true)



#### 4. Import ML Library (mllib)
        f) Convert catagorical variables into numerical formats using one-hot encoding or label encoding.

In [106]:
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml import Pipeline

In [107]:
categorical_cols = [field.name for field in churn_df.schema.fields if isinstance(field.dataType, StringType)]

In [108]:
# category = category[1:-1]

In [109]:
stages = []
for cat_cols in categorical_cols[:-1]:
    stringindexer = StringIndexer(inputCol=cat_cols, outputCol=cat_cols + 'Index')
    onehotencoder = OneHotEncoderEstimator(inputCols = [stringindexer.getOutputCol()],
                                        outputCols = [cat_cols + "classVec"])
    stages += [stringindexer, onehotencoder]

In [110]:
numeric_cols = [field.name for field in churn_df.schema.fields if not isinstance(field.dataType, StringType)]

In [111]:
assemblerInputs = [c + 'classVec' for c in categorical_cols[:-1]]  + numeric_cols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
stages += [assembler]

In [112]:
label_stringIdx = StringIndexer(inputCol='Churn', outputCol='label')

In [113]:
stages += [label_stringIdx]

In [114]:
pipeline = Pipeline(stages = stages)

In [115]:
preprocessing = pipeline.fit(churn_df)

In [116]:
churn_dataFrame = preprocessing.transform(churn_df)

In [None]:
churn_dataFrame.select(['Contract','ContractIndex','ContractclassVec']).show()

In [118]:
churn_df1 = churn_dataFrame.select(['features','label'])
churn_df1.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                                                                                              |label|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|(7061,[1950,7033,7038,7039,7042,7043,7045,7047,7049,7051,7053,7054,7058,7059,7060],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,29.85,29.850000381469727])                                    |0.0  |
|(7061,[776,7031,7032,7033,7034,7035,7038,7040,7041,7044,7045,7047,7049,7055,7058,7059,7060],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,34.0,56.95,1889.5])   

### Split dataset into Train and Test

In [173]:
train, test = churn_df1.randomSplit([0.8, 0.2], seed = 4)

In [174]:
train.select(['features','label'])

DataFrame[features: vector, label: double]

#### Build theDecision Tree ML Model

In [175]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
tree = DecisionTreeClassifier(featuresCol='features', labelCol='label')
decision_model = tree.fit(train)

#### Evalute the Model

In [176]:
predictions = decision_model.transform(test)

In [177]:
predictions.select(['label','probability','prediction']).show(truncate=False)

+-----+-----------------------------------------+----------+
|label|probability                              |prediction|
+-----+-----------------------------------------+----------+
|0.0  |[0.9773507206588882,0.022649279341111873]|0.0       |
|0.0  |[0.9773507206588882,0.022649279341111873]|0.0       |
|0.0  |[0.6788321167883211,0.32116788321167883] |0.0       |
|0.0  |[0.9079601990049752,0.09203980099502487] |0.0       |
|0.0  |[0.8532423208191127,0.14675767918088736] |0.0       |
|0.0  |[0.9079601990049752,0.09203980099502487] |0.0       |
|0.0  |[0.8532423208191127,0.14675767918088736] |0.0       |
|0.0  |[0.707641196013289,0.292358803986711]    |0.0       |
|0.0  |[0.35590551181102364,0.6440944881889764] |1.0       |
|0.0  |[0.9079601990049752,0.09203980099502487] |0.0       |
|1.0  |[0.9773507206588882,0.022649279341111873]|0.0       |
|0.0  |[0.8532423208191127,0.14675767918088736] |0.0       |
|0.0  |[0.9773507206588882,0.022649279341111873]|0.0       |
|0.0  |[0.75958188153310

In [179]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
accuracy

0.7988942639944713

#### Random Forest Classifier

In [180]:
from pyspark.ml.classification import RandomForestClassifier
randomForest = RandomForestClassifier(featuresCol='features', labelCol='label')
rf_model = randomForest.fit(train)

In [181]:
predictions = rf_model.transform(test)

In [182]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
accuracy

0.7456807187284036

#### Logistic Regression

In [183]:
from pyspark.ml.classification import LogisticRegression
logistic = LogisticRegression(featuresCol='features', labelCol='label')
log_model = logistic.fit(train)

In [184]:
predictions = log_model.transform(test)

In [185]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
accuracy

0.8182446440912232