In [2]:
from pyspark.sql import SparkSession

# Build the session for this notebook, or reuse the one already attached to
# the running kernel (getOrCreate does not start a second session).
spark = (
    SparkSession.builder
    .appName("ChurnPrediction")
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook renders it,
# which confirms that it started.
spark

In [11]:
# Location of the cleaned churn dataset on the local machine.
# NOTE(review): absolute, user-specific path; anyone else running this
# notebook has to point it at their own copy of df_clean.csv.
csv_path = r'C:/Users/Tarun Akash/Desktop/df_clean.csv'

# First row holds the column names; let Spark infer each column's type.
df = spark.read.csv(
    csv_path,
    header=True,
    inferSchema=True,
)

# Preview the first 20 rows.
df.show(20)


+-----+------------+---------------+--------+---------+-------------+-------+--------+-------------+----------+--------+
|Churn|AccountWeeks|ContractRenewal|DataPlan|DataUsage|CustServCalls|DayMins|DayCalls|MonthlyCharge|OverageFee|RoamMins|
+-----+------------+---------------+--------+---------+-------------+-------+--------+-------------+----------+--------+
|    0|         128|              1|       1|      2.7|            1|  265.1|     110|         89.0|      9.87|    10.0|
|    0|         107|              1|       1|      3.7|            1|  161.6|     123|         82.0|      9.78|    13.7|
|    0|         137|              1|       0|      0.0|            0|  243.4|     114|         52.0|      6.06|    12.2|
|    0|          84|              0|       0|      0.0|            2|  299.4|      71|         57.0|       3.1|     6.6|
|    0|          75|              0|       0|      0.0|            3|  166.7|     113|         41.0|      7.42|    10.1|
|    0|         118|            

In [13]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Every column except the target 'Churn' is a model input.
# The list is built from `df` (the frame read from the CSV) rather than from
# this cell's own output, so the cell works on a fresh kernel and can be
# re-run without 'features'/'label' leaking back into the inputs.
feature_cols = [name for name in df.columns if name != 'Churn']

# Pack the input columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
assembled = assembler.transform(df)

# Index 'Churn' into a numeric 'label' column.
# NOTE: by default StringIndexer numbers values by descending frequency, so
# label 0.0 is the most common Churn value, not necessarily Churn == 0.
indexer = StringIndexer(inputCol='Churn', outputCol='label')
data = indexer.fit(assembled).transform(assembled)

# Keep only the two columns the classifier needs.
final_data = data.select('features', 'label')
final_data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[128.0,1.0,1.0,2....|  0.0|
|[107.0,1.0,1.0,3....|  0.0|
|[137.0,1.0,0.0,0....|  0.0|
|[84.0,0.0,0.0,0.0...|  0.0|
|[75.0,0.0,0.0,0.0...|  0.0|
+--------------------+-----+
only showing top 5 rows



In [14]:
# 70% of the rows go to training and 30% to testing; the seed is fixed so the
# split is requested the same way on every run.
split_weights = [0.7, 0.3]
train_data, test_data = final_data.randomSplit(split_weights, seed=42)


In [15]:
# Naive Bayes classifier reading the 'features' vector and the 'label' column
# (spelled out here; these are also the estimator's default column names).
# NOTE(review): the model type is left at the library default; confirm it
# suits continuous inputs such as DayMins and MonthlyCharge.
nb = NaiveBayes(featuresCol="features", labelCol="label")

# Train on the training split only.
model = nb.fit(train_data)

# Score the held-out split; the result carries the rawPrediction,
# probability and prediction columns alongside features and label.
predictions = model.transform(test_data)
predictions.show(5)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[1.0,1.0,0.0,0.0,...|  0.0|[-431.75704228353...|[0.00328453757668...|       1.0|
|[1.0,1.0,1.0,2.27...|  0.0|[-578.43789929523...|[0.28084196880843...|       1.0|
|[2.0,0.0,0.0,0.27...|  1.0|[-345.99539695062...|[0.00731949380150...|       1.0|
|[3.0,0.0,0.0,0.26...|  0.0|[-494.93660625489...|[0.08835814041598...|       1.0|
|[3.0,1.0,1.0,3.21...|  0.0|[-575.88244789392...|[0.99852230307546...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [16]:
# Score the test-set predictions by comparing the 'prediction' column with 'label'.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Fraction of test rows whose prediction equals the label.
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Confusion matrix as (label, prediction) counts.
# groupBy gives no row-order guarantee, so sort explicitly to get the same
# table layout on every run.
(
    predictions
    .groupBy("label", "prediction")
    .count()
    .orderBy("label", "prediction")
    .show()
)

Accuracy: 0.6867724867724868
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|   60|
|  0.0|       1.0|  220|
|  1.0|       0.0|   76|
|  0.0|       0.0|  589|
+-----+----------+-----+

