In [6]:
from pyspark.sql import SparkSession

# Configure a builder for the churn application, then reuse a running
# session if one exists or create a new one otherwise.
session_builder = SparkSession.builder.appName("ChurnPrediction")
spark = session_builder.getOrCreate()


In [7]:
import os

# Location of the cleaned churn CSV. Set the CHURN_DATA_PATH environment
# variable to point at the file on another machine; when it is unset the
# original local path is used, so existing behaviour is unchanged.
data_path = os.environ.get('CHURN_DATA_PATH', r'C:/Users/Tarun Akash/Desktop/df_clean.csv')

# Load the CSV into a Spark DataFrame: the first row supplies the column
# names and column types are inferred from the values.
data = spark.read.csv(data_path, header=True, inferSchema=True)

# Show the first few rows of the DataFrame
data.show(5)

+-----+------------+---------------+--------+---------+-------------+-------+--------+-------------+----------+--------+
|Churn|AccountWeeks|ContractRenewal|DataPlan|DataUsage|CustServCalls|DayMins|DayCalls|MonthlyCharge|OverageFee|RoamMins|
+-----+------------+---------------+--------+---------+-------------+-------+--------+-------------+----------+--------+
|    0|         128|              1|       1|      2.7|            1|  265.1|     110|         89.0|      9.87|    10.0|
|    0|         107|              1|       1|      3.7|            1|  161.6|     123|         82.0|      9.78|    13.7|
|    0|         137|              1|       0|      0.0|            0|  243.4|     114|         52.0|      6.06|    12.2|
|    0|          84|              0|       0|      0.0|            2|  299.4|      71|         57.0|       3.1|     6.6|
|    0|          75|              0|       0|      0.0|            3|  166.7|     113|         41.0|      7.42|    10.1|
+-----+------------+---------------+--------+---------+-------------+-------+--------+-------------+----------+--------+

In [8]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Start from the frame without any previously derived columns so this cell can
# be re-run safely: `data` is reassigned below, and without this drop a second
# run would feed 'features'/'label' back into the assembler and fail because
# the output column already exists. drop() ignores names that are not present.
base_data = data.drop('features', 'label')

# Convert all columns except 'Churn' into a single feature vector
feature_cols = [name for name in base_data.columns if name != 'Churn']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
data = assembler.transform(base_data)

# Convert 'Churn' into a numerical label.
# The order is pinned to alphabetical so the mapping is always 0 -> 0.0 and
# 1 -> 1.0; the default (frequency-descending) order assigns 0.0 to whichever
# class is most common, which would flip the labels if churners were the majority.
indexer = StringIndexer(inputCol='Churn', outputCol='label', stringOrderType='alphabetAsc')
data = indexer.fit(data).transform(data)

# Select only the 'features' and 'label' columns for Logistic Regression
final_data = data.select('features', 'label')

# Show the prepared data
final_data.show(5)


+--------------------+-----+
|            features|label|
+--------------------+-----+
|[128.0,1.0,1.0,2....|  0.0|
|[107.0,1.0,1.0,3....|  0.0|
|[137.0,1.0,0.0,0....|  0.0|
|[84.0,0.0,0.0,0.0...|  0.0|
|[75.0,0.0,0.0,0.0...|  0.0|
+--------------------+-----+
only showing top 5 rows



In [9]:
# Randomly partition the prepared rows: weight 0.7 for training, 0.3 for testing.
# The seed passed to randomSplit is fixed at 42.
split_weights = [0.7, 0.3]
train_data, test_data = final_data.randomSplit(split_weights, seed=42)


In [10]:
from pyspark.ml.classification import LogisticRegression

# Build the estimator. The feature and label column names are spelled out
# here; they are the same values LogisticRegression uses by default.
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Train on the training split to obtain the fitted model
lr_model = lr.fit(train_data)


In [11]:
# Score the held-out rows with the fitted model
predictions = lr_model.transform(test_data)

# Preview the inputs next to the true and predicted labels
preview_cols = ["features", "label", "prediction"]
predictions.select(*preview_cols).show(5)


+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[1.0,1.0,0.0,0.0,...|  0.0|       0.0|
|[1.0,1.0,1.0,2.27...|  0.0|       0.0|
|[2.0,0.0,0.0,0.27...|  1.0|       0.0|
|[3.0,0.0,0.0,0.26...|  0.0|       0.0|
|[3.0,1.0,1.0,3.21...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 5 rows



In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator that compares the 'prediction' column against 'label'
# and reports the 'accuracy' metric
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

# Score the test-set predictions and report the result
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy}")

# Confusion-matrix style breakdown: row count for each (label, prediction) pair
confusion_counts = predictions.groupBy("label", "prediction").count()
confusion_counts.show()


Accuracy: 0.8603174603174604
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|   24|
|  0.0|       1.0|   20|
|  1.0|       0.0|  112|
|  0.0|       0.0|  789|
+-----+----------+-----+

