In [None]:
pip install pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline

In [20]:
# Spark oturumu başlat
spark = SparkSession.builder.appName("Müşteri Terk Tahmini").getOrCreate()

In [21]:
df = spark.read.csv('churn.csv', header=True, inferSchema=True)

In [22]:
df.show(10)

+---+----------------+----+--------------+---------------+-----+---------+-----+
|_c0|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|Churn|
+---+----------------+----+--------------+---------------+-----+---------+-----+
|  0|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|    1|
|  1|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|    1|
|  2|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|    1|
|  3|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|    1|
|  4|  Cynthia Norton|37.0|       9191.58|              0| 5.56|      9.0|    1|
|  5|Jessica Williams|48.0|      10356.02|              0| 5.12|      8.0|    1|
|  6|     Eric Butler|44.0|      11331.58|              1| 5.23|     11.0|    1|
|  7|   Zachary Walsh|32.0|       9885.12|              1| 6.92|      9.0|    1|
|  8|     Ashlee Carr|43.0|       14062.6|              1| 5.46|     11.0|    1|
|  9|  Jennifer Lynch|40.0| 

In [23]:
df.columns

['_c0',
 'Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Churn']

In [24]:
# Split the data into training and testing sets
train_data, test_data = df.randomSplit([0.8, 0.2], seed=123)

In [25]:
feature_cols = ['Age','Total_Purchase','Account_Manager','Years','Num_Sites']
label_col = "Churn"

In [26]:
# Assemble the feature vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_data = assembler.transform(train_data.select(*feature_cols, label_col))
test_data = assembler.transform(test_data.select(*feature_cols, label_col))

In [27]:
# Create and train a Logistic Regression model
lr = LogisticRegression(labelCol=label_col, featuresCol="features")
model = lr.fit(train_data)

In [28]:
# Make predictions on the testing data
predictions = model.transform(test_data)

In [29]:
# Evaluate the model's performance using a BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol=label_col)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)


Accuracy: 0.9110597140454166


In [18]:
# Stop the Spark session
spark.stop()