# Big Data Machine Learning Classification with Spark

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("churn.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Churn
0,0,Cameron Williams,42.0,11066.8,0,7.22,8.0,1
1,1,Kevin Mueller,41.0,11916.22,0,6.5,11.0,1
2,2,Eric Lozano,38.0,12884.75,0,6.67,12.0,1
3,3,Phillip White,42.0,8010.76,0,6.71,10.0,1
4,4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900 entries, 0 to 899
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       900 non-null    int64  
 1   Names            900 non-null    object 
 2   Age              900 non-null    float64
 3   Total_Purchase   900 non-null    float64
 4   Account_Manager  900 non-null    int64  
 5   Years            900 non-null    float64
 6   Num_Sites        900 non-null    float64
 7   Churn            900 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 56.4+ KB


In [5]:
feature_columns = df.columns[2:-1]
feature_columns

Index(['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'], dtype='object')

In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import GBTClassifier


# Step 1: Initialize Spark
spark = SparkSession.builder.appName("ChurnClassification").getOrCreate()

# Step 2: Load the dataset
data = spark.read.csv("churn.csv", inferSchema=True, header=True)

# Step 3: Prepare the data for training
feature_columns = data.columns[2:-1]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data).select("features", "Churn")

# Step 4: Split the data into training and testing sets
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

# Step 5: Train a GBT classification model
gbt = GBTClassifier(labelCol="Churn", featuresCol="features")
model = gbt.fit(train_data)

# Step 6: Make predictions on the test data
predictions = model.transform(test_data)

# Step 7: Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="Churn")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

spark.stop()

24/03/29 10:25:25 WARN Utils: Your hostname, ahmet resolves to a loopback address: 127.0.1.1; using 192.168.0.22 instead (on interface wlo1)
24/03/29 10:25:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/29 10:25:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Accuracy: 0.8431148373983745


## Sonuç 

Büyük verilerin işlenmesini kolaylaştıran `Spark` ile sınıflandırma modeli geliştirdik. 