In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ML_Pipeline") \
    .getOrCreate()

df = spark.read.csv("../data/processed/data-encoded.csv", header=True, inferSchema=True)

### Gestion du déséquilibre de classes

In [12]:
major = df.filter(df.Exited == 0)
minor = df.filter(df.Exited == 1)

ratio = minor.count() / major.count()
major_sampled = major.sample(withReplacement=False, fraction=ratio)

df_balanced = major_sampled.union(minor)

### Sélection et assemblage des features

In [13]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Step 1: index categorical columns
indexers = [
    StringIndexer(inputCol="Geography", outputCol="Geography_index"),
    StringIndexer(inputCol="Gender", outputCol="Gender_index")
]

# Step 2: one-hot encode them
encoders = [
    OneHotEncoder(
        inputCols=["Geography_index", "Gender_index"],
        outputCols=["Geography_vec", "Gender_vec"]
    )
]

# Step 3: assemble numeric + encoded features
assembler = VectorAssembler(
    inputCols=["CreditScore", "Age", "Balance", "NumOfProducts", "Geography_vec", "Gender_vec"],
    outputCol="features"
)


### Sélection et assemblage des features

In [14]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=True,
    withStd=True
)

### Séparation du dataset

In [15]:
train_df, test_df = df_balanced.randomSplit([0.8, 0.2], seed=42)

### Choix du modèle MLlib

In [16]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(
    featuresCol="scaledFeatures",
    labelCol="Exited"
)

### Construction du Pipeline

In [17]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[
    assembler,
    scaler,
    lr
])

### Entraînement et évaluation

In [18]:
model = pipeline.fit(train_df)
predictions = model.transform(test_df)

predictions.select("Exited", "prediction", "probability").show(5)

IllegalArgumentException: [FIELD_NOT_FOUND] No such struct field `Geography_vec` in `CreditScore`, `Geography`, `Gender`, `Age`, `Tenure`, `Balance`, `NumOfProducts`, `HasCrCard`, `IsActiveMember`, `EstimatedSalary`, `Exited`. SQLSTATE: 42704

### Évaluation du modèle

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(
    labelCol="Exited",
    rawPredictionCol="prediction",
    metricName="areaUnderROC"
)

roc_auc = evaluator.evaluate(predictions)
print("AUC =", roc_auc)

### Sauvegarde du modèle

In [None]:
model.write().overwrite().save("models/churn_model")