# Pr√©diction de l'Attrition Client Bancaire : Pipeline de Machine Learning Distribu√© avec Apache Spark

* imports : 

In [1]:


from pyspark.sql import SparkSession
from pymongo import MongoClient
import pyspark
import pandas as pd
import os
import sys
import numpy as np

from pyspark import StorageLevel
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from imblearn.over_sampling import SMOTE
import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import functions as F



## üü¢ Construction du Pipeline de Machine Learning

### 1Ô∏è‚É£ R√©cup√©ration des donn√©es pr√©trait√©es

In [2]:

# Configuration des variables d'environnement Python
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [3]:


print("üîç V√©rification de la version PySpark...")
print(f"Version PySpark: {pyspark.__version__}")



üîç V√©rification de la version PySpark...
Version PySpark: 3.5.7


In [4]:


# Cr√©er une session Spark simple (sans MongoDB JAR)
spark = SparkSession.builder \
    .appName("MongoDB-PySpark-PyMongo") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.python.worker.timeout", "600") \
    .master("local[*]") \
    .config("spark.hadoop.io.nativeio.NativeIO$Windows.enabled", "false") \
    .getOrCreate()
    
    # .config("spark.hadoop.io.nativeio.NativeIO.disable.native", "true") \
    


In [5]:

# Connexion √† MongoDB avec PyMongo
print("\nüîå Connexion √† MongoDB...")
client = MongoClient("mongodb://localhost:27017/")
db = client["Attrition_Client_Bancaire_db"]
collection = db["clients_pretraite"]



üîå Connexion √† MongoDB...


In [6]:

# Compter les documents
total_docs = collection.count_documents({})
print(f"üìä Total documents dans MongoDB : {total_docs}")

# Lire les donn√©es
data = list(collection.find())

# Convertir en DataFrame Pandas
df_pandas = pd.DataFrame(data)

display(df_pandas.head(5))


üìä Total documents dans MongoDB : 10000


Unnamed: 0,_id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Index,Gender_Index
0,690e19f332753e87679009a5,619.0,3.7612,2,0.0,1,1,1,101348.88,1,0,1
1,690e19f332753e87679009a6,608.0,3.73767,1,83807.86,1,0,1,112542.58,0,2,1
2,690e19f332753e87679009a7,502.0,3.7612,8,159660.8,3,1,0,113931.57,1,0,1
3,690e19f332753e87679009a8,699.0,3.688879,1,0.0,2,0,0,93826.63,0,0,1
4,690e19f332753e87679009a9,850.0,3.78419,2,125510.82,1,1,1,79084.1,0,2,1


In [7]:

# Supprimer le champ _id 
if '_id' in df_pandas.columns:
    df_pandas = df_pandas.drop('_id', axis=1)

print(f"‚úÖ Donn√©es charg√©es dans Pandas : {len(df_pandas)} lignes")
print(f"üìã Colonnes : {list(df_pandas.columns)}")


‚úÖ Donn√©es charg√©es dans Pandas : 10000 lignes
üìã Colonnes : ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited', 'Geography_Index', 'Gender_Index']


In [8]:

# Convertir en DataFrame Spark
df = spark.createDataFrame(df_pandas)

print("\n‚úÖ Donn√©es converties en Spark DataFrame :")
df.printSchema()
df.show(5)

print(f"\nüìä Nombre total de lignes : {df.count()}")

client.close()


‚úÖ Donn√©es converties en Spark DataFrame :
root
 |-- CreditScore: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Tenure: long (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: long (nullable = true)
 |-- HasCrCard: long (nullable = true)
 |-- IsActiveMember: long (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: long (nullable = true)
 |-- Geography_Index: long (nullable = true)
 |-- Gender_Index: long (nullable = true)

+-----------+------------------+------+---------+-------------+---------+--------------+---------------+------+---------------+------------+
|CreditScore|               Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Geography_Index|Gender_Index|
+-----------+------------------+------+---------+-------------+---------+--------------+---------------+------+---------------+------------+
|      619.0|3.7612001156935624|     2|      0.0|            1|        1|   

### 2Ô∏è‚É£ Gestion du d√©s√©quilibre de classes (undersampling / SMOTE)

In [9]:

data = df.select("Exited", "CreditScore", "Age", "Balance", "EstimatedSalary",
                 "Tenure", "NumOfProducts", "HasCrCard", "IsActiveMember", "Geography_Index", "Gender_Index" ).collect()

# display(data)


X = np.array([[row["CreditScore"], row["Age"], row["Balance"], row["EstimatedSalary"],
               row["Tenure"], row["NumOfProducts"], row["HasCrCard"], row["IsActiveMember"],row["Geography_Index"], row["Gender_Index"] ]
              for row in data])

y = np.array([row["Exited"] for row in data])

print("Avant SMOTE :")
print("Classe 0 :", sum(y == 0))
print("Classe 1 :", sum(y == 1))

Avant SMOTE :
Classe 0 : 7963
Classe 1 : 2037


In [10]:

smote = SMOTE(random_state=42, sampling_strategy="auto")
X_resampled, y_resampled = smote.fit_resample(X, y)

print("\nApr√®s SMOTE :")
unique, counts = np.unique(y_resampled, return_counts=True)
for cls, cnt in zip(unique, counts):
    print(f"Classe {cls} : {cnt}")



Apr√®s SMOTE :
Classe 0 : 7963
Classe 1 : 7963


In [11]:
import pandas as pd
import numpy as np

# Supposons que X_resampled et y_resampled existent d√©j√†
resampled_data = [
    {
        "Exited": int(y_resampled[i]),
        "CreditScore": float(X_resampled[i][0]),
        "Age": float(X_resampled[i][1]),
        "Balance": float(X_resampled[i][2]),
        "EstimatedSalary": float(X_resampled[i][3]),
        "Tenure": int(X_resampled[i][4]),
        "NumOfProducts": int(X_resampled[i][5]),
        "HasCrCard": int(X_resampled[i][6]),
        "IsActiveMember": int(X_resampled[i][7]),
        "Geography_Index": int(X_resampled[i][8]),
        "Gender_Index": int(X_resampled[i][9]),
    }
    for i in range(len(y_resampled))
]

df = pd.DataFrame(resampled_data)

print(df.head())
print(df['Exited'].value_counts())


   Exited  CreditScore       Age    Balance  EstimatedSalary  Tenure  \
0       1        619.0  3.761200       0.00        101348.88       2   
1       0        608.0  3.737670   83807.86        112542.58       1   
2       1        502.0  3.761200  159660.80        113931.57       8   
3       0        699.0  3.688879       0.00         93826.63       1   
4       0        850.0  3.784190  125510.82         79084.10       2   

   NumOfProducts  HasCrCard  IsActiveMember  Geography_Index  Gender_Index  
0              1          1               1                0             1  
1              1          0               1                2             1  
2              3          1               0                0             1  
3              2          0               0                0             1  
4              1          1               1                2             1  
Exited
1    7963
0    7963
Name: count, dtype: int64


### 3Ô∏è‚É£ S√©lection et assemblage des features (VectorAssembler)

In [12]:
# S√©parer features et target

feature_cols = [
    'CreditScore', 'Age', 'Balance', 'EstimatedSalary',
    'Tenure', 'NumOfProducts', 'HasCrCard',
    'IsActiveMember', 'Geography_Index', 'Gender_Index'
]

X = df[feature_cols]
y = df['Exited']


In [13]:
# Split Train/Test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (12740, 10), Test shape: (3186, 10)


In [16]:
# Cr√©er le pipeline scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(max_iter=1000, solver='saga'))
])


In [17]:
# Recherche de param√®tres (GridSearchCV)

from sklearn.model_selection import GridSearchCV

param_grid = {
    'lr__C': [1/0.01, 1/0.05, 1/0.1, 1/0.2, 1/0.5],  # regParam = 1/C
    'lr__l1_ratio': [0.0, 0.3, 0.5, 0.7, 1.0],       # elasticNetParam
    'lr__penalty': ['elasticnet'],                    # utiliser ElasticNet
}

grid_search = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 25 candidates, totalling 75 fits


0,1,2
,estimator,Pipeline(step...ver='saga'))])
,param_grid,"{'lr__C': [100.0, 20.0, ...], 'lr__l1_ratio': [0.0, 0.3, ...], 'lr__penalty': ['elasticnet']}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'elasticnet'
,dual,False
,tol,0.0001
,C,2.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,1000


In [18]:
pipeline_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(max_iter=1000, solver='saga'))
])


In [19]:
# Meilleur mod√®le et pr√©dictions
best_model = grid_search.best_estimator_

print("Meilleurs param√®tres :")
print(grid_search.best_params_)

y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]


Meilleurs param√®tres :
{'lr__C': 2.0, 'lr__l1_ratio': 0.0, 'lr__penalty': 'elasticnet'}


In [20]:
# √âvaluation du mod√®le

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

auc = roc_auc_score(y_test, y_prob)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("‚úÖ √âvaluation du mod√®le :")
print(f"AUC-ROC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print("\nConfusion Matrix:")
print(cm)


‚úÖ √âvaluation du mod√®le :
AUC-ROC: 0.8337
Accuracy: 0.7505
Precision: 0.7430
Recall: 0.7659
F1-score: 0.7543

Confusion Matrix:
[[1171  422]
 [ 373 1220]]


In [21]:
# Sauvegarde du mod√®le

import joblib

model_path = "../models/best_lr_model.pkl"
joblib.dump(best_model, model_path)
print(f"‚úÖ Mod√®le sauvegard√© dans : {model_path}")


‚úÖ Mod√®le sauvegard√© dans : ../models/best_lr_model.pkl
