# Pr√©diction de l'Attrition Client Bancaire : Pipeline de Machine Learning Distribu√© avec Apache Spark

* imports : 

In [None]:


from pyspark.sql import SparkSession
from pymongo import MongoClient
import pyspark
import pandas as pd
import os
import sys
import numpy as np

from pyspark import StorageLevel
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from imblearn.over_sampling import SMOTE
import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler




## üü¢ Construction du Pipeline de Machine Learning

### 1Ô∏è‚É£ R√©cup√©ration des donn√©es pr√©trait√©es

In [50]:

# Configuration des variables d'environnement Python
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [51]:


print("üîç V√©rification de la version PySpark...")
print(f"Version PySpark: {pyspark.__version__}")



üîç V√©rification de la version PySpark...
Version PySpark: 3.5.7


In [52]:


# Cr√©er une session Spark simple (sans MongoDB JAR)
spark = SparkSession.builder \
    .appName("MongoDB-PySpark-PyMongo") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.python.worker.timeout", "600") \
    .master("local[*]") \
    .getOrCreate()


In [53]:

# Connexion √† MongoDB avec PyMongo
print("\nüîå Connexion √† MongoDB...")
client = MongoClient("mongodb://localhost:27017/")
db = client["Attrition_Client_Bancaire_db"]
collection = db["clients_pretraite"]



üîå Connexion √† MongoDB...


In [54]:

# Compter les documents
total_docs = collection.count_documents({})
print(f"üìä Total documents dans MongoDB : {total_docs}")

# Lire les donn√©es
data = list(collection.find())

# Convertir en DataFrame Pandas
df_pandas = pd.DataFrame(data)

display(df_pandas.head(5))


üìä Total documents dans MongoDB : 10000


Unnamed: 0,_id,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Index,Gender_Index
0,690cfee3dd8b13fbc893bfd9,619.0,3.7612,2,0.0,1,1,1,101348.88,1,0,1
1,690cfee3dd8b13fbc893bfda,608.0,3.73767,1,83807.86,1,0,1,112542.58,0,2,1
2,690cfee3dd8b13fbc893bfdb,502.0,3.7612,8,159660.8,3,1,0,113931.57,1,0,1
3,690cfee3dd8b13fbc893bfdc,699.0,3.688879,1,0.0,2,0,0,93826.63,0,0,1
4,690cfee3dd8b13fbc893bfdd,850.0,3.78419,2,125510.82,1,1,1,79084.1,0,2,1


In [55]:

# Supprimer le champ _id 
if '_id' in df_pandas.columns:
    df_pandas = df_pandas.drop('_id', axis=1)

print(f"‚úÖ Donn√©es charg√©es dans Pandas : {len(df_pandas)} lignes")
print(f"üìã Colonnes : {list(df_pandas.columns)}")


‚úÖ Donn√©es charg√©es dans Pandas : 10000 lignes
üìã Colonnes : ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited', 'Geography_Index', 'Gender_Index']


In [56]:

# Convertir en DataFrame Spark
df = spark.createDataFrame(df_pandas)

print("\n‚úÖ Donn√©es converties en Spark DataFrame :")
df.printSchema()
df.show(5)

print(f"\nüìä Nombre total de lignes : {df.count()}")

# Fermer la connexion MongoDB
client.close()


‚úÖ Donn√©es converties en Spark DataFrame :
root
 |-- CreditScore: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Tenure: long (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: long (nullable = true)
 |-- HasCrCard: long (nullable = true)
 |-- IsActiveMember: long (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: long (nullable = true)
 |-- Geography_Index: long (nullable = true)
 |-- Gender_Index: long (nullable = true)

+-----------+------------------+------+---------+-------------+---------+--------------+---------------+------+---------------+------------+
|CreditScore|               Age|Tenure|  Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|Geography_Index|Gender_Index|
+-----------+------------------+------+---------+-------------+---------+--------------+---------------+------+---------------+------------+
|      619.0|3.7612001156935624|     2|      0.0|            1|        1|   

In [57]:
df.count()

10000

### 2Ô∏è‚É£ Gestion du d√©s√©quilibre de classes (undersampling / SMOTE)

In [58]:

data = df.select("Exited", "CreditScore", "Age", "Balance", "EstimatedSalary",
                 "Tenure", "NumOfProducts", "HasCrCard", "IsActiveMember").collect()

# display(data)


X = np.array([[row["CreditScore"], row["Age"], row["Balance"], row["EstimatedSalary"],
               row["Tenure"], row["NumOfProducts"], row["HasCrCard"], row["IsActiveMember"]]
              for row in data])

y = np.array([row["Exited"] for row in data])

print("Avant SMOTE :")
print("Classe 0 :", sum(y == 0))
print("Classe 1 :", sum(y == 1))

Avant SMOTE :
Classe 0 : 7963
Classe 1 : 2037


In [59]:

smote = SMOTE(random_state=42, sampling_strategy="auto")
X_resampled, y_resampled = smote.fit_resample(X, y)

print("\nApr√®s SMOTE :")
unique, counts = np.unique(y_resampled, return_counts=True)
for cls, cnt in zip(unique, counts):
    print(f"Classe {cls} : {cnt}")



Apr√®s SMOTE :
Classe 0 : 7963
Classe 1 : 7963


In [None]:
# Create the resampled data with proper column names
resampled_data = [
    Row(
        Exited=int(y_resampled[i]),
        CreditScore=float(X_resampled[i][0]),
        Age=float(X_resampled[i][1]),
        Balance=float(X_resampled[i][2]),
        EstimatedSalary=float(X_resampled[i][3]),
        Tenure=float(X_resampled[i][4]),
        NumOfProducts=float(X_resampled[i][5]),
        HasCrCard=float(X_resampled[i][6]),
        IsActiveMember=float(X_resampled[i][7])
    )
    for i in range(len(y_resampled))
]

# Create pandas DataFrame
df_pd = pd.DataFrame(resampled_data)

# Create Spark DataFrame directly from the list of Row objects
df_smote = spark.createDataFrame(resampled_data)


print(f"Total count: {df_smote.count()}")

print("\n‚úÖ Donn√©es √©quilibr√©es :")
df_smote.groupBy("Exited").count().show()

print("\nüìà Statistiques descriptives :")
df_smote.describe().show()

Total count: 15926

‚úÖ Donn√©es √©quilibr√©es :
+------+-----+
|Exited|count|
+------+-----+
|     0| 7963|
|     1| 7963|
+------+-----+


üìà Statistiques descriptives :
+-------+------------------+-----------------+-------------------+-----------------+------------------+-----------------+------------------+-------------------+-------------------+
|summary|            Exited|      CreditScore|                Age|          Balance|   EstimatedSalary|           Tenure|     NumOfProducts|          HasCrCard|     IsActiveMember|
+-------+------------------+-----------------+-------------------+-----------------+------------------+-----------------+------------------+-------------------+-------------------+
|  count|             15926|            15926|              15926|            15926|             15926|            15926|             15926|              15926|              15926|
|   mean|               0.5|649.5956485945403| 3.7094709363406992|81920.97404269397|100526.31316938631

In [61]:
df_smote.columns

['Exited',
 'CreditScore',
 'Age',
 'Balance',
 'EstimatedSalary',
 'Tenure',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember']

### 3Ô∏è‚É£ S√©lection et assemblage des features (VectorAssembler)

In [62]:
feature_cols = [
 'CreditScore',
 'Age',
 'Balance',
 'EstimatedSalary',
 'Tenure',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember'
]


assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="assembled_features"
)


df_vector = assembler.transform(df_smote)


df_vector.select("assembled_features", "Exited").show(5, truncate=False)


df_vector.printSchema()


+-------------------------------------------------------------+------+
|assembled_features                                           |Exited|
+-------------------------------------------------------------+------+
|[619.0,3.7612001156935624,0.0,101348.88,2.0,1.0,1.0,1.0]     |1     |
|[608.0,3.7376696182833684,83807.86,112542.58,1.0,1.0,0.0,1.0]|0     |
|[502.0,3.7612001156935624,159660.8,113931.57,8.0,3.0,1.0,0.0]|1     |
|[699.0,3.6888794541139363,0.0,93826.63,1.0,2.0,0.0,0.0]      |0     |
|[850.0,3.784189633918261,125510.82,79084.1,2.0,1.0,1.0,1.0]  |0     |
+-------------------------------------------------------------+------+
only showing top 5 rows

root
 |-- Exited: long (nullable = true)
 |-- CreditScore: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Balance: double (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Tenure: double (nullable = true)
 |-- NumOfProducts: double (nullable = true)
 |-- HasCrCard: double (nullable = true)
 |-- Is

### 4Ô∏è‚É£ Normalisation des features (StandardScaler ou MinMaxScaler)


In [None]:

# Cr√©ation du StandardScaler
scaler = StandardScaler(
    inputCol="assembled_features",
    outputCol="scaled_features",
    withMean=True,
    withStd=True
)

scaler_model = scaler.fit(df_vector)
df_scaled = scaler_model.transform(df_vector)

df_scaled.select("assembled_features", "scaled_features").show(3)


+--------------------+--------------------+
|  assembled_features|     scaled_features|
+--------------------+--------------------+
|[619.0,3.76120011...|[-0.3361769558682...|
|[608.0,3.73766961...|[-0.4570420685369...|
|[502.0,3.76120011...|[-1.6217422451625...|
+--------------------+--------------------+
only showing top 3 rows

