# Projet : Prédiction du Churn Client (Telco)

Notebook complet : nettoyage, EDA, modélisation et sauvegarde du modèle.

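**Remarque :** place le fichier CSV `WA_Fn-UseC_-Telco-Customer-Churn.csv` dans `/mnt/data/data/` ou `./data/` avant d'exécuter, et vérifie que la variable `path` de la cellule de chargement pointe bien vers l'emplacement choisi.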

In [None]:
# Setup - imports
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import joblib
import matplotlib.pyplot as plt

# Ensure plots render (in Jupyter)
%matplotlib inline


In [None]:
# Load dataset (modify candidate_paths if needed)
# The CSV is looked up in each documented location in order; the first path
# that exists is used and stored in `path`.
candidate_paths = [
    '/mnt/data/data/WA_Fn-UseC_-Telco-Customer-Churn.csv',
    './data/WA_Fn-UseC_-Telco-Customer-Churn.csv',
]
path = next((p for p in candidate_paths if os.path.exists(p)), None)
if path is None:
    # Fail early with every location that was tried, instead of letting
    # read_csv raise a less helpful error.
    raise FileNotFoundError(
        "Dataset not found at any of: "
        + ", ".join(candidate_paths)
        + ". Place the Telco CSV in one of these paths before running."
    )
df = pd.read_csv(path)
df.shape

In [None]:
# Quick preview: show the first five rows of the loaded frame
df.head(n=5)

In [None]:
# Preprocessing
# Convert TotalCharges to numeric; values that cannot be parsed are coerced to
# NaN and filled by the numeric imputer further down.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Drop customerID as identifier (no-op when the column is already gone)
df = df.drop(columns=['customerID'], errors='ignore')

# Target encode: 'Yes' -> 1, 'No' -> 0.
# The mapping is skipped when the column already holds only 0/1 (e.g. when the
# cell is re-run), and any unexpected label raises instead of silently turning
# into NaN inside the target.
churn_codes = {'Yes': 1, 'No': 0}
if not df['Churn'].isin(list(churn_codes.values())).all():
    encoded_churn = df['Churn'].map(churn_codes)
    unmapped = encoded_churn.isna()
    if unmapped.any():
        bad_labels = sorted(df.loc[unmapped, 'Churn'].astype(str).unique())
        raise ValueError(f"Unexpected values in 'Churn' column: {bad_labels}")
    df['Churn'] = encoded_churn

# Identify numerical and categorical feature columns (the target is excluded)
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
if 'Churn' in num_cols:
    num_cols.remove('Churn')
cat_cols = df.select_dtypes(include=['object']).columns.tolist()

print('Numerical cols:', num_cols)
print('Categorical cols:', cat_cols)

# Impute numeric columns with the median and categorical columns with the most
# frequent value.
# NOTE(review): both imputers are fitted on the full dataset, i.e. before the
# train/test split done later in the notebook, so held-out rows influence the
# imputed values. Consider moving imputation into a Pipeline fitted on the
# training split only.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Guard against empty column lists: the imputers cannot be fitted on a frame
# with zero columns (this happens for cat_cols when the cell is re-run after
# the categorical columns were already one-hot encoded).
if num_cols:
    df[num_cols] = num_imputer.fit_transform(df[num_cols])
if cat_cols:
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
    # One-hot encode categorical columns; drop_first removes one level per
    # feature so the dummies are not perfectly collinear.
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Final feature/target split
X = df.drop(columns=['Churn'])
y = df['Churn']
print('X shape:', X.shape, 'y distribution:', y.value_counts(normalize=True))

In [None]:
# EDA - Distribution and simple plots using matplotlib
# Histogram of the binary target: two bins, one per class.
plt.figure(figsize=(6,4))
plt.hist(y, bins=2)
plt.title('Distribution of Churn (0 = No, 1 = Yes)')
plt.xlabel('Churn')
plt.ylabel('Count')
plt.show()

# Tenure vs Churn - mean churn by tenure bucket
# Work on a copy so the helper column does not leak into the modelling frame.
df_temp = df.copy()
if 'tenure' in df_temp.columns:
    tenure = df_temp['tenure']
    bins = [0,3,6,12,24,48,72]
    # include_lowest=True keeps rows with tenure == 0 in the first bucket;
    # with the default (False) they fall outside every interval and are
    # silently dropped from the chart.
    df_temp['tenure_bin'] = pd.cut(tenure, bins=bins, include_lowest=True)
    # observed=False is stated explicitly so every bucket is shown (even an
    # empty one) regardless of the pandas version's default.
    churn_by_tenure = df_temp.groupby('tenure_bin', observed=False)['Churn'].mean()
    churn_by_tenure.plot(kind='bar')
    plt.title('Churn rate by tenure bins')
    plt.xlabel('Tenure bin')
    plt.ylabel('Churn rate')
    plt.show()

In [None]:
# Modeling - train/test split and baseline models
# 80/20 split, stratified on the target so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize every feature column; the scaler is fitted on the training part
# only and then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline Logistic Regression (trained on the scaled features)
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, y_pred)
print('Logistic Regression - Accuracy:', lr_accuracy)
print(classification_report(y_test, y_pred))

# Random Forest (trained on the unscaled features)
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print('Random Forest - Accuracy:', rf_accuracy)
print(classification_report(y_test, y_pred_rf))

In [None]:
# Hyperparameter tuning for Random Forest (GridSearchCV)
# Every combination of the values below is evaluated with 5-fold stratified
# cross-validation on the training split, scored by F1.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=cv,
    scoring='f1',
    n_jobs=-1,
)
gsearch.fit(X_train, y_train)
print('Best params:', gsearch.best_params_)

# Evaluate the best estimator found by the search on the held-out test split.
best_rf = gsearch.best_estimator_
y_pred_best = best_rf.predict(X_test)
churn_proba = best_rf.predict_proba(X_test)[:, 1]  # second probability column
print('Tuned RF - Accuracy:', accuracy_score(y_test, y_pred_best))
print('Tuned RF - F1:', f1_score(y_test, y_pred_best))
print('ROC AUC:', roc_auc_score(y_test, churn_proba))

In [None]:
# Save the best model and scaler
# NOTE(review): the scaler was fitted for the logistic-regression baseline;
# the tuned random forest saved here was trained on unscaled features, so the
# scaler must not be applied before calling the saved model.
out_model_path = '/mnt/data/project_churn_output/best_model.joblib'
out_scaler_path = '/mnt/data/project_churn_output/scaler.joblib'

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing.
for artifact_path in (out_model_path, out_scaler_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

joblib.dump(best_rf, out_model_path)
joblib.dump(scaler, out_scaler_path)
print('Saved model to', out_model_path)
print('Saved scaler to', out_scaler_path)