# 02 - Préparation et nettoyage des données

Ce notebook traite le nettoyage, la gestion des valeurs manquantes, l'encodage et le feature engineering.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

## Chargement des données brutes

In [2]:
# Read the raw Telco customer-churn CSV into the working DataFrame.
raw_csv_path = '../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(raw_csv_path)
# Last expression of the cell: display the first rows as a quick sanity check.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Nettoyage des valeurs manquantes et des types

In [3]:
# Missing-value handling and dtype correction.
# Blank or whitespace-only strings stand for missing values; turn them into
# real NaN so pandas can detect them (the regex also covers '' and '   ').
df = df.replace(r'^\s*$', np.nan, regex=True)
# Fix dtypes BEFORE dropping rows: errors='coerce' turns every unparseable
# value into NaN, and those rows must be caught by the dropna() below rather
# than slipping through into the saved dataset.
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Drop every row that still has at least one missing value
# (adapt this strategy to the context, e.g. imputation instead of deletion).
df = df.dropna()

## Encodage des variables catégorielles

In [4]:
# The customer identifier is a per-row string key, not a feature. If it were
# left in the frame, get_dummies() below would expand it into one indicator
# column per distinct ID.
df = df.drop(columns=['customerID'], errors='ignore')
# Binary encoding of the target. Skip the mapping when the column is already
# numeric, so re-running this cell cannot turn the encoded 0/1 values into NaN.
if 'Churn' in df.columns and not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
# One-hot encoding of the remaining categorical columns; drop_first removes
# the first level of each variable to avoid redundant indicator columns.
df = pd.get_dummies(df, drop_first=True)

## Feature engineering (exemples)

In [5]:
# Example feature: bucket the customer tenure into groups.
if 'tenure' in df.columns:
    tenure_bins = [0, 12, 24, 48, 60, 72]
    tenure_labels = ['0-12', '12-24', '24-48', '48-60', '60-72']
    # include_lowest=True closes the first interval on the left, so that
    # tenure == 0 lands in '0-12' instead of falling outside every bin (NaN).
    # Values above 72 are still outside the bins and become NaN.
    df['tenure_group'] = pd.cut(
        df['tenure'],
        bins=tenure_bins,
        labels=tenure_labels,
        include_lowest=True,
    )
    # One-hot encode the groups; drop_first makes '0-12' the reference level.
    df = pd.get_dummies(df, columns=['tenure_group'], drop_first=True)

## Sauvegarde des données prétraitées

In [6]:
# Write the preprocessed dataset. to_csv() does not create missing parent
# directories, so make sure the output folder exists first.
processed_csv_path = Path('../data/processed/telco_churn_clean.csv')
processed_csv_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written to the file.
df.to_csv(processed_csv_path, index=False)