# S07 - Démonstration 2: Telco (complément sur le prétraitement)

Prétraitement [des données de désabonnement du service client](https://www.kaggle.com/blastchar/telco-customer-churn) pour la classification.

In [1]:
# Importation des librairies
import numpy as np
import pandas as pd

In [2]:
# Load the Telco customer-churn dataset straight from the course repository.
# Column descriptions: https://www.kaggle.com/blastchar/telco-customer-churn
url = 'https://raw.githubusercontent.com/acedesci/scanalytics/master/data/Telco-Customer-Churn.csv'
customer_data = pd.read_csv(filepath_or_buffer=url)
# Preview the first five rows.
customer_data.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Empty or whitespace-only strings are treated as missing values: turn them
# into NaN, then discard every row that contains at least one NaN.
blank_pattern = r'^\s*$'
customer_data = customer_data.replace(blank_pattern, np.nan, regex=True).dropna()

# Convert TotalCharges to a numeric dtype; this is done only after the rows
# with missing values have been removed.
customer_data['TotalCharges'] = pd.to_numeric(customer_data['TotalCharges'])
customer_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Collapse the "no X service" labels into a plain 'No'.
# Maps each affected column to the label that must be rewritten.
no_service_labels = {
    'MultipleLines': 'No phone service',
    'OnlineSecurity': 'No internet service',
    'OnlineBackup': 'No internet service',
    'DeviceProtection': 'No internet service',
    'TechSupport': 'No internet service',
    'StreamingTV': 'No internet service',
    'StreamingMovies': 'No internet service',
}
for column, label in no_service_labels.items():
    customer_data[column] = customer_data[column].replace({label: 'No'})
customer_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# One-hot encode the qualitative variables, keeping k-1 indicator columns per
# variable (drop_first=True). The first column (customerID) is skipped: it is
# an identifier, not a feature to encode.
columns_to_transform = customer_data.columns[1:]
# dtype is pinned to 'uint8' so the indicator columns are 0/1 integers on every
# pandas version; since pandas 2.0 the default dtype is bool (True/False).
# Only the newly created indicator columns are affected by this argument.
customer_dummies = pd.get_dummies(
    customer_data[columns_to_transform],
    drop_first=True,
    dtype='uint8',
)
# Optional: uncomment the next line to save the encoded table to disk.
# customer_dummies.to_csv('Telco-Customer-Churn_dummies.csv', index=False)
customer_dummies.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_Fiber optic,...,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,1,29.85,29.85,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,34,56.95,1889.5,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,2,53.85,108.15,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,1
3,0,45,42.3,1840.75,1,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,0,2,70.7,151.65,0,0,0,1,0,1,...,0,0,0,0,0,1,0,1,0,1
