In [53]:
# Data processing
import pandas as pd
import sklearn

# Model preprocessing
from sklearn.model_selection import train_test_split

# Over sampling and under sampling
from sklearn.utils import resample

# Normalize
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [54]:
# Locations of the cleaned CSV files produced by the EDA step
feature_file_path = '../../eda/Output/feature_clean.csv'
label_file_path = '../../eda/Output/labels_clean.csv'

# Read the feature matrix and the label column into pandas DataFrames
feature, label = (
    pd.read_csv(csv_path)
    for csv_path in (feature_file_path, label_file_path)
)

# Preview the first five rows of the feature table
print("feature:")
feature.head()


feature:


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,...,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
0,0,0,1,0,0,1,1,0,0,0,...,0,0,0,0,0,1,0,1,29.85,5.463515
1,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,34,56.95,43.468379
2,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,2,53.85,10.399519
3,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,45,42.3,42.903963
4,0,0,0,0,1,1,0,0,1,0,...,0,0,0,0,0,1,0,2,70.7,12.314625


In [55]:
# Split the data into training (60%), validation (20%) and test (20%) sets
X, Y = feature, label

seed = 42

# Step 1: keep 60% for training; the remaining 40% is a temporary hold-out.
# Stratifying on the labels keeps the class ratio identical in both parts.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.4, random_state=seed, stratify=Y
)

# Step 2: cut the hold-out in half -> validation (20%) and test (20%)
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=seed, stratify=y_holdout
)

# Report the shape of every split
print("x_train taille :", x_train.shape)
print("y_train taille :", y_train.shape, '\n')

print("x_val taille :", x_val.shape)
print("y_val taille :", y_val.shape, '\n')

print("x_test taille :", x_test.shape)
print("y_test taille :", y_test.shape)

x_train taille : (4219, 30)
y_train taille : (4219, 1) 

x_val taille : (1406, 30)
y_val taille : (1406, 1) 

x_test taille : (1407, 30)
y_test taille : (1407, 1)


In [56]:
# Relative frequency of classes 0 and 1 (Churn) in the full label set
# and in each split, to confirm that stratification kept the ratio.
freq_all = pd.DataFrame(Y).value_counts(normalize=True)
freq_train = pd.DataFrame(y_train).value_counts(normalize=True)
freq_test = pd.DataFrame(y_test).value_counts(normalize=True)
freq_val = pd.DataFrame(y_val).value_counts(normalize=True)

print("frequence y :", freq_all, "\n")
print("frequence y_train :", freq_train, "\n")
print("frequence y_test :", freq_test, "\n")
print("frequence y_val :", freq_val)


frequence y : Churn
0        0.734215
1        0.265785
dtype: float64 

frequence y_train : Churn
0        0.734297
1        0.265703
dtype: float64 

frequence y_test : Churn
0        0.734186
1        0.265814
dtype: float64 

frequence y_val : Churn
0        0.733997
1        0.266003
dtype: float64


### Résolution du problème de déséquilibre des classes

In [57]:
# Preparation for resampling: attach the label to the training features.
# NOTE: X2 is an alias of x_train (no copy is made), so the 'Churn' column
# added below also appears on x_train. The later cell that builds
# train_feature drops 'Churn' from x_train and therefore depends on this.
X2 = x_train
X2['Churn'] = y_train.values

# Separate the two classes: churners (1) are the minority, non-churners (0) the majority
churn_flag = X2['Churn']
minority = X2[churn_flag == 1]
majority = X2[churn_flag == 0]

#### Sur-échantillonnage

In [58]:
# Over-sampling: draw minority rows with replacement until the minority
# class has as many observations as the majority class.
n_majority = len(majority)
minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=n_majority,
    random_state=seed,
)

# Stack the untouched majority class on top of the enlarged minority class
upsample = pd.concat([majority, minority_upsampled])

# Show the class counts of the over-sampled DataFrame
upsample['Churn'].value_counts()

0    3098
1    3098
Name: Churn, dtype: int64

In [59]:
# Training set derived from the over-sampled data:
# the label is the 'Churn' column, the features are every other column.
y_train_up = upsample['Churn']
x_train_up = upsample.drop(columns='Churn')

#### Sous-échantillonnage

In [60]:
# Under-sampling: shrink the majority class to the size of the minority class.
# Rows are drawn WITHOUT replacement: the majority class has more rows than
# requested, so sampling with replacement would only introduce duplicated rows
# while discarding even more distinct observations.
majority_downsampled = resample(majority, replace = False, n_samples = len(minority), random_state = seed)

# Concatenate the minority class with the reduced majority class
downsample = pd.concat([minority, majority_downsampled])

# Show the class counts of the under-sampled DataFrame
downsample['Churn'].value_counts()

1    1121
0    1121
Name: Churn, dtype: int64

In [61]:
# Training set derived from the under-sampled data:
# the label is the 'Churn' column, the features are every other column.
y_train_down = downsample['Churn']
x_train_down = downsample.drop(columns='Churn')

In [62]:
# Working names for the datasets used by the normalization and export steps.
# These are plain references to the same objects, not copies.
train_feature_up = x_train_up
train_labels_up = y_train_up

train_feature_down = x_train_down
train_labels_down = y_train_down

# x_train carries a 'Churn' column only because the resampling preparation
# cell added it through an alias. errors="ignore" makes this cell work whether
# or not that column is present, instead of raising KeyError.
train_feature = x_train.drop(columns="Churn", errors="ignore")
train_labels = y_train

test_feature = x_test
test_labels = y_test

val_feature = x_val
val_labels = y_val

### Normalisation

In [63]:
# Normalization pipeline: MinMaxScaler rescales every feature to [0, 1]
# based on the minimum and maximum seen during fit.
normalization_pipeline = Pipeline([
    ('scaler', MinMaxScaler())
])

# Fit the scaler ONCE, on the original training features only, so that no
# information from the validation or test sets leaks into the scaling.
train_feature_normalized = normalization_pipeline.fit_transform(train_feature)

# Validation and test sets are transformed with the training-set statistics.
val_feature_normalized = normalization_pipeline.transform(val_feature)

test_feature_normalized = normalization_pipeline.transform(test_feature)

# The over- and under-sampled sets are made of rows drawn from the training
# set, so they reuse the already fitted scaler (transform, not fit_transform).
# Re-fitting on them would scale each training variant differently from the
# validation/test sets and would leave the pipeline fitted on the last
# resampled set instead of the real training set.
train_feature_up_normalized = normalization_pipeline.transform(train_feature_up)

train_feature_down_normalized = normalization_pipeline.transform(train_feature_down)

# Wrap the NumPy results back into DataFrames with the original column names
train_feature = pd.DataFrame(train_feature_normalized, columns=train_feature.columns)
val_feature   = pd.DataFrame(val_feature_normalized, columns=val_feature.columns)
test_feature  = pd.DataFrame(test_feature_normalized, columns=test_feature.columns)

train_feature_up = pd.DataFrame(train_feature_up_normalized, columns=train_feature_up.columns)
train_feature_down = pd.DataFrame(train_feature_down_normalized, columns=train_feature_down.columns)

In [64]:
# Save every prepared dataset as CSV (row index not written).
# Each entry pairs the destination path with the object to export;
# files are written in the order listed.
outputs = (
    ('../Output/train_feature_up.csv', train_feature_up),
    ('../Output/train_labels_up.csv', train_labels_up),
    ('../Output/train_feature_down.csv', train_feature_down),
    ('../Output/train_labels_down.csv', train_labels_down),
    ('../Output/train_feature.csv', train_feature),
    ('../Output/train_labels.csv', train_labels),
    ('../Output/test_feature.csv', test_feature),
    ('../Output/test_labels.csv', test_labels),
    ('../Output/val_feature.csv', val_feature),
    ('../Output/val_labels.csv', val_labels),
)

for csv_path, dataset in outputs:
    dataset.to_csv(csv_path, index=False)

In [65]:
# Sanity check: summary statistics of the normalized training features,
# used to verify that every value lies between 0 and 1 (see min / max rows).
train_feature.describe()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,...,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
count,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,...,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0,4219.0
mean,0.502963,0.0,0.481868,0.297701,0.904717,0.593269,0.095283,0.424034,0.440152,0.218298,...,0.218298,0.38374,0.204314,0.247689,0.21735,0.333728,0.227305,0.44367,0.463144,0.41372
std,0.50005,0.0,0.49973,0.457301,0.293641,0.491282,0.293641,0.494254,0.496464,0.41314,...,0.41314,0.486354,0.403247,0.431721,0.412492,0.4716,0.419141,0.345334,0.302463,0.276951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.112676,0.166667,0.176651
50%,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.394366,0.51996,0.370666
75%,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.774648,0.71507,0.647234
max,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
