In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import os

# Importation de la base de données MEMBERS


In [2]:
# Load the raw members file from the original-data folder.
members_csv_path = "DATA_ORIGINAL/MEMBERS_DIM.csv"
df_membre = pd.read_csv(members_csv_path)

In [3]:
df_membre.head()

Unnamed: 0,MEMBER_ID,AGE,GENDER,LANGUAGE,TIER,SMALL_BUSINESS_FLAG,TENURE_MONTHS,PROV,CASH_BACK_POINTS_BALANCE,REWARD_POINTS_BALANCE,EMAILABLE_FLAG,MAILABLE_FLAG
0,859615,80.0,M,E,Basic,N,362,ON,14,636,N,Y
1,12684534,60.0,F,E,Basic,N,256,MB,0,419,N,Y
2,14130778,47.0,F,F,Basic,N,236,QC,116,195,Y,Y
3,18899440,39.0,,F,Basic,N,185,QC,0,7,N,Y
4,1011315776,,,E,Basic,N,47,,0,0,,N


In [4]:
df_membre.shape

(1202985, 12)

In [5]:
# Duplicated rows: count the rows flagged by DataFrame.duplicated(),
# which compares every column of the frame.
df_membre.duplicated().sum()

0

In [6]:
# Drop TIER: it is a segmentation already produced by Air Miles, so it must
# not feed our own segmentation.
# The frame is rebound (no inplace=True) and errors="ignore" is used so the
# cell can be re-run without raising KeyError once TIER is already gone.
df_membre = df_membre.drop(columns="TIER", errors="ignore")

# Analyse exploratoire de la base de données

In [7]:
# Percentage of missing values per column (null count / row count * 100).
n_rows = len(df_membre)
df_membre.isna().sum().div(n_rows).mul(100)

MEMBER_ID                    0.000000
AGE                         14.547230
GENDER                      33.274646
LANGUAGE                     0.012552
SMALL_BUSINESS_FLAG          0.000000
TENURE_MONTHS                0.000000
PROV                         6.140642
CASH_BACK_POINTS_BALANCE     0.000000
REWARD_POINTS_BALANCE        0.000000
EMAILABLE_FLAG              20.047299
MAILABLE_FLAG                0.000000
dtype: float64

In [8]:
# Data type of each column.
df_membre.dtypes

MEMBER_ID                     int64
AGE                         float64
GENDER                       object
LANGUAGE                     object
SMALL_BUSINESS_FLAG          object
TENURE_MONTHS                 int64
PROV                         object
CASH_BACK_POINTS_BALANCE      int64
REWARD_POINTS_BALANCE         int64
EMAILABLE_FLAG               object
MAILABLE_FLAG                object
dtype: object

In [9]:
# Show floats in plain fixed-point notation (no scientific notation).
# The fully qualified option name is used: pandas resolves shorthand keys by
# pattern matching, which can start failing if a future option has a similar name.
pd.set_option('display.float_format', '{:f}'.format)

In [10]:
# Descriptive statistics of the numeric columns.
# MEMBER_ID is left out because it is an identifier, not a measure.
numeric_view = df_membre.drop(columns='MEMBER_ID')
numeric_view.describe()

Unnamed: 0,AGE,TENURE_MONTHS,CASH_BACK_POINTS_BALANCE,REWARD_POINTS_BALANCE
count,1027984.0,1202985.0,1202985.0,1202985.0
mean,52.630494,189.669142,113.480202,1764.888851
std,22.4505,112.615777,804.555146,5189.435648
min,-6101.0,0.0,-17080.0,-16457.0
25%,39.0,83.0,0.0,0.0
50%,53.0,195.0,0.0,290.0
75%,66.0,292.0,66.0,1657.0
max,2021.0,372.0,157002.0,982394.0


In [11]:
# Most frequent value of each numeric column (MEMBER_ID excluded).
# .mode() returns a DataFrame; .loc[0] keeps only its first row.
mode = df_membre.drop('MEMBER_ID', axis=1).mode(numeric_only=True).loc[0]
mode

AGE                         59.000000
TENURE_MONTHS              304.000000
CASH_BACK_POINTS_BALANCE     0.000000
REWARD_POINTS_BALANCE        0.000000
Name: 0, dtype: float64

In [12]:
# Descriptive statistics of the string (object-dtype) columns:
# count, number of unique values, most frequent value and its frequency.
df_membre.describe(include=[object])

Unnamed: 0,GENDER,LANGUAGE,SMALL_BUSINESS_FLAG,PROV,EMAILABLE_FLAG,MAILABLE_FLAG
count,802696,1202834,1202985,1129114,961819,1202985
unique,2,2,1,13,2,2
top,F,E,N,ON,Y,Y
freq,478506,994936,1202985,420382,489998,1060413



## <span style='background :yellow' > Variable MEMBER_ID </span>

In [13]:
# Check that each row is a distinct customer: the number of distinct
# MEMBER_ID values must equal the number of rows.
n_distinct_members = df_membre["MEMBER_ID"].nunique()
n_distinct_members == len(df_membre)


True

## <span style='background :yellow' > Variable AGE </span>

[liste de supercentenaires ](https://en.wikipedia.org/wiki/List_of_Canadian_supercentenarians)

[Âge minimum](https://www.airmiles.ca/en/terms-and-conditions.html)

In [14]:
# Rows whose AGE is implausible:
#   - above 117, the maximum human age recorded in Canada,
#   - below 16, the minimum age to hold an Air Miles card.
# Missing ages compare False on both sides, so they are not selected here.
too_young = df_membre["AGE"] < 16
too_old = df_membre["AGE"] > 117
df_membre[too_young | too_old]

Unnamed: 0,MEMBER_ID,AGE,GENDER,LANGUAGE,SMALL_BUSINESS_FLAG,TENURE_MONTHS,PROV,CASH_BACK_POINTS_BALANCE,REWARD_POINTS_BALANCE,EMAILABLE_FLAG,MAILABLE_FLAG
2108,1010915155,131.000000,F,E,N,84,MB,0,181,Y,Y
6618,18753959,15.000000,,E,N,183,ON,0,0,N,Y
8596,1000833768,138.000000,,E,N,175,ON,0,1993,N,Y
12922,1002256082,122.000000,,E,N,158,AB,0,6833,N,N
18666,1003132105,12.000000,,E,N,151,ON,0,2434,N,Y
...,...,...,...,...,...,...,...,...,...,...,...
1192665,13186221,122.000000,,E,N,247,ON,0,8,Y,Y
1193285,4333212,123.000000,F,E,N,303,ON,495,857,N,Y
1193624,11787359,140.000000,,E,N,257,ON,22,2,N,Y
1196459,1016138760,2021.000000,,E,N,42,ON,0,2001,N,Y


In [15]:
# Missing ages: absolute count and share of the whole table.
nb_null_age = df_membre["AGE"].isnull().sum()
# Reuse the count computed above instead of scanning the column a second time.
percent_null_age = nb_null_age / df_membre.shape[0] * 100

print('nb de membres avec age manquante: ',nb_null_age)
print('pourcentage de la bae de donnees avec age manquante: ',percent_null_age)


nb de membres avec age manquante:  175001
pourcentage de la bae de donnees avec age manquante:  14.547230430969629


In [16]:
# Comme AGE est une variable sociodémographique, on ne l'utilisera pas dans la segmentation.
# Elle sera supprimée dans le fichier de segmentation, avant la segmentation manuelle et la segmentation k-means.
#df_membre.drop('AGE', axis=1, inplace=True)

## <span style='background :yellow' > Variable GENDER </span>

In [17]:
df_membre.GENDER.value_counts(dropna=False)

F      478506
NaN    400289
M      324190
Name: GENDER, dtype: int64

In [18]:
# Comme GENDER est une variable sociodémographique, on ne l'utilisera pas dans la segmentation.
# Elle sera supprimée dans le fichier de segmentation, avant la segmentation manuelle et la segmentation k-means.
#df_membre.drop('GENDER', axis=1, inplace=True)

## <span style='background :yellow' > Variable LANGUAGE </span>

In [19]:
df_membre.LANGUAGE.value_counts(dropna=False)

E      994936
F      207898
NaN       151
Name: LANGUAGE, dtype: int64

In [20]:
# LANGUAGE is a socio-demographic variable, so it is not used in the
# segmentation and is removed here.
# The frame is rebound (no inplace=True) and errors="ignore" is used so the
# cell can be re-run without raising KeyError once LANGUAGE is already gone.
df_membre = df_membre.drop(columns="LANGUAGE", errors="ignore")

## <span style='background :yellow' > Variable SMALL_BUSINESS_FLAG </span>

In [21]:
df_membre.SMALL_BUSINESS_FLAG.value_counts(dropna=False)

N    1202985
Name: SMALL_BUSINESS_FLAG, dtype: int64

In [22]:
# SMALL_BUSINESS_FLAG takes a single value ('N') for every member, so it
# carries no information and is not used in the segmentation.
# The frame is rebound (no inplace=True) and errors="ignore" is used so the
# cell can be re-run without raising KeyError once the column is already gone.
df_membre = df_membre.drop(columns="SMALL_BUSINESS_FLAG", errors="ignore")

## <span style='background :yellow' > Variable TENURE_MONTHS </span>

In [23]:
# Observed range of TENURE_MONTHS, compared with the number of months
# between the programme's launch year and the reference year.
tenure_months = df_membre['TENURE_MONTHS']
min_TM = tenure_months.min()
max_TM = tenure_months.max()

launch_year = 1992
reference_year = 2023

print('min : ',min_TM)
print('max : ',max_TM)
print('Air Miles a ete Lancé au Canada en 1992')
print('anciennete en mois airmiles: ',(reference_year - launch_year) * 12)

min :  0
max :  372
Air Miles a ete Lancé au Canada en 1992
anciennete en mois airmiles:  372


In [24]:
df_membre.TENURE_MONTHS.value_counts(dropna=False)

304    13387
254    12837
369    12637
368    12467
299    12258
       ...  
371        2
9          1
7          1
2          1
372        1
Name: TENURE_MONTHS, Length: 370, dtype: int64

## <span style='background :yellow' > Variable PROV </span>


In [25]:
# Share of members per province, in percent of all rows (missing PROV included).
prov_counts = df_membre["PROV"].value_counts(dropna=False)
prov_counts.div(len(df_membre)).mul(100)

ON    34.944908
QC    20.933594
AB    11.687511
BC    10.265880
NaN    6.140642
NS     4.014015
MB     3.809441
NB     2.873103
SK     2.644089
NL     2.022220
PE     0.554038
NT     0.054697
YT     0.040981
NU     0.014880
Name: PROV, dtype: float64

In [26]:
# Comme PROV est une variable sociodémographique, on ne l'utilisera pas dans la segmentation.
# Elle sera supprimée dans le fichier de segmentation, avant la segmentation manuelle et la segmentation k-means.
#df_membre.drop('PROV', axis=1, inplace=True)

## <span style='background :yellow' > Variable CASH_BACK_POINTS_BALANCE </span>

Nombre de points actuellement dans le compte pour des remises en argent.


In [27]:
# Range of the cash-back points balance and number of accounts in the negative.
cash_back = df_membre["CASH_BACK_POINTS_BALANCE"]
min_CBPB = cash_back.min()
max_CBPB = cash_back.max()
# Summing the boolean mask counts the rows with a strictly negative balance.
nb_CBPB_neg = int((cash_back < 0).sum())

print('min CASH_BACK_POINTS_BALANCE: ',min_CBPB)
print('max CASH_BACK_POINTS_BALANCE: ',max_CBPB)
print('Nombre de lignes avec des balance de points cash back négatives: ',nb_CBPB_neg)

min CASH_BACK_POINTS_BALANCE:  -17080
max CASH_BACK_POINTS_BALANCE:  157002
Nombre de lignes avec des balance de points cash back négatives:  64


## <span style='background :yellow' > Variable REWARD_POINTS_BALANCE </span>

In [28]:
# Range of the reward points balance and number of accounts in the negative.
reward = df_membre['REWARD_POINTS_BALANCE']
min_RPB = reward.min()
max_RPB = reward.max()
# Summing the boolean mask counts the rows with a strictly negative balance.
nb_RPB_neg = int((reward < 0).sum())

print('min REWARD_POINTS_BALANCE: ',min_RPB)
print('max REWARD_POINTS_BALANCE: ',max_RPB)
print('Nombre de lignes avec des balance de points reward négatives: ',nb_RPB_neg)

min REWARD_POINTS_BALANCE:  -16457
max REWARD_POINTS_BALANCE:  982394
Nombre de lignes avec des balance de points reward négatives:  50


## <span style='background :yellow' > Variable EMAILABLE_FLAG </span>

In [29]:
df_membre.EMAILABLE_FLAG.value_counts(dropna=False)

Y      489998
N      471821
NaN    241166
Name: EMAILABLE_FLAG, dtype: int64

In [30]:
# EMAILABLE_FLAG is not used in the segmentation (it may still be useful for
# the project mandate), so it is removed from this frame.
# The frame is rebound (no inplace=True) and errors="ignore" is used so the
# cell can be re-run without raising KeyError once the column is already gone.
df_membre = df_membre.drop(columns="EMAILABLE_FLAG", errors="ignore")

## <span style='background :yellow' > Variable MAILABLE_FLAG </span>

In [31]:
df_membre.MAILABLE_FLAG.value_counts(dropna=False)

Y    1060413
N     142572
Name: MAILABLE_FLAG, dtype: int64

In [32]:
# MAILABLE_FLAG is not used in the segmentation (it may still be useful for
# the project mandate), so it is removed from this frame.
# The frame is rebound (no inplace=True) and errors="ignore" is used so the
# cell can be re-run without raising KeyError once the column is already gone.
df_membre = df_membre.drop(columns="MAILABLE_FLAG", errors="ignore")

In [33]:
df_membre.isnull().sum()

MEMBER_ID                        0
AGE                         175001
GENDER                      400289
TENURE_MONTHS                    0
PROV                         73871
CASH_BACK_POINTS_BALANCE         0
REWARD_POINTS_BALANCE            0
dtype: int64

In [34]:
df_membre.dtypes

MEMBER_ID                     int64
AGE                         float64
GENDER                       object
TENURE_MONTHS                 int64
PROV                         object
CASH_BACK_POINTS_BALANCE      int64
REWARD_POINTS_BALANCE         int64
dtype: object

In [35]:
df_membre

Unnamed: 0,MEMBER_ID,AGE,GENDER,TENURE_MONTHS,PROV,CASH_BACK_POINTS_BALANCE,REWARD_POINTS_BALANCE
0,859615,80.000000,M,362,ON,14,636
1,12684534,60.000000,F,256,MB,0,419
2,14130778,47.000000,F,236,QC,116,195
3,18899440,39.000000,,185,QC,0,7
4,1011315776,,,47,,0,0
...,...,...,...,...,...,...,...
1202980,1982154,67.000000,F,288,ON,0,1002
1202981,5806235,62.000000,F,351,QC,186,331
1202982,17171682,62.000000,,209,ON,0,1828
1202983,18475293,33.000000,,184,ON,14,932


In [36]:
# Switch controlling the export cell: set to False to run the notebook
# without (re)writing the preprocessed CSV file.
output_to_csv = True


In [37]:
# Export the preprocessed members table (row index not written).
# NOTE(review): the out-of-range AGE values and negative balances inspected
# earlier in the notebook are exported unchanged — confirm that downstream
# notebooks handle them.
if output_to_csv:
    # DataFrame.to_csv does not create missing folders; make sure the target
    # directory exists so the export also works on a fresh checkout.
    output_dir = "DATA_PRETRAITEMENT"
    os.makedirs(output_dir, exist_ok=True)
    df_membre.to_csv(f"{output_dir}/df_membre.csv", index=False)