In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import os
from datetime import datetime
from joblib import dump, load
from functools import reduce

# Importation de la base de données REWARD_FACT


In [2]:
# Load the raw reward facts. MONTH_ID is stored as YYYYMM, so it is read as
# text and converted with a single vectorized pd.to_datetime call. This avoids
# calling datetime.strptime once per row (slow on ~1.9M rows) and no longer
# relies on read_csv's `date_parser` argument, deprecated since pandas 2.0.
df_reward = pd.read_csv("DATA_ORIGINAL/REWARD_FACT.csv", dtype={'MONTH_ID': str})
df_reward['MONTH_ID'] = pd.to_datetime(df_reward['MONTH_ID'], format="%Y%m")

In [3]:
df_reward.head()

Unnamed: 0,MEMBER_ID,MONTH_ID,REWARDS_CATEGORY,REDEMPTIONS,NUMBER_ITEMS_REDEEMED,POINTS_REDEEMED
0,5885640,2020-02-01,Cash_Back,1,2,200
1,5885640,2019-04-01,Cash_Back,1,1,100
2,5885640,2020-07-01,Cash_Back,1,1,100
3,5889000,2020-09-01,Cash_Back,1,1,100
4,5889000,2021-03-01,Cash_Back,1,1,100


In [4]:
df_reward.shape

(1896115, 6)

In [5]:
# Number of rows that are exact copies of an earlier row.
df_reward.duplicated(keep='first').sum()

0

# Analyse exploratoire de la base de données

In [6]:
# Share of missing values in each column, expressed in percent.
df_reward.isna().mean().mul(100)

MEMBER_ID                0.0
MONTH_ID                 0.0
REWARDS_CATEGORY         0.0
REDEMPTIONS              0.0
NUMBER_ITEMS_REDEEMED    0.0
POINTS_REDEEMED          0.0
dtype: float64

In [7]:
# Data type of each column.
df_reward.dtypes

MEMBER_ID                         int64
MONTH_ID                 datetime64[ns]
REWARDS_CATEGORY                 object
REDEMPTIONS                       int64
NUMBER_ITEMS_REDEEMED             int64
POINTS_REDEEMED                   int64
dtype: object

In [8]:
# Display floats in fixed-point notation (no scientific notation) in outputs.
pd.options.display.float_format = '{:f}'.format

In [9]:
# Summary statistics of the numeric measures; the MEMBER_ID key is left out.
df_reward.drop(columns='MEMBER_ID').describe()

Unnamed: 0,REDEMPTIONS,NUMBER_ITEMS_REDEEMED,POINTS_REDEEMED
count,1896115.0,1896115.0,1896115.0
mean,1.209191,2.135465,378.649319
std,0.672475,3.602915,1104.658257
min,1.0,1.0,0.0
25%,1.0,1.0,100.0
50%,1.0,1.0,100.0
75%,1.0,2.0,200.0
max,62.0,818.0,107992.0


In [10]:
# Most frequent value of each numeric measure (first row of the mode table).
mode = df_reward.drop(columns='MEMBER_ID').mode(numeric_only=True).iloc[0]
mode

REDEMPTIONS                1
NUMBER_ITEMS_REDEEMED      1
POINTS_REDEEMED          100
Name: 0, dtype: int64

In [11]:
# Descriptive statistics for the string (object-dtype) columns.
df_reward.describe(include=[object])

Unnamed: 0,REWARDS_CATEGORY
count,1896115
unique,3
top,Cash_Back
freq,1791124


## <span style='background :yellow' > Variable MEMBER_ID </span>

In [12]:
# This table does not hold every member: only members who used their points
# at some time over a three-year period appear in it.
nb_member_uniques = df_reward['MEMBER_ID'].nunique()
print("On a de l'information sur:" , nb_member_uniques ,'clients')

On a de l'information sur: 399254 clients


## <span style='background :yellow' > Variable MONTH_ID </span>

In [13]:
 # Keep month granularity only: turn the timestamps into monthly periods.
 df_reward['MONTH_ID'] = df_reward['MONTH_ID'].dt.to_period(freq='M')

In [14]:
df_reward.head()

Unnamed: 0,MEMBER_ID,MONTH_ID,REWARDS_CATEGORY,REDEMPTIONS,NUMBER_ITEMS_REDEEMED,POINTS_REDEEMED
0,5885640,2020-02,Cash_Back,1,2,200
1,5885640,2019-04,Cash_Back,1,1,100
2,5885640,2020-07,Cash_Back,1,1,100
3,5889000,2020-09,Cash_Back,1,1,100
4,5889000,2021-03,Cash_Back,1,1,100


In [15]:
# Number of rows per month; dropna=False also counts missing MONTH_ID values.
df_reward["MONTH_ID"].value_counts(dropna=False)

2020-12    82957
2019-12    78804
2019-06    72518
2019-05    71108
2019-07    68763
2020-01    63241
2020-10    62716
2019-11    62566
2020-11    62170
2020-06    61538
2019-10    61493
2019-01    61445
2019-03    61236
2021-04    61224
2019-08    61019
2021-03    60750
2020-07    60602
2020-08    60584
2019-09    59893
2021-05    59623
2020-02    59085
2020-09    58681
2021-01    56861
2020-03    55293
2021-02    55229
2019-04    52297
2019-02    51267
2020-05    51246
2021-06    48581
2020-04    40630
2021-07    32172
2021-08    11303
2021-09    10313
2021-10     9708
2021-11     5201
2021-12     3998
Freq: M, Name: MONTH_ID, dtype: int64

## <span style='background :yellow' > Variable REWARDS_CATEGORY </span>

In [16]:
# Number of rows per reward category.
df_reward.REWARDS_CATEGORY.value_counts()

Cash_Back            1791124
REWARD_CATEGORY_2      92376
REWARD_CATEGORY_3      12615
Name: REWARDS_CATEGORY, dtype: int64

In [17]:
# Share of each reward category in percent. The variable does not vary enough,
# so it will not be used in the segmentation.
df_reward['REWARDS_CATEGORY'].value_counts().div(df_reward.shape[0]).mul(100)

Cash_Back           94.462836
REWARD_CATEGORY_2    4.871856
REWARD_CATEGORY_3    0.665308
Name: REWARDS_CATEGORY, dtype: float64

In [18]:
# Remove the reward category column from df_reward (in place).
df_reward.drop(columns='REWARDS_CATEGORY', inplace=True)

In [19]:
 #df_reward['REWARDS_CATEGORY'] = np.where(df_reward.REWARDS_CATEGORY != 'Cash_Back', 'REWARD_CATEGORIES', df_reward.REWARDS_CATEGORY)

In [20]:
 #df_reward.REWARDS_CATEGORY.value_counts() / df_reward.shape[0] * 100

## <span style='background :yellow' > Variable REDEMPTIONS </span>

Nombre de rédemptions pour le mois et la catégorie.

Ex. : le membre s'est présenté 5 fois à la caisse pour obtenir du Cash_Back.

In [21]:
# Distinct values taken by REDEMPTIONS.
df_reward.REDEMPTIONS.unique()

array([ 1,  2,  3,  4,  6,  5, 11,  8,  7, 18, 16,  9, 12, 10, 13, 15, 14,
       19, 38, 22, 48, 23, 25, 17, 20, 21, 31, 37, 28, 27, 32, 26, 30, 46,
       42, 45, 24, 62, 44, 43, 29, 52], dtype=int64)

## <span style='background :yellow' > Variable NUMBER_ITEMS_REDEEMED </span>


In [22]:
# Distinct values taken by NUMBER_ITEMS_REDEEMED.
df_reward.NUMBER_ITEMS_REDEEMED.unique()

array([  2,   1,   8,  12,  10,   6,  13,   4,   3,  81,  38,  11,  32,
        17,   7,   5,  26,  19,  18,  20,   9,  23,  16,  14,  22,  15,
        25,  67,  24,  21, 175,  27,  35,  29,  30,  33,  50,  45,  37,
        40,  48,  55,  58,  31,  56,  34, 132,  43,  44,  42,  51,  28,
       117,  39,  41, 158, 150,  64,  53,  46,  49,  63,  54,  89, 125,
        70,  66,  36,  59,  75,  57,  99,  76,  62,  92,  95,  80,  47,
       818, 102, 214,  97, 109, 114,  77,  94,  65,  74,  86,  83, 107,
        52,  88, 188,  60, 145,  79, 100, 110, 108, 124,  90,  61,  71,
        96,  68,  72, 162, 148, 103,  69, 101,  78, 154,  73,  91, 138,
       151, 122,  93, 218, 120, 167, 111, 205, 155,  85,  82, 166, 208,
       178, 320, 324, 253, 139, 340, 157, 292, 236, 163, 210, 186, 123,
       240, 386, 265, 246, 420, 232, 245, 247, 281, 171, 277, 190, 180,
       254, 297, 116, 177,  84, 112, 104, 134, 212,  87, 152, 130, 164,
       197, 165,  98, 156, 191, 133, 140, 274, 160, 161, 105, 12

## <span style='background :yellow' > Variable POINTS_REDEEMED </span>


In [23]:
# Smallest POINTS_REDEEMED value in the table.
df_reward.POINTS_REDEEMED.min()

0

In [24]:
# Largest POINTS_REDEEMED value in the table.
df_reward.POINTS_REDEEMED.max()

107992

In [25]:
# Redemption rows that carry no redeemed item or no redeemed points.
df_reward.loc[
    df_reward['NUMBER_ITEMS_REDEEMED'].eq(0) | df_reward['POINTS_REDEEMED'].eq(0)
]

Unnamed: 0,MEMBER_ID,MONTH_ID,REDEMPTIONS,NUMBER_ITEMS_REDEEMED,POINTS_REDEEMED
678007,1016014448,2020-01,1,2,0


In [26]:
# Index labels of the rows with zero items or zero points redeemed.
idx_to_remove = df_reward.loc[
    df_reward['NUMBER_ITEMS_REDEEMED'].eq(0) | df_reward['POINTS_REDEEMED'].eq(0)
].index

In [27]:
# Delete the flagged rows from df_reward (in place).
df_reward.drop(index=idx_to_remove, inplace=True)

# Création de variables


## <span style='background :yellow' > Variable PTS_RDMED_PCT_CHANGE_2021 </span>

Points Redeemed Change 2020/2021



In [28]:
# Extract the calendar year of each row. The vectorized `.dt.year` accessor
# gives the same integers as calling `int(x.year)` on every row through
# `.apply`, without a Python-level loop over the whole table.
df_reward['Year'] = df_reward['MONTH_ID'].dt.year

In [29]:
df_reward.head()

Unnamed: 0,MEMBER_ID,MONTH_ID,REDEMPTIONS,NUMBER_ITEMS_REDEEMED,POINTS_REDEEMED,Year
0,5885640,2020-02,1,2,200,2020
1,5885640,2019-04,1,1,100,2019
2,5885640,2020-07,1,1,100,2020
3,5889000,2020-09,1,1,100,2020
4,5889000,2021-03,1,1,100,2021


In [30]:
# Total points redeemed by each member in each calendar year.
df_reward_pct = (
    df_reward
    .groupby(['MEMBER_ID', 'Year'], as_index=False)
    .agg({'POINTS_REDEEMED': 'sum'})
)

In [31]:
df_reward_pct.head()

Unnamed: 0,MEMBER_ID,Year,POINTS_REDEEMED
0,170,2021,835
1,184,2019,1800
2,184,2020,1500
3,184,2021,700
4,186,2020,2251


In [32]:
# Year-over-year relative change of the points redeemed by each member.
#
# groupby().pct_change() compares a row with the member's previous *available*
# row, which is not necessarily the previous calendar year: a member active in
# 2019 and 2021 only would get a 2019 -> 2021 change reported as the 2021
# change. The change is therefore kept only when the previous row is exactly
# Year - 1; in every other case (first year of the member, or a gap year) the
# value is 0, which is the convention this notebook already uses when there is
# no baseline.
#
# Relies on rows being sorted by MEMBER_ID then Year, which is the order
# produced by the groupby that built df_reward_pct.
yearly_by_member = df_reward_pct.groupby('MEMBER_ID')
is_consecutive_year = yearly_by_member['Year'].diff().eq(1)
df_reward_pct['POINTS_REDEEMED_PCT_CHANGE'] = (
    yearly_by_member['POINTS_REDEEMED']
    .pct_change()
    .where(is_consecutive_year, 0.0)
)

In [33]:
df_reward_pct.head(10)

Unnamed: 0,MEMBER_ID,Year,POINTS_REDEEMED,POINTS_REDEEMED_PCT_CHANGE
0,170,2021,835,0.0
1,184,2019,1800,0.0
2,184,2020,1500,-0.166667
3,184,2021,700,-0.533333
4,186,2020,2251,0.0
5,186,2021,600,-0.733452
6,255,2019,200,0.0
7,255,2020,400,1.0
8,255,2021,100,-0.75
9,311,2019,100,0.0


In [34]:
df_reward_pct.shape

(776665, 4)

In [35]:
# Reshape to one row per member and one column per year.
# Members with no redeemed points in a given year have no source row, so the
# resulting NaN is replaced by 0.
# NOTE(review): after this fill, "no redemption that year" cannot be told apart
# from "no change" - confirm this is intended for the segmentation.
df_reward_pct_pivot = pd.pivot_table(
    df_reward_pct,
    values='POINTS_REDEEMED_PCT_CHANGE',
    index='MEMBER_ID',
    columns='Year',
).fillna(0)

In [36]:
df_reward_pct_pivot

Year,2019,2020,2021
MEMBER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
170,0.000000,0.000000,0.000000
184,0.000000,-0.166667,-0.533333
186,0.000000,0.000000,-0.733452
255,0.000000,1.000000,-0.750000
311,0.000000,1.000000,0.000000
...,...,...,...
1018321802,0.000000,0.000000,0.000000
1018362128,0.000000,0.000000,0.000000
1018370821,0.000000,0.000000,0.000000
1018371389,0.000000,0.000000,0.000000


In [37]:

# Prefix every year column with the name of the measure it holds.
df_reward_pct_pivot.columns = [
    f'POINTS_REDEEMED_PCT_CHANGE_{year}' for year in df_reward_pct_pivot.columns
]

In [38]:
# Turn the MEMBER_ID index back into a regular column.
df_reward_pct_pivot = df_reward_pct_pivot.reset_index()

In [39]:
df_reward_pct_pivot

Unnamed: 0,MEMBER_ID,POINTS_REDEEMED_PCT_CHANGE_2019,POINTS_REDEEMED_PCT_CHANGE_2020,POINTS_REDEEMED_PCT_CHANGE_2021
0,170,0.000000,0.000000,0.000000
1,184,0.000000,-0.166667,-0.533333
2,186,0.000000,0.000000,-0.733452
3,255,0.000000,1.000000,-0.750000
4,311,0.000000,1.000000,0.000000
...,...,...,...,...
399249,1018321802,0.000000,0.000000,0.000000
399250,1018362128,0.000000,0.000000,0.000000
399251,1018370821,0.000000,0.000000,0.000000
399252,1018371389,0.000000,0.000000,0.000000


In [40]:
# Keep only the member key and the 2021 change.
df_reward_pct_pivot = df_reward_pct_pivot[['MEMBER_ID', 'POINTS_REDEEMED_PCT_CHANGE_2021']]

# agregation df_reward

In [41]:
# Per-member totals of redemptions and redeemed items over the whole table.
df_reward_trimmed = (
    df_reward
    .groupby('MEMBER_ID', as_index=False)[['REDEMPTIONS', 'NUMBER_ITEMS_REDEEMED']]
    .sum()
)

In [42]:
df_reward_trimmed

Unnamed: 0,MEMBER_ID,REDEMPTIONS,NUMBER_ITEMS_REDEEMED
0,170,1,1
1,184,27,40
2,186,9,13
3,255,4,7
4,311,3,3
...,...,...,...
399249,1018321802,1,1
399250,1018362128,1,1
399251,1018370821,1,1
399252,1018371389,1,1


In [43]:
# Both per-member tables must describe exactly the same members, once each.
# Equal row counts alone do not guarantee it, and the inner join that follows
# would silently drop unmatched members or duplicate repeated ones.
assert df_reward_trimmed.shape[0] == df_reward_pct_pivot.shape[0], "row counts differ"
assert df_reward_trimmed['MEMBER_ID'].is_unique, "duplicate MEMBER_ID in df_reward_trimmed"
assert df_reward_pct_pivot['MEMBER_ID'].is_unique, "duplicate MEMBER_ID in df_reward_pct_pivot"
assert set(df_reward_trimmed['MEMBER_ID']) == set(df_reward_pct_pivot['MEMBER_ID']), "member sets differ"

# Jointure

In [44]:
# Per-member tables to join on MEMBER_ID, in join order.
df_to_be_merged = [df_reward_trimmed, df_reward_pct_pivot]

In [45]:
# Fold the list of tables into one frame by successive inner joins on MEMBER_ID.
df_reward_merged = reduce(
    lambda joined, table: joined.merge(table, how='inner', on=['MEMBER_ID']),
    df_to_be_merged,
)

In [46]:
df_reward_merged

Unnamed: 0,MEMBER_ID,REDEMPTIONS,NUMBER_ITEMS_REDEEMED,POINTS_REDEEMED_PCT_CHANGE_2021
0,170,1,1,0.000000
1,184,27,40,-0.533333
2,186,9,13,-0.733452
3,255,4,7,-0.750000
4,311,3,3,0.000000
...,...,...,...,...
399249,1018321802,1,1,0.000000
399250,1018362128,1,1,0.000000
399251,1018370821,1,1,0.000000
399252,1018371389,1,1,0.000000


In [47]:
df_reward_merged.shape

(399254, 4)

In [48]:
# Switch read by the export cell: the CSV is written only when this is True.
output_to_csv = True

In [49]:
# Write the per-member reward features to disk when the export flag is on.
if output_to_csv:
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs('DATA_PRETRAITEMENT', exist_ok=True)
    df_reward_merged.to_csv('DATA_PRETRAITEMENT/df_reward.csv', index=False)