In [10]:
from pandas import read_csv
from scripts.dataset_preprocessing import binning_date_by_period, label_encode_dataframe, drop_outliers

In [11]:
dataframe = read_csv('../data/raw/marketing_campaign.csv')
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

### UNDEBATTABLE ACTIONS
As we've seen with the exploratory anaylsis, some feature are useless because of the unique values, or useless in a model.  
The **Dt_Customer** is interesting to be plot, however in a model the multiple categories are irelevant.  
The others categoricals values will be either LabelEncode or OneHotEncode, we coul LabelEncode it in a first place.  
***

In [12]:
dataframe.drop(columns=['ID', 'Z_Revenue', 'Z_CostContact'], inplace=True)
dataframe['Dt_Customer'] = dataframe['Dt_Customer'].apply(binning_date_by_period, args=('Year',))

In [13]:
categorical_columns = ['Education', 'Marital_Status', 'Dt_Customer']
dataframe = label_encode_dataframe(dataframe, categorical_columns)

### DEBATTABLE ACTIONS  
The remaining clean to do is about : outliers & missing values.  
In our case, we'll create 3 datasets to 3 different handles : 
- Dropping missing values & dropping outliers
- Filling missing values with median & dropping outliers
- Filling missing values with median (with outliers)
***

In [14]:
outliers_columns = ['Year_Birth', 'Income', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumWebVisitsMonth']

In [15]:
#Drop NA & outliers
dopna_outliers_data = dataframe.copy()

dopna_outliers_data.dropna(inplace=True)
dopna_outliers_data = drop_outliers(dopna_outliers_data, outliers_columns, 90)

dopna_outliers_data.to_csv('../data/cleaned/marketing_campaign_dropna_no_outliers.csv')

In [16]:
#Fill NA & drop outliers
fillna_outliers_data = dataframe.copy()

fillna_outliers_data['Income'].fillna(fillna_outliers_data['Income'].median(), inplace=True)
fillna_outliers_data = drop_outliers(fillna_outliers_data, outliers_columns, 90)

fillna_outliers_data.to_csv('../data/cleaned/marketing_campaign_fillna_no_outliers.csv')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fillna_outliers_data['Income'].fillna(fillna_outliers_data['Income'].median(), inplace=True)


In [17]:
#Fill NA
fillna_data = dataframe.copy()

fillna_data['Income'].fillna(fillna_data['Income'].median(), inplace=True)

fillna_data.to_csv('../data/cleaned/marketing_campaign_fillna.csv')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fillna_data['Income'].fillna(fillna_data['Income'].median(), inplace=True)
