### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

### Importing data
And making first overview

In [2]:
raw_data = pd.read_csv('ml_project1_data.csv')
print(raw_data.info())
print(raw_data['Response'].value_counts())
raw_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
count,2240.0,2240.0,2216.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,...,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0
mean,5592.159821,1968.805804,52247.251354,0.444196,0.50625,49.109375,303.935714,26.302232,166.95,37.525446,...,5.316518,0.072768,0.074554,0.072768,0.064286,0.013393,0.009375,3.0,11.0,0.149107
std,3246.662198,11.984069,25173.076661,0.538398,0.544538,28.962453,336.597393,39.773434,225.715373,54.628979,...,2.426645,0.259813,0.262728,0.259813,0.245316,0.114976,0.096391,0.0,0.0,0.356274
min,0.0,1893.0,1730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
25%,2828.25,1959.0,35303.0,0.0,0.0,24.0,23.75,1.0,16.0,3.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
50%,5458.5,1970.0,51381.5,0.0,0.0,49.0,173.5,8.0,67.0,12.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
75%,8427.75,1977.0,68522.0,1.0,1.0,74.0,504.25,33.0,232.0,50.0,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
max,11191.0,1996.0,666666.0,2.0,2.0,99.0,1493.0,199.0,1725.0,259.0,...,20.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,11.0,1.0


### Transforming Categorical Features
There are two categorical features (Education and Marital Status) that must be transformed to feed the model. The choosen encoding was to transform each unique value of each feature in dummy features.

In [3]:
EducationEncoder = OneHotEncoder()
EducationEncoder.fit(np.array(raw_data['Education']).reshape(-1, 1))
education = pd.DataFrame(EducationEncoder
                         .transform(np.array(raw_data['Education'])
                                    .reshape(-1, 1))
                         .toarray().astype('int64'),
                         columns=EducationEncoder.categories_[0])
raw_data = raw_data.join(education)
MaritalStatusEncoder = OneHotEncoder()
MaritalStatusEncoder.fit(np.array(raw_data['Marital_Status']).reshape(-1, 1))
marital_status = pd.DataFrame(MaritalStatusEncoder
                             .transform(np.array(raw_data['Marital_Status'])
                                       .reshape(-1, 1))
                             .toarray().astype('int64'),
                             columns=MaritalStatusEncoder.categories_[0])
raw_data = raw_data.join(marital_status)
print(raw_data.info())
raw_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 42 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,Master,PhD,Absurd,Alone,Divorced,Married,Single,Together,Widow,YOLO
count,2240.0,2240.0,2216.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,...,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0
mean,5592.159821,1968.805804,52247.251354,0.444196,0.50625,49.109375,303.935714,26.302232,166.95,37.525446,...,0.165179,0.216964,0.000893,0.001339,0.103571,0.385714,0.214286,0.258929,0.034375,0.000893
std,3246.662198,11.984069,25173.076661,0.538398,0.544538,28.962453,336.597393,39.773434,225.715373,54.628979,...,0.371425,0.41227,0.029874,0.03658,0.304772,0.486872,0.410418,0.438144,0.182231,0.029874
min,0.0,1893.0,1730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2828.25,1959.0,35303.0,0.0,0.0,24.0,23.75,1.0,16.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5458.5,1970.0,51381.5,0.0,0.0,49.0,173.5,8.0,67.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8427.75,1977.0,68522.0,1.0,1.0,74.0,504.25,33.0,232.0,50.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
max,11191.0,1996.0,666666.0,2.0,2.0,99.0,1493.0,199.0,1725.0,259.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### More Transformation and Cleaning Data
'Dt_Customer' has been transformed into int type to feed the model.
Also, all missing values and 'ID' column were removed, because they don't contribute to the model.

In [4]:
raw_data['Dt_Customer'] = pd.to_datetime(raw_data['Dt_Customer']).astype(np.int64)
raw_data.dropna(inplace=True)
raw_data.drop('ID', axis=1, inplace=True)
print(raw_data.info())
print(raw_data['Response'].value_counts())
raw_data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2216 entries, 0 to 2239
Data columns (total 41 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year_Birth           2216 non-null   int64  
 1   Education            2216 non-null   object 
 2   Marital_Status       2216 non-null   object 
 3   Income               2216 non-null   float64
 4   Kidhome              2216 non-null   int64  
 5   Teenhome             2216 non-null   int64  
 6   Dt_Customer          2216 non-null   int64  
 7   Recency              2216 non-null   int64  
 8   MntWines             2216 non-null   int64  
 9   MntFruits            2216 non-null   int64  
 10  MntMeatProducts      2216 non-null   int64  
 11  MntFishProducts      2216 non-null   int64  
 12  MntSweetProducts     2216 non-null   int64  
 13  MntGoldProds         2216 non-null   int64  
 14  NumDealsPurchases    2216 non-null   int64  
 15  NumWebPurchases      2216 non-null   i

Unnamed: 0,Year_Birth,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,Master,PhD,Absurd,Alone,Divorced,Married,Single,Together,Widow,YOLO
count,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,...,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0
mean,1968.820397,52247.251354,0.441787,0.505415,1.373456e+18,49.012635,305.091606,26.356047,166.995939,37.637635,...,0.164711,0.217058,0.000903,0.001354,0.104693,0.386733,0.212545,0.258574,0.034296,0.000903
std,11.985554,25173.076661,0.536896,0.544181,1.749036e+16,28.948352,337.32792,39.793917,224.283273,54.752082,...,0.371003,0.412335,0.030035,0.036777,0.306227,0.487112,0.409201,0.43795,0.18203,0.030035
min,1893.0,1730.0,0.0,0.0,1.343606e+18,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1959.0,35303.0,0.0,0.0,1.358294e+18,24.0,24.0,2.0,16.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1970.0,51381.5,0.0,0.0,1.373285e+18,49.0,174.5,8.0,68.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1977.0,68522.0,1.0,1.0,1.388448e+18,74.0,505.0,33.0,232.25,50.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
max,1996.0,666666.0,2.0,2.0,1.404e+18,99.0,1493.0,199.0,1725.0,259.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Removing outliers
Yet, as part of data cleaning process, some features with extreme values were identified and corresponding rows were removed.

In [5]:
cols_outliers = ['Year_Birth', 'Income', 'Dt_Customer', 'MntWines',
                 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
                 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases',
                 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
                 'NumWebVisitsMonth']
scaler = StandardScaler()
for col in cols_outliers:
    column = pd.DataFrame(raw_data[col], index=raw_data.index)
    column_std = scaler.fit_transform(column)
    column[np.abs(column_std) >= 3]
    drop_indexes = column[np.abs(column_std) >= 3].index
    raw_data.drop(index=drop_indexes, inplace=True)
print(raw_data.info())
print(raw_data['Response'].value_counts())
raw_data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1838 entries, 1 to 2239
Data columns (total 41 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year_Birth           1838 non-null   int64  
 1   Education            1838 non-null   object 
 2   Marital_Status       1838 non-null   object 
 3   Income               1838 non-null   float64
 4   Kidhome              1838 non-null   int64  
 5   Teenhome             1838 non-null   int64  
 6   Dt_Customer          1838 non-null   int64  
 7   Recency              1838 non-null   int64  
 8   MntWines             1838 non-null   int64  
 9   MntFruits            1838 non-null   int64  
 10  MntMeatProducts      1838 non-null   int64  
 11  MntFishProducts      1838 non-null   int64  
 12  MntSweetProducts     1838 non-null   int64  
 13  MntGoldProds         1838 non-null   int64  
 14  NumDealsPurchases    1838 non-null   int64  
 15  NumWebPurchases      1838 non-null   i

Unnamed: 0,Year_Birth,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,Master,PhD,Absurd,Alone,Divorced,Married,Single,Together,Widow,YOLO
count,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,...,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0,1838.0
mean,1969.068009,47645.047334,0.508705,0.541893,1.374104e+18,48.923286,252.793798,17.602285,112.737214,25.192057,...,0.173014,0.223069,0.0,0.001632,0.102829,0.387378,0.211099,0.26333,0.032644,0.001088
std,11.474867,18888.351296,0.547852,0.543319,1.725666e+16,28.821227,311.159633,27.368474,156.851278,37.992315,...,0.378362,0.416417,0.0,0.040379,0.303818,0.487284,0.4082,0.44056,0.177752,0.032978
min,1940.0,7500.0,0.0,0.0,1.343606e+18,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1960.0,33238.5,0.0,0.0,1.35959e+18,24.0,18.0,1.0,13.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1970.0,46270.5,0.0,1.0,1.374365e+18,49.0,105.0,6.0,44.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1977.0,62510.5,1.0,1.0,1.388858e+18,74.0,398.75,21.0,137.0,29.75,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
max,1996.0,101970.0,2.0,2.0,1.404e+18,99.0,1315.0,142.0,792.0,184.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Oversampling
As target column 'Response' filled as 1, that means "accepted the offer", represents less than 15% of total responses, there are need to use an oversampling technique.

In [6]:
count_response_0, count_response_1 = raw_data['Response'].value_counts()
data_resp_0 = raw_data[raw_data['Response'] == 0]
data_resp_1 = raw_data[raw_data['Response'] == 1]
model_data = pd.concat([data_resp_0, resample(data_resp_1, n_samples=count_response_0)])
print(model_data.info())
model_data.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3244 entries, 1 to 264
Data columns (total 41 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Year_Birth           3244 non-null   int64  
 1   Education            3244 non-null   object 
 2   Marital_Status       3244 non-null   object 
 3   Income               3244 non-null   float64
 4   Kidhome              3244 non-null   int64  
 5   Teenhome             3244 non-null   int64  
 6   Dt_Customer          3244 non-null   int64  
 7   Recency              3244 non-null   int64  
 8   MntWines             3244 non-null   int64  
 9   MntFruits            3244 non-null   int64  
 10  MntMeatProducts      3244 non-null   int64  
 11  MntFishProducts      3244 non-null   int64  
 12  MntSweetProducts     3244 non-null   int64  
 13  MntGoldProds         3244 non-null   int64  
 14  NumDealsPurchases    3244 non-null   int64  
 15  NumWebPurchases      3244 non-null   in

Unnamed: 0,Year_Birth,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,Master,PhD,Absurd,Alone,Divorced,Married,Single,Together,Widow,YOLO
count,3244.0,3244.0,3244.0,3244.0,3244.0,3244.0,3244.0,3244.0,3244.0,3244.0,...,3244.0,3244.0,3244.0,3244.0,3244.0,3244.0,3244.0,3244.0,3244.0,3244.0
mean,1969.272195,49066.187731,0.491677,0.463009,1.369968e+18,41.976264,312.786683,19.343711,143.306104,28.365598,...,0.174784,0.26603,0.0,0.002466,0.117139,0.343403,0.258015,0.237361,0.038224,0.003391
std,11.561105,20422.802329,0.534581,0.532207,1.774962e+16,29.181734,365.629634,27.638897,186.362692,40.729928,...,0.379841,0.441948,0.0,0.049606,0.321636,0.474918,0.43761,0.425531,0.191767,0.058141
min,1940.0,7500.0,0.0,0.0,1.343606e+18,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1960.0,33569.0,0.0,0.0,1.35311e+18,15.0,19.0,2.0,16.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1971.0,46854.0,0.0,0.0,1.36823e+18,38.0,154.0,7.0,57.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1978.0,65220.0,1.0,1.0,1.384387e+18,67.0,517.5,25.0,199.0,37.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
max,1996.0,101970.0,2.0,2.0,1.404e+18,99.0,1315.0,142.0,792.0,184.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Creating Model

In [7]:
X = model_data.select_dtypes(exclude=['object']).drop('Response', axis=1)
y = model_data['Response']
Xtr, Xval, ytr, yval = train_test_split(X, y, test_size=0.5, random_state=0)

In [8]:
forest = RandomForestClassifier(n_estimators=1200, min_samples_leaf=25, random_state=0)
forest.fit(Xtr, ytr)
pred = forest.predict(Xval)
np.sqrt(mean_squared_error(yval, pred))

0.3555097415339333

### Evaluating Model

In [9]:
print(classification_report(yval, pred))
cf_matrix = pd.DataFrame(confusion_matrix(yval, pred, normalize='all'))
cf_matrix

              precision    recall  f1-score   support

           0       0.94      0.81      0.87       834
           1       0.82      0.95      0.88       788

    accuracy                           0.87      1622
   macro avg       0.88      0.88      0.87      1622
weighted avg       0.88      0.87      0.87      1622



Unnamed: 0,0,1
0,0.414303,0.099877
1,0.02651,0.459309


In [10]:
feature_importances = pd.DataFrame(forest.feature_importances_.reshape(-1, 1),
                                    index=forest.feature_names_in_,
                                    columns=['Feature_Importances'])
sorted_feature_importances = feature_importances.sort_values('Feature_Importances',
                                                              ascending=False)
sorted_feature_importances

Unnamed: 0,Feature_Importances
Recency,0.16686
Dt_Customer,0.138507
MntGoldProds,0.069289
MntWines,0.064625
NumCatalogPurchases,0.061535
MntMeatProducts,0.05975
AcceptedCmp3,0.058714
NumWebVisitsMonth,0.056484
Income,0.045732
NumStorePurchases,0.043862


### Final Considerations
The model reached the accuracy of 87%. In terms of precision, it was able to correcly classify 94% of customers that didn't accepted offers and those that accepted with precision of 82%.

Also, it's important for the model to minimize false negatives of offers accepted (i.e. classified as not accepted), because the objective is that the campaign reach as most as possible customers that are similar to those in the prediction model. The recall of accepted offers is 95%, which points to the correct direction.