# Data Processing: Missing Data and Class Imbalance

_Abdurrahman Dilmac, Ugur Ali Kaplan_

12nd February 2022

In [1]:
from copy import deepcopy
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

You can download the dataset here: [https://www.kaggle.com/jackdaoud/marketing-data](https://www.kaggle.com/jackdaoud/marketing-data)

In [2]:
data = pd.read_csv('marketing_data.csv')
y = np.array(data['Response']).astype(np.float16)

x = data.drop(['Response'], axis=1)
x = x.drop(['ID'], axis=1)

In [3]:
x.shape

(2240, 26)

Let's view the number of missing values.

In [4]:
x.isna().sum()

Year_Birth              0
Education               0
Marital_Status          0
 Income                24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Country                 0
dtype: int64

In [5]:
def remove_value(x):
    # Randomly remove values from the data
    a, b = x.shape
    n_missing = int(a * b * 0.1)
    print(f'Removing {n_missing} values randomly.')
    for i in range(n_missing):
        coor = (np.random.rand(2) * [*x.shape]).astype(np.int16)
        coor = np.clip(coor, a_min=0, a_max=max(x.shape) - 1)
        x.iloc[coor[0], coor[1]] = np.nan
    return x

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [7]:
x_train_orig = deepcopy(x_train)
x_test_orig = deepcopy(x_test)

We remove some of the values randomly.

In [8]:
x_train = remove_value(x_train)
x_test = remove_value(x_test)

Removing 4659 values randomly.
Removing 1164 values randomly.


Now we should have more missing values.

In [9]:
x_train.isna().sum()

Year_Birth             191
Education              175
Marital_Status         165
 Income                196
Kidhome                169
Teenhome               171
Dt_Customer            172
Recency                164
MntWines               172
MntFruits              171
MntMeatProducts        157
MntFishProducts        164
MntSweetProducts       167
MntGoldProds           163
NumDealsPurchases      158
NumWebPurchases        174
NumCatalogPurchases    165
NumStorePurchases      190
NumWebVisitsMonth      185
AcceptedCmp3           169
AcceptedCmp4           168
AcceptedCmp5           169
AcceptedCmp1           152
AcceptedCmp2           179
Complain               182
Country                154
dtype: int64

In [10]:
x_test.isna().sum()

Year_Birth             44
Education              45
Marital_Status         45
 Income                48
Kidhome                48
Teenhome               48
Dt_Customer            38
Recency                47
MntWines               43
MntFruits              34
MntMeatProducts        36
MntFishProducts        39
MntSweetProducts       40
MntGoldProds           45
NumDealsPurchases      43
NumWebPurchases        34
NumCatalogPurchases    49
NumStorePurchases      38
NumWebVisitsMonth      51
AcceptedCmp3           42
AcceptedCmp4           50
AcceptedCmp5           40
AcceptedCmp1           38
AcceptedCmp2           38
Complain               56
Country                38
dtype: int64

We summarize the dataset.

In [11]:
pos_res = y_train[y_train == 1].shape[0]
neg_res = y_train[y_train == 0].shape[0]
print(f'Number of positive responses: {pos_res}')
print(f'Number of negative responses: {neg_res}')
print(f'Accuracy if the model always predicts positive: {100 * pos_res/(pos_res+neg_res)}%')
print(f'Accuracy if the model always predicts negative: {100 * neg_res/(pos_res+neg_res)}%')

Number of positive responses: 267
Number of negative responses: 1525
Accuracy if the model always predicts positive: 14.899553571428571%
Accuracy if the model always predicts negative: 85.10044642857143%


In [12]:
x_train.isna().sum()

Year_Birth             191
Education              175
Marital_Status         165
 Income                196
Kidhome                169
Teenhome               171
Dt_Customer            172
Recency                164
MntWines               172
MntFruits              171
MntMeatProducts        157
MntFishProducts        164
MntSweetProducts       167
MntGoldProds           163
NumDealsPurchases      158
NumWebPurchases        174
NumCatalogPurchases    165
NumStorePurchases      190
NumWebVisitsMonth      185
AcceptedCmp3           169
AcceptedCmp4           168
AcceptedCmp5           169
AcceptedCmp1           152
AcceptedCmp2           179
Complain               182
Country                154
dtype: int64

In [13]:
x_train.dtypes

Year_Birth             float64
Education               object
Marital_Status          object
 Income                 object
Kidhome                float64
Teenhome               float64
Dt_Customer             object
Recency                float64
MntWines               float64
MntFruits              float64
MntMeatProducts        float64
MntFishProducts        float64
MntSweetProducts       float64
MntGoldProds           float64
NumDealsPurchases      float64
NumWebPurchases        float64
NumCatalogPurchases    float64
NumStorePurchases      float64
NumWebVisitsMonth      float64
AcceptedCmp3           float64
AcceptedCmp4           float64
AcceptedCmp5           float64
AcceptedCmp1           float64
AcceptedCmp2           float64
Complain               float64
Country                 object
dtype: object

In [14]:
x_train.columns[x_train.dtypes == object]

Index(['Education', 'Marital_Status', ' Income ', 'Dt_Customer', 'Country'], dtype='object')

In [15]:
x_train[x_train.columns[x_train.dtypes == object]].nunique()

Education            5
Marital_Status       8
 Income           1466
Dt_Customer        613
Country              8
dtype: int64

We further structure non-number features.

In [16]:
def ordinal_encode(X, cats):
    X = deepcopy(X)
    for cat in cats:
        X[cat][X[cat].isna()] = 'nan'
        
    enc=OrdinalEncoder()
    enc.fit(X[cats])
    
    transformed = enc.transform(X[cats])
    
    for i in range(len(cats)):
        nan_pos = np.where(enc.categories_[i] == 'nan')
        transformed[[np.where(transformed[:, i] == nan_pos)], i] = np.nan
        
    return transformed, enc

In [None]:
cats = ['Education', 'Marital_Status', 'Country']
a, b = ordinal_encode(x_train, cats)
for i, cat in enumerate(cats):
    x_train[cat] = a[:, i]

In [None]:
for cat in cats:
    x_test[cat][x_test[cat].isna()] = 'nan'
c = b.transform(x_test[cats])

for i in range(len(cats)):
    nan_pos = np.where(b.categories_[i] == 'nan')
    c[[np.where(c[:, i] == nan_pos)], i] = np.nan

for i, cat in enumerate(cats):
    x_test[cat] = c[:, i]

In [None]:
for cat in cats:
    x_train_orig[cat][x_train_orig[cat].isna()] = 'nan'
d = b.transform(x_train_orig[cats])

for i in range(len(cats)):
    nan_pos = np.where(b.categories_[i] == 'nan')
    d[[np.where(d[:, i] == nan_pos)], i] = np.nan

for i, cat in enumerate(cats):
    x_train_orig[cat] = d[:, i]

In [20]:
x_train

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Country
1293,1949.0,,,"$80,360.00",,0.0,3/3/13,56.0,1493.0,86.0,...,4.0,5.0,,0.0,1.0,1.0,1.0,0.0,0.0,
279,1975.0,,4.0,"$33,249.00",1.0,,,11.0,6.0,10.0,...,1.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,,0.0
2158,1975.0,2.0,4.0,,0.0,1.0,4/29/14,96.0,143.0,6.0,...,1.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
569,,4.0,5.0,"$46,015.00",1.0,1.0,4/13/14,25.0,38.0,0.0,...,,3.0,7.0,1.0,0.0,0.0,,1.0,0.0,5.0
806,1969.0,3.0,5.0,"$44,602.00",1.0,1.0,2/15/14,35.0,167.0,2.0,...,1.0,4.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
741,1958.0,2.0,4.0,"$68,281.00",0.0,,8/7/12,31.0,995.0,112.0,...,9.0,13.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
1275,1984.0,,,"$73,356.00",0.0,0.0,2/6/14,56.0,1050.0,14.0,...,11.0,,2.0,1.0,0.0,1.0,1.0,0.0,0.0,6.0
87,1973.0,2.0,3.0,"$24,639.00",1.0,,1/28/14,3.0,20.0,3.0,...,,4.0,6.0,0.0,0.0,,,,0.0,0.0
181,1955.0,2.0,5.0,"$42,586.00",1.0,1.0,10/29/12,7.0,194.0,2.0,...,1.0,6.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [21]:
x_test

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Country
1303,,,,"$76,140.00",0.0,0.0,5/13/14,57.0,586.0,66.0,...,9.0,6.0,2.0,0.0,,0.0,0.0,0.0,0.0,
224,1971.0,3.0,4.0,"$39,763.00",1.0,0.0,8/4/13,,80.0,1.0,...,1.0,2.0,9.0,,0.0,,0.0,0.0,0.0,6.0
1155,1962.0,4.0,5.0,,0.0,1.0,7/20/13,51.0,517.0,12.0,...,4.0,7.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
355,1983.0,3.0,4.0,"$31,788.00",1.0,0.0,3/20/14,,16.0,7.0,...,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0
2122,1961.0,3.0,3.0,,0.0,,7/26/13,94.0,92.0,4.0,...,,4.0,3.0,0.0,0.0,0.0,0.0,0.0,,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1511,1984.0,2.0,3.0,"$72,570.00",,,4/25/14,67.0,274.0,83.0,...,6.0,12.0,1.0,0.0,0.0,0.0,,0.0,,6.0
1397,1979.0,2.0,3.0,"$7,500.00",0.0,1.0,8/29/12,,,2.0,...,0.0,2.0,8.0,0.0,0.0,0.0,,0.0,0.0,6.0
1251,1960.0,2.0,,"$29,315.00",1.0,1.0,4/6/14,,13.0,2.0,...,0.0,4.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
95,1962.0,3.0,3.0,"$58,646.00",0.0,1.0,6/10/13,3.0,62.0,1.0,...,1.0,4.0,,0.0,,0.0,0.0,0.0,0.0,6.0


In [22]:
# x_train = pd.get_dummies(x_train, columns=['Education', 'Marital_Status', 'Country'], dummy_na=True)
# x_test = pd.get_dummies(x_test, columns=['Education', 'Marital_Status', 'Country'], dummy_na=True)

In [23]:
# x_train_orig = pd.get_dummies(x_train_orig, columns=['Education', 'Marital_Status', 'Country'], dummy_na=True)

In [None]:
incomes = [float(inc.lstrip('$').replace(',', '')) for inc in x_train[' Income '] if isinstance(inc, str)]
x_train[' Income '][x_train[' Income '].apply(lambda x: isinstance(x, str))] = incomes
x_train[' Income '] = x_train[' Income '].astype(np.float64)

In [None]:
incomes = [float(inc.lstrip('$').replace(',', '')) for inc in x_train_orig[' Income '] if isinstance(inc, str)]
x_train_orig[' Income '][x_train_orig[' Income '].apply(lambda x: isinstance(x, str))] = incomes
x_train_orig[' Income '] = x_train_orig[' Income '].astype(np.float64)

In [26]:
x_train

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Country
1293,1949.0,,,80360.0,,0.0,3/3/13,56.0,1493.0,86.0,...,4.0,5.0,,0.0,1.0,1.0,1.0,0.0,0.0,
279,1975.0,,4.0,33249.0,1.0,,,11.0,6.0,10.0,...,1.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,,0.0
2158,1975.0,2.0,4.0,,0.0,1.0,4/29/14,96.0,143.0,6.0,...,1.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
569,,4.0,5.0,46015.0,1.0,1.0,4/13/14,25.0,38.0,0.0,...,,3.0,7.0,1.0,0.0,0.0,,1.0,0.0,5.0
806,1969.0,3.0,5.0,44602.0,1.0,1.0,2/15/14,35.0,167.0,2.0,...,1.0,4.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
741,1958.0,2.0,4.0,68281.0,0.0,,8/7/12,31.0,995.0,112.0,...,9.0,13.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
1275,1984.0,,,73356.0,0.0,0.0,2/6/14,56.0,1050.0,14.0,...,11.0,,2.0,1.0,0.0,1.0,1.0,0.0,0.0,6.0
87,1973.0,2.0,3.0,24639.0,1.0,,1/28/14,3.0,20.0,3.0,...,,4.0,6.0,0.0,0.0,,,,0.0,0.0
181,1955.0,2.0,5.0,42586.0,1.0,1.0,10/29/12,7.0,194.0,2.0,...,1.0,6.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [None]:
incomes = [float(inc.lstrip('$').replace(',', '')) for inc in x_test[' Income '] if isinstance(inc, str)]
x_test[' Income '][x_test[' Income '].apply(lambda x: isinstance(x, str))] = incomes
x_test[' Income '] = x_test[' Income '].astype(np.float64)

In [28]:
x_train['Dt_Customer'] = (datetime.today() - pd.to_datetime(x_train['Dt_Customer'])).dt.days
x_test['Dt_Customer'] = (datetime.today() - pd.to_datetime(x_test['Dt_Customer'])).dt.days
x_train_orig['Dt_Customer'] = (datetime.today() - pd.to_datetime(x_train_orig['Dt_Customer'])).dt.days

In [29]:
x_train.insert(2, 'Age', [2021 - birth for birth in x_train['Year_Birth']])
x_test.insert(2, 'Age', [2021 - birth for birth in x_test['Year_Birth']])
x_train_orig.insert(2, 'Age', [2021 - birth for birth in x_train_orig['Year_Birth']])

In [30]:
x_train.dtypes

Year_Birth             float64
Education              float64
Age                    float64
Marital_Status         float64
 Income                float64
Kidhome                float64
Teenhome               float64
Dt_Customer            float64
Recency                float64
MntWines               float64
MntFruits              float64
MntMeatProducts        float64
MntFishProducts        float64
MntSweetProducts       float64
MntGoldProds           float64
NumDealsPurchases      float64
NumWebPurchases        float64
NumCatalogPurchases    float64
NumStorePurchases      float64
NumWebVisitsMonth      float64
AcceptedCmp3           float64
AcceptedCmp4           float64
AcceptedCmp5           float64
AcceptedCmp1           float64
AcceptedCmp2           float64
Complain               float64
Country                float64
dtype: object

## Correlation

In [31]:
corr_mat = x_train.corr()

In [32]:
x_train.corrwith(pd.DataFrame(y_train)[0])

Year_Birth            -0.035404
Education              0.013791
Age                    0.035404
Marital_Status        -0.015584
 Income                0.002959
Kidhome               -0.050882
Teenhome              -0.007032
Dt_Customer            0.070882
Recency               -0.017832
MntWines              -0.008323
MntFruits             -0.012808
MntMeatProducts        0.016435
MntFishProducts       -0.001491
MntSweetProducts       0.026261
MntGoldProds           0.020836
NumDealsPurchases      0.057905
NumWebPurchases        0.012277
NumCatalogPurchases    0.010451
NumStorePurchases      0.043155
NumWebVisitsMonth      0.036328
AcceptedCmp3           0.008817
AcceptedCmp4          -0.011613
AcceptedCmp5          -0.032335
AcceptedCmp1          -0.041008
AcceptedCmp2          -0.026553
Complain              -0.039339
Country               -0.024016
dtype: float64

In [33]:
corr_mat.to_csv('corr_mat.csv')

In [34]:
corr_mat

Unnamed: 0,Year_Birth,Education,Age,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Country
Year_Birth,1.0,-0.176863,-1.0,-0.051281,-0.12673,0.215013,-0.360391,0.031629,-0.007086,-0.162658,...,-0.106146,-0.108172,0.093504,0.043495,-0.046606,0.042106,-0.012246,-0.000179,-0.037502,0.005047
Education,-0.176863,1.0,0.176863,0.034848,0.115282,-0.060492,0.108965,-0.053744,-0.030358,0.211177,...,0.073783,0.076226,-0.024908,0.014636,0.047811,0.034282,-0.029352,0.02372,-0.04226,0.028265
Age,-1.0,0.176863,1.0,0.051281,0.12673,-0.215013,0.360391,-0.031629,0.007086,0.162658,...,0.106146,0.108172,-0.093504,-0.043495,0.046606,-0.042106,0.012246,0.000179,0.037502,-0.005047
Marital_Status,-0.051281,0.034848,0.051281,1.0,0.02535,-0.010457,-0.003076,-0.010124,0.013465,-0.004596,...,0.00286,-0.007261,-0.022908,-0.031109,0.018303,0.023801,-0.02401,0.003478,0.011888,0.044105
Income,-0.12673,0.115282,0.12673,0.02535,1.0,-0.417667,0.010813,-0.006122,0.006411,0.530603,...,0.557657,0.479938,-0.511633,-0.002759,0.176028,0.316617,0.317158,0.077148,-0.013661,0.005364
Kidhome,0.215013,-0.060492,-0.215013,-0.010457,-0.417667,1.0,-0.044506,-0.07937,-0.009449,-0.5116,...,-0.513628,-0.501375,0.45113,0.015781,-0.153232,-0.2126,-0.170369,-0.066651,0.016543,-0.012844
Teenhome,-0.360391,0.108965,0.360391,-0.003076,0.010813,-0.044506,1.0,0.007664,-0.012721,0.009222,...,-0.119585,0.058296,0.134151,-0.043249,0.040015,-0.194647,-0.138322,-0.00961,0.032855,-0.019975
Dt_Customer,0.031629,-0.053744,-0.031629,-0.010124,-0.006122,-0.07937,0.007664,1.0,0.023127,0.190167,...,0.127798,0.119991,0.263739,0.020001,0.010071,-0.004074,-0.057585,-0.025132,0.016742,-0.002021
Recency,-0.007086,-0.030358,0.007086,0.013465,0.006411,-0.009449,-0.012721,0.023127,1.0,0.03204,...,0.0442,0.023343,-0.044053,-0.034509,0.010391,0.026529,-0.002873,-0.001235,0.024344,0.057321
MntWines,-0.162658,0.211177,0.162658,-0.004596,0.530603,-0.5116,0.009222,0.190167,0.03204,1.0,...,0.650137,0.632055,-0.310299,0.073206,0.360778,0.470293,0.358638,0.174517,-0.033626,0.033637


In [35]:
corr_mat['Age'][np.abs(corr_mat['Age']) > 0.5]

Year_Birth   -1.0
Age           1.0
Name: Age, dtype: float64

## Scaling

In general, models behave heavily biased on unscaled datasets. Therefore, it is better to scale the data.

In [37]:
sc = StandardScaler()
x_train_orig = pd.DataFrame(sc.fit_transform(x_train_orig), columns=x_train.columns)
x_train = pd.DataFrame(sc.fit_transform(x_train), columns=x_train.columns)
x_test = pd.DataFrame(sc.transform(x_test), columns=x_test.columns)

## Filling Missing Data

Mainly, there are two types of techniques:

- Univariate methods: By considering only that specific feature.
  - Mean
  - Median
  - Mod
  - Fix value
- Multivariate methods: By considering not only one feature but also a mix of other features
  - Closest Neighbor
  - Train a model for the missing data

We will first demonstrate univariate methods.

### Univariate Methods: Mean, Median, Mod, Fixed Value

In [38]:
from sklearn.impute import SimpleImputer

In [39]:
imputers = []

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputers.append(imp_mean)

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
imputers.append(imp_median)

imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputers.append(imp_mode)

imp_constant = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
imputers.append(imp_constant)

In [40]:
for imp in imputers:
    imp.fit(x_train)

In [41]:
x_train.mode().iloc[0]

Year_Birth             0.590034
Education             -0.361766
Age                   -0.755965
Marital_Status        -0.687968
 Income               -1.688133
Kidhome               -0.827691
Teenhome              -0.920246
Dt_Customer            1.543991
Recency                0.007051
MntWines              -0.889135
MntFruits             -0.659697
MntMeatProducts       -0.719915
MntFishProducts       -0.688618
MntSweetProducts      -0.657474
MntGoldProds          -0.832611
NumDealsPurchases     -0.695051
NumWebPurchases       -0.745952
NumCatalogPurchases   -0.919534
NumStorePurchases     -0.840549
NumWebVisitsMonth      0.682150
AcceptedCmp3          -0.291364
AcceptedCmp4          -0.281193
AcceptedCmp5          -0.288868
AcceptedCmp1          -0.261541
AcceptedCmp2          -0.114852
Complain              -0.096976
Country                0.714810
Name: 0, dtype: float64

In [42]:
print(x_train.mean())

Year_Birth             5.270265e-18
Education              1.116406e-16
Age                   -4.992883e-18
Marital_Status        -1.106126e-17
 Income                7.443225e-18
Kidhome               -3.066623e-16
Teenhome              -5.924386e-17
Dt_Customer            3.077100e-17
Recency               -3.587084e-17
MntWines              -1.096517e-17
MntFruits              4.314870e-18
MntMeatProducts       -3.205048e-17
MntFishProducts       -2.264091e-17
MntSweetProducts      -3.142785e-18
MntGoldProds          -7.053903e-18
NumDealsPurchases      4.396048e-17
NumWebPurchases       -2.466095e-16
NumCatalogPurchases   -1.080540e-16
NumStorePurchases      1.268232e-17
NumWebVisitsMonth     -1.293303e-16
AcceptedCmp3           1.069864e-16
AcceptedCmp4          -1.434950e-16
AcceptedCmp5           1.452251e-16
AcceptedCmp1           5.569393e-16
AcceptedCmp2          -1.314389e-15
Complain              -3.420314e-16
Country                5.219472e-16
dtype: float64


In [43]:
print(x_train.median())

Year_Birth             0.092241
Education             -0.361766
Age                   -0.092241
Marital_Status         0.238822
 Income               -0.033680
Kidhome               -0.827691
Teenhome              -0.920246
Dt_Customer            0.016422
Recency                0.007051
MntWines              -0.381358
MntFruits             -0.457892
MntMeatProducts       -0.443669
MntFishProducts       -0.466338
MntSweetProducts      -0.461389
MntGoldProds          -0.375552
NumDealsPurchases     -0.162353
NumWebPurchases       -0.029236
NumCatalogPurchases   -0.229459
NumStorePurchases     -0.217718
NumWebVisitsMonth      0.269263
AcceptedCmp3          -0.291364
AcceptedCmp4          -0.281193
AcceptedCmp5          -0.288868
AcceptedCmp1          -0.261541
AcceptedCmp2          -0.114852
Complain              -0.096976
Country                0.714810
dtype: float64


In [44]:
x_mean_train = pd.DataFrame(imp_mean.transform(x_train), columns=x_train.columns)
x_median_train = pd.DataFrame(imp_median.transform(x_train), columns=x_train.columns)
x_mode_train = pd.DataFrame(imp_mode.transform(x_train), columns=x_train.columns)
x_const_train = pd.DataFrame(imp_mode.transform(x_train), columns=x_train.columns)

### Multivariate Methods

We will demonstrate K-Nearest Neighbor.

#### K-Nearest Neighbor

In [45]:
from sklearn.impute import KNNImputer

In [46]:
imp_knn = KNNImputer(missing_values=np.nan, n_neighbors=5, weights='distance')

In [47]:
imp_knn.fit(x_train)

KNNImputer(weights='distance')

In [48]:
x_knn_train = pd.DataFrame(imp_knn.transform(x_train), columns=x_train.columns)

In [49]:
x_knn_train

Unnamed: 0,Year_Birth,Education,Age,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Country
0,-1.650033,-0.361766,1.650033,0.204136,1.044690,-0.413081,-0.920246,0.631417,0.248360,3.453241,...,0.460615,-0.217718,-0.678301,-0.291364,3.556270,3.461791,3.823486,-0.114852,-0.096976,-0.369705
1,0.507069,-0.923502,-0.507069,0.238822,-0.722343,1.020098,-0.354680,0.376756,-1.302912,-0.886217,...,-0.574497,-0.840549,0.269263,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,-2.007510
2,0.507069,-0.361766,-0.507069,0.238822,0.080891,-0.827691,0.930522,-1.461551,1.627269,-0.486415,...,-0.574497,-0.217718,-0.143624,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,1.168530
3,0.233261,1.388575,-0.233261,1.165613,-0.243518,1.020098,0.930522,-1.382197,-0.820294,-0.792833,...,-0.860511,-0.840549,0.682150,3.432132,-0.281193,-0.288868,-0.261541,8.706866,-0.096976,0.261090
4,0.009276,0.513405,-0.009276,1.165613,-0.296516,1.020098,0.930522,-1.099497,-0.475567,-0.416377,...,-0.574497,-0.529134,1.095038,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,-0.646350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787,-0.903344,-0.361766,0.903344,0.238822,0.591632,-0.827691,-0.920246,1.663023,-0.613458,1.999945,...,2.185802,2.273602,-0.143624,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.714810
1788,1.253758,0.040389,-1.253758,0.172485,0.781985,-0.827691,-0.920246,-1.054861,0.248360,2.160450,...,2.875877,1.609768,-1.382286,3.432132,-0.281193,3.461791,3.823486,-0.114852,-0.096976,0.714810
1789,0.341138,-0.361766,-0.341138,-0.687968,-1.045286,1.020098,-0.248973,-1.010224,-1.578694,-0.845361,...,-0.840238,-0.529134,0.269263,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,-2.007510
1790,-1.152241,-0.361766,1.152241,1.165613,-0.372132,1.020098,0.930522,1.251373,-1.440803,-0.337584,...,-0.574497,0.093697,1.095038,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.714810


#### Iterative Methods

We can generate missing data using another classifier.

In [50]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor

In [51]:
imp_linear = IterativeImputer(max_iter=100, random_state=0, estimator=BayesianRidge())
imp_tree = IterativeImputer(max_iter=100, random_state=0, estimator=DecisionTreeRegressor(max_features='sqrt', random_state=0))
# imp_forest = IterativeImputer(max_iter = 25, random_state=0, estimator=RandomForestRegressor(n_estimators=20, random_state=0))

In [52]:
imp_linear = IterativeImputer(max_iter=100, random_state=0, estimator=BayesianRidge())

In [53]:
imp_linear.fit(x_train)

IterativeImputer(estimator=BayesianRidge(), max_iter=100, random_state=0)

In [None]:
imp_tree.fit(x_train)

In [55]:
# imp_forest.fit(x_train)

In [56]:
x_linear_train = pd.DataFrame(imp_linear.transform(x_train), columns=x_train.columns)
x_tree_train = pd.DataFrame(imp_tree.transform(x_train), columns=x_train.columns)
# x_forest_train = pd.DataFrame(imp_forest.transform(x_train), columns=x_train.columns)

### Comparison

In [57]:
print('Metric: Mean Absolute Error (MAE)\n')
print(f'Imputing by Mean: {((np.abs(x_train_orig - x_mean_train))).mean().mean()}')
print(f'Imputing by Median: {((np.abs(x_train_orig - x_median_train))).mean().mean()}')
print(f'Imputing by Mode: {((np.abs(x_train_orig - x_mode_train))).mean().mean()}')
print(f'Imputing by Fixed Value: {((np.abs(x_train_orig - x_const_train))).mean().mean()}')
print(f'Imputing by K-Nearest Neighbour: {((np.abs(x_train_orig - x_knn_train))).mean().mean()}')
print(f'Imputing by Linear Reggression Regresyona Göre: {((np.abs(x_train_orig - x_linear_train))).mean().mean()}')
print(f'Imputing by Decision TreeKarar Ağacına Göre: {((np.abs(x_train_orig - x_tree_train))).mean().mean()}')

Metric: Mean Absolute Error (MAE)

Imputing by Mean: 0.07549410850711553
Imputing by Median: 0.06745018733519897
Imputing by Mode: 0.07851293293728567
Imputing by Fixed Value: 0.07851293293728567
Imputing by K-Nearest Neighbour: 0.05407783436426389
Imputing by Linear Reggression Regresyona Göre: 0.05832336308242825
Imputing by Decision TreeKarar Ağacına Göre: 0.060668118041679625


### Transforming Test Set

In [58]:
x_test = pd.DataFrame(imp_knn.transform(x_test), columns=x_test.columns)

In [59]:
x_test

Unnamed: 0,Year_Birth,Education,Age,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Country
0,-0.785443,-0.262449,0.785443,0.549109,0.886406,-0.827691,-0.920246,-1.530986,0.282833,0.806376,...,2.185802,0.093697,-1.382286,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,-0.076325
1,0.175207,0.513405,-0.175207,0.238822,-0.478017,1.020098,-0.920246,-0.132367,-0.321491,-0.670266,...,-0.574497,-1.151964,1.507925,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.714810
2,-0.571482,1.388575,0.571482,1.165613,0.101047,-0.827691,0.930522,-0.057973,0.075997,0.605015,...,0.460615,0.405112,-0.143624,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.261090
3,1.170792,0.513405,-1.170792,0.238822,-0.777142,1.020098,-0.920246,-1.263166,-0.709274,-0.857035,...,-0.919534,-0.529134,-0.143624,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,10.311806,0.261090
4,-0.654448,0.513405,0.654448,-0.687968,-0.199337,-0.827691,0.270171,-0.087731,1.558324,-0.635247,...,-0.678006,-0.529134,-0.969399,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,1.168530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,1.253758,-0.361766,-1.253758,-0.687968,0.752503,-0.827691,-0.498232,-1.441713,0.627560,-0.104123,...,1.150690,1.962187,-1.795173,-0.291364,-0.281193,-0.288868,0.513489,-0.114852,-0.096976,0.714810
444,0.838931,-0.361766,-0.838931,-0.687968,-1.688133,-0.827691,0.930522,1.553911,-0.317320,-0.855622,...,-0.919534,-1.151964,1.095038,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.714810
445,-0.737413,-0.361766,0.737413,-1.006738,-0.869899,1.020098,0.930522,-1.347479,0.378021,-0.865789,...,-0.919534,-0.529134,0.269263,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.261090
446,-0.571482,0.513405,0.571482,-0.687968,0.230244,-0.827691,0.930522,0.140413,-1.578694,-0.722794,...,-0.574497,-0.529134,-0.168440,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.714810


In [60]:
# Inverse transform to get the originals
pd.DataFrame(sc.inverse_transform(x_test), columns=x_test.columns)

Unnamed: 0,Year_Birth,Education,Age,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Country
0,-0.785443,-0.262449,0.785443,0.549109,0.886406,-0.827691,-0.920246,-1.530986,0.282833,0.806376,...,2.185802,0.093697,-1.382286,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,-0.076325
1,0.175207,0.513405,-0.175207,0.238822,-0.478017,1.020098,-0.920246,-0.132367,-0.321491,-0.670266,...,-0.574497,-1.151964,1.507925,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.714810
2,-0.571482,1.388575,0.571482,1.165613,0.101047,-0.827691,0.930522,-0.057973,0.075997,0.605015,...,0.460615,0.405112,-0.143624,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.261090
3,1.170792,0.513405,-1.170792,0.238822,-0.777142,1.020098,-0.920246,-1.263166,-0.709274,-0.857035,...,-0.919534,-0.529134,-0.143624,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,10.311806,0.261090
4,-0.654448,0.513405,0.654448,-0.687968,-0.199337,-0.827691,0.270171,-0.087731,1.558324,-0.635247,...,-0.678006,-0.529134,-0.969399,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,1.168530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,1.253758,-0.361766,-1.253758,-0.687968,0.752503,-0.827691,-0.498232,-1.441713,0.627560,-0.104123,...,1.150690,1.962187,-1.795173,-0.291364,-0.281193,-0.288868,0.513489,-0.114852,-0.096976,0.714810
444,0.838931,-0.361766,-0.838931,-0.687968,-1.688133,-0.827691,0.930522,1.553911,-0.317320,-0.855622,...,-0.919534,-1.151964,1.095038,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.714810
445,-0.737413,-0.361766,0.737413,-1.006738,-0.869899,1.020098,0.930522,-1.347479,0.378021,-0.865789,...,-0.919534,-0.529134,0.269263,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.261090
446,-0.571482,0.513405,0.571482,-0.687968,0.230244,-0.827691,0.930522,0.140413,-1.578694,-0.722794,...,-0.574497,-0.529134,-0.168440,-0.291364,-0.281193,-0.288868,-0.261541,-0.114852,-0.096976,0.714810


## Handling Class Imbalance

Class imbalance is having unequal number of samples in each class, especially when most of the data belong to only one class. This can be problematic since our model usually learns to choose the most popular class, since it maximizes accuracy.

Here are some methods to mitigate class imbalance problem:

- Shifting decision boundary: For example, we can shift decision boundary of 0.5 of logistic regression and set it to a higher of lower value according to our distribution.
- Oversampling: Sampling new data from the given dataset. There are a variety of methods for this having their advantages and disadvantages.
- Undersampling: Why not just throw some of the data?
- Generating new data: These are more complex.
  - Generative Adversarial Networks (GANs)
  - Normalizing Flows
  - Variational Autoencoders

### SMOTE

We add a new data point between two data points of the least represented class.

<figure>
    <img src="img/smote.png">
    <figcaption>Visualisation of SMOTE algorithm. <br> (Figure from: <i>https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets</i>)</figcaption>
</figure>

### Tomek

We remove close and different data point pairs in order to make decision boundary clearer.

<figure>
    <img src="img/tomek.png">
    <figcaption>Visualisation of Tomek algorithm. <br> (Figure from: <i>https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets</i>)</figcaption>
</figure>

In [61]:
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [63]:
smote_tomek = SMOTETomek(random_state=0)
smote_enn = SMOTEENN(random_state=0)
ros = RandomOverSampler(random_state=0)

In [64]:
x_resampled_tomek, y_resampled_tomek = smote_tomek.fit_resample(x_linear_train, y_train)
x_resampled_enn, y_resampled_enn = smote_enn.fit_resample(x_linear_train, y_train)
x_oversampled, y_oversampled = ros.fit_resample(x_linear_train, y_train)

In [65]:
clf = [LGBMClassifier(), LGBMClassifier(), LGBMClassifier(), LGBMClassifier()]

clf[0].fit(x_knn_train, y_train)
clf[1].fit(x_oversampled, y_oversampled)
clf[2].fit(x_resampled_tomek, y_resampled_tomek)
clf[3].fit(x_resampled_enn, y_resampled_enn)

print('Base')
print(classification_report(y_test, clf[0].predict(x_test)))

print('Random Oversampling')
print(classification_report(y_test, clf[1].predict(x_test)))

print('SMOTE Tomek')
print(classification_report(y_test, clf[2].predict(x_test)))

print('SMOTE ENN')
print(classification_report(y_test, clf[0].predict(x_test)))

Base
              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93       381
         1.0       0.68      0.31      0.43        67

    accuracy                           0.88       448
   macro avg       0.78      0.64      0.68       448
weighted avg       0.86      0.88      0.85       448

Random Oversampling
              precision    recall  f1-score   support

         0.0       0.90      0.95      0.92       381
         1.0       0.58      0.37      0.45        67

    accuracy                           0.87       448
   macro avg       0.74      0.66      0.69       448
weighted avg       0.85      0.87      0.85       448

SMOTE Tomek
              precision    recall  f1-score   support

         0.0       0.91      0.94      0.92       381
         1.0       0.58      0.45      0.50        67

    accuracy                           0.87       448
   macro avg       0.74      0.70      0.71       448
weighted avg       0.86      0.87    

In [66]:
clf = [LogisticRegression(), LogisticRegression(), LogisticRegression(), LogisticRegression()]

clf[0].fit(x_knn_train, y_train)
clf[1].fit(x_oversampled, y_oversampled)
clf[2].fit(x_resampled_tomek, y_resampled_tomek)
clf[3].fit(x_resampled_enn, y_resampled_enn)

print('Base')
print(classification_report(y_test, clf[0].predict(x_test)))

print('Random Oversampling')
print(classification_report(y_test, clf[1].predict(x_test)))

print('SMOTE Tomek')
print(classification_report(y_test, clf[2].predict(x_test)))

print('SMOTE ENN')
print(classification_report(y_test, clf[0].predict(x_test)))

Base
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92       381
         1.0       0.58      0.21      0.31        67

    accuracy                           0.86       448
   macro avg       0.73      0.59      0.61       448
weighted avg       0.83      0.86      0.83       448

Random Oversampling
              precision    recall  f1-score   support

         0.0       0.92      0.82      0.87       381
         1.0       0.37      0.61      0.46        67

    accuracy                           0.79       448
   macro avg       0.65      0.72      0.67       448
weighted avg       0.84      0.79      0.81       448

SMOTE Tomek
              precision    recall  f1-score   support

         0.0       0.92      0.83      0.87       381
         1.0       0.38      0.60      0.47        67

    accuracy                           0.80       448
   macro avg       0.65      0.71      0.67       448
weighted avg       0.84      0.80    

In [67]:
print('Shifting Decision Boundary')
print(classification_report(y_test,
                            np.array(
                                pd.DataFrame(clf[0].predict_proba(x_test)).applymap(
                                    lambda x: 1 if x>0.4 else 0)[1]
                            )))

Shifting Decision Boundary
              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92       381
         1.0       0.57      0.30      0.39        67

    accuracy                           0.86       448
   macro avg       0.73      0.63      0.66       448
weighted avg       0.84      0.86      0.84       448

