In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFE
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from imblearn.combine import SMOTETomek
from plotnine import ggplot, geom_point, aes, scale_color_cmap_d
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, roc_auc_score

## Read Data

In [2]:
# Load the training CSV and use each row's ID_code as the DataFrame index.
train_data = pd.read_csv(
    "santander-customer-transaction-prediction/train.csv",
    index_col="ID_code",
)
# Loading test.csv is left disabled; only train.csv is read here.
#test_data = pd.read_csv("santander-customer-transaction-prediction/test.csv", index_col="ID_code")

In [3]:
# Work on a seeded random subset of 1000 rows.
# NOTE: this overwrites train_data with a sample of itself, so re-running the
# cell re-samples the already reduced frame; reload train.csv first.
train_data = train_data.sample(n=1000, random_state=0)

In [4]:
# Save the 1000-row subset; index=True writes the ID_code index as a column.
train_data.to_csv("santander-customer-transaction-prediction/train_subset.csv", index=True)

In [5]:
# Disabled check: reads the subset file back in, with ID_code as the index.
#a = pd.read_csv("santander-customer-transaction-prediction/train_subset.csv", index_col="ID_code")

In [6]:
# Separate the features from the "target" column and hold out 10% of the rows
# for validation. The split is seeded so it is repeatable.
# NOTE(review): the split is not stratified; with a rare positive class,
# passing stratify= may be worth considering.
X_tr, X_val, y_tr, y_val = train_test_split(
    train_data.drop(columns=["target"]),
    train_data["target"],
    test_size=0.1,
    random_state=0,
)

In [7]:
# Summary statistics for every feature column of the training split.
X_tr.describe()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0,...,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0,900.0
mean,10.749313,-1.699366,10.809094,6.845585,11.13636,-5.106177,5.412202,16.603393,0.265599,7.625283,...,3.210154,7.40982,1.924958,3.40401,17.881158,-0.072886,2.251376,8.89065,15.962148,-2.95169
std,2.993426,4.138201,2.67677,1.994094,1.630443,7.891318,0.856982,3.387487,3.407973,1.186798,...,4.428082,3.066433,1.403487,3.974253,3.101367,1.469275,5.449241,0.910679,3.054123,10.215189
min,3.1262,-11.3002,4.2884,1.129,6.884,-26.1247,3.0242,8.1933,-8.5994,4.2877,...,-9.3442,-0.3064,-2.843,-7.5681,10.8706,-4.1952,-12.1077,6.3433,7.7002,-32.9761
25%,8.536675,-4.953475,8.76225,5.281,9.88915,-11.168925,4.7882,14.09675,-2.28775,6.761425,...,0.2207,5.04435,0.95935,0.60825,15.46265,-1.077425,-2.0924,8.234375,13.883925,-10.8297
50%,10.586,-1.6127,10.75815,6.86445,11.12695,-4.7331,5.44245,16.53405,0.3887,7.7042,...,3.3491,7.2493,1.9255,3.5375,17.87435,-0.08585,2.14575,8.84675,16.0453,-2.22485
75%,12.832975,1.260175,12.62895,8.3298,12.33945,0.909025,5.990425,18.969325,3.07185,8.5916,...,6.0922,9.598925,2.93755,6.447025,20.24585,0.905825,6.81215,9.576375,18.258275,4.825025
max,19.0016,8.415,17.9277,11.6094,15.2273,14.1386,7.7188,26.2458,8.5218,10.2378,...,16.4687,16.3365,5.7247,16.6146,25.9327,3.3743,14.6042,11.3589,23.4148,25.0944


In [8]:
# Total number of missing values: per-column NaN counts, summed over all columns.
X_tr.isna().sum().sum()

0

In [9]:
# Summary statistics of the training labels.
y_tr.describe()

count    900.000000
mean       0.088889
std        0.284742
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: target, dtype: float64

In [10]:
# Distinct label values present in the training split.
y_tr.unique()

array([0, 1])

In [11]:
# Fraction of training rows labelled 1.
(y_tr==1).sum() / len(y_tr)

0.08888888888888889

In [12]:
# Fraction of training rows labelled 0.
(y_tr==0).sum() / len(y_tr)

0.9111111111111111

## Imbalance Problem

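Only about 9% of the training rows belong to the positive class, so we rebalance the training split (SMOTE oversampling combined with Tomek-link cleaning) before doing any feature selection.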

In [13]:
# Combined resampler (SMOTE oversampling plus Tomek-link cleaning), seeded so
# the resampling is repeatable.
resampler = SMOTETomek(random_state=0)

In [14]:
# Rebalance the training split only; X_val / y_val are not resampled.
# NOTE(review): resampling runs on the unscaled features — confirm that is
# intended, since the scaler is only fitted further down.
X_tr, y_tr = resampler.fit_resample(X_tr, y_tr)

In [15]:
# Checkpoint the resampled training data and the validation split to disk.
# The training files are written without their index column.
X_tr.to_csv("santander-customer-transaction-prediction/smote_train.csv", index=False)
y_tr.to_csv("santander-customer-transaction-prediction/smote_train_labels.csv", index=False)
# The validation files use the default index=True, so their index is written
# as the first column.
X_val.to_csv("santander-customer-transaction-prediction/X_val.csv")
y_val.to_csv("santander-customer-transaction-prediction/y_val.csv")

In [16]:
# Reload the resampled training features from the CSV checkpoint.
X_tr = pd.read_csv("santander-customer-transaction-prediction/smote_train.csv")

In [17]:
# Reload the resampled labels. read_csv returns a DataFrame, so y_tr is now a
# one-column frame ("target") rather than a Series.
# NOTE(review): scikit-learn generally expects 1-D labels; consider
# .squeeze("columns") if shape warnings show up downstream.
y_tr = pd.read_csv("santander-customer-transaction-prediction/smote_train_labels.csv")

In [18]:
# Share of rows labelled 1 after resampling (one value per column of y_tr).
(y_tr==1).sum() / len(y_tr)

target    0.5
dtype: float64

In [19]:
# Share of rows labelled 0 after resampling (one value per column of y_tr).
(y_tr==0).sum() / len(y_tr)

target    0.5
dtype: float64

In [20]:
# Number of training rows after resampling.
len(X_tr)

1640

## Feature Extraction and Selection

In [21]:
# PCA that uses scikit-learn's "mle" option to choose the number of components
# automatically instead of taking a fixed count.
pca = PCA(n_components="mle")

In [22]:
# Fit PCA on the training features and project them. PCA is unsupervised and
# ignores any labels it is given, so only the feature matrix is passed.
X_tr_pca = pca.fit_transform(X_tr)

In [23]:
# Shape of the projection: (rows, retained components).
X_tr_pca.shape

(1640, 199)

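PCA with `n_components="mle"` keeps 199 of the 200 dimensions, so it gives essentially no dimensionality reduction here, probably because the features are already decorrelated (the data may already have been PCA-transformed). We set `X_tr_pca` to `None` so the unused projection can be garbage-collected.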

In [24]:
# Drop this reference to the PCA projection so its memory can be reclaimed.
X_tr_pca = None

1. Get rid of features with low variance.

In [25]:
# Variance cut-off of p * (1 - p) with p = 0.8, i.e. roughly 0.16.
# Features whose training variance falls below it are dropped.
# NOTE(review): p * (1 - p) is the variance of a Bernoulli variable; for these
# continuous features it acts as a plain numeric threshold.
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))

In [26]:
# Learn the per-feature variances from the training features.
sel.fit(X_tr)

VarianceThreshold(threshold=0.15999999999999998)

In [27]:
# Keep only the columns flagged by the fitted variance selector.
# get_support() returns one boolean per column; .loc selects those columns
# directly. This avoids transposing the whole frame twice, which copies the
# data and would upcast columns of mixed dtypes.
X_tr = X_tr.loc[:, sel.get_support()]

2. Before using other feature selection techniques, scale the data so that every feature is on the same scale.

In [28]:
# Min-max scaler with default settings (each feature is mapped to [0, 1]).
scaler = MinMaxScaler()

In [29]:
# Learn each feature's minimum and maximum from the training features.
scaler.fit(X_tr)

MinMaxScaler()

In [30]:
# Apply the fitted scaler. transform() returns a bare array, so wrap it in a
# DataFrame again to keep the column labels.
X_tr = pd.DataFrame(
    data=scaler.transform(X_tr),
    columns=X_tr.columns,
)

In [31]:
# Check the scaled features: every column should now have min 0 and max 1.
X_tr.describe()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
count,1640.0,1640.0,1640.0,1640.0,1640.0,1640.0,1640.0,1640.0,1640.0,1640.0,...,1640.0,1640.0,1640.0,1640.0,1640.0,1640.0,1640.0,1640.0,1640.0,1640.0
mean,0.484298,0.486401,0.497509,0.549312,0.517306,0.524165,0.520822,0.471642,0.509842,0.55499,...,0.516188,0.466043,0.544112,0.44347,0.456607,0.547979,0.553679,0.49345,0.516609,0.51764
std,0.167227,0.201318,0.188375,0.170031,0.178221,0.180053,0.170472,0.173872,0.184537,0.179973,...,0.170438,0.175952,0.143647,0.147735,0.197643,0.179881,0.190066,0.165676,0.193825,0.156429
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.363533,0.338053,0.348672,0.421166,0.382618,0.397562,0.402079,0.345408,0.375639,0.434958,...,0.401464,0.330817,0.448658,0.352724,0.307797,0.427594,0.412258,0.375797,0.375994,0.408093
50%,0.476199,0.488908,0.491266,0.556948,0.521628,0.53207,0.527925,0.472954,0.513921,0.57156,...,0.512856,0.452264,0.53882,0.442849,0.463522,0.54549,0.557166,0.478206,0.518427,0.518026
75%,0.598695,0.625344,0.630031,0.670537,0.647037,0.656041,0.640382,0.592484,0.655949,0.681808,...,0.640426,0.587421,0.639269,0.541548,0.604461,0.664187,0.6976,0.605081,0.661934,0.635635
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


3. Now we can try different feature selection methods.

### Univariate Selection

#### Select K-Best

In [32]:
# Univariate selector that keeps the 60 features with the highest chi-squared
# score. chi2 needs non-negative inputs, which holds for the min-max-scaled
# features.
kbest = SelectKBest(chi2, k=60)

In [33]:
# Fit the selector. The transformed array is only displayed here, not stored.
kbest.fit_transform(X_tr, y_tr)

array([[0.65328866, 0.54401678, 0.63210424, ..., 0.80316279, 0.45199106,
        0.56533615],
       [0.67702888, 0.53155253, 0.42276865, ..., 0.21705513, 0.38208803,
        0.54240769],
       [0.84547594, 0.65384058, 0.36466253, ..., 0.3913426 , 0.7277524 ,
        0.59492384],
       ...,
       [0.45923902, 0.6277355 , 0.40210714, ..., 0.32162273, 0.49167386,
        0.49846152],
       [0.39781268, 0.39334116, 0.27244896, ..., 0.37910255, 0.53382775,
        0.72293177],
       [0.60216907, 0.62058437, 0.25656059, ..., 0.5529013 , 0.56802889,
        0.54777258]])

In [34]:
# Build the k-best feature frame with the original column labels.
# get_support() returns one boolean per column of X_tr; .loc picks those
# columns directly, without transposing (and copying) the frame twice.
X_kbest = X_tr.loc[:, kbest.get_support()]

In [35]:
# Preview the frame of features selected by SelectKBest.
X_kbest

Unnamed: 0,var_2,var_11,var_13,var_15,var_17,var_18,var_21,var_22,var_23,var_34,...,var_168,var_171,var_177,var_179,var_180,var_181,var_187,var_188,var_190,var_197
0,0.653289,0.544017,0.632104,0.482818,0.645402,0.519969,0.523091,0.267472,0.955934,0.859660,...,0.483865,0.535553,0.602423,0.440458,0.382184,0.155034,0.771335,0.803163,0.451991,0.565336
1,0.677029,0.531553,0.422769,0.376011,0.300050,0.748519,0.251143,0.375638,0.462491,0.456293,...,0.302440,0.672559,0.225398,0.700106,0.479931,0.438735,0.352567,0.217055,0.382088,0.542408
2,0.845476,0.653841,0.364663,0.426987,0.377832,0.643604,0.511431,0.246256,0.827465,0.138549,...,0.729471,0.688232,0.727046,0.075255,0.569042,0.371873,0.245195,0.391343,0.727752,0.594924
3,0.198801,0.595995,0.468367,0.471728,0.282762,0.712845,0.562795,0.367903,0.358018,0.768102,...,0.315152,0.410528,0.520807,0.716374,0.503084,0.179391,0.212808,0.315002,0.752879,0.792208
4,0.612766,0.527613,0.448529,0.357401,0.626412,0.756256,0.407597,0.538442,0.468615,0.574351,...,0.684199,0.335712,0.572548,0.198269,0.618986,0.793864,0.655240,0.910755,0.597341,0.386215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1635,0.587817,0.295565,0.682153,0.433794,0.776126,0.482821,0.525354,0.578434,0.493749,0.306212,...,0.376378,0.709685,0.581343,0.214691,0.605687,0.456064,0.827138,0.495641,0.725918,0.694377
1636,0.524419,0.297729,0.348095,0.861415,0.535849,0.842317,0.416072,0.670583,0.196409,0.387376,...,0.413970,0.433390,0.662657,0.194689,0.671004,0.409329,0.636664,0.573869,0.670634,0.460644
1637,0.459239,0.627735,0.402107,0.693478,0.803933,0.571030,0.532583,0.598332,0.254117,0.457746,...,0.530926,0.612216,0.183454,0.434856,0.826271,0.653310,0.186559,0.321623,0.491674,0.498462
1638,0.397813,0.393341,0.272449,0.460021,0.743503,0.585031,0.371359,0.507367,0.268148,0.549034,...,0.684780,0.723972,0.360134,0.293393,0.578818,0.351192,0.496661,0.379103,0.533828,0.722932


In [36]:
# Persist the k-best feature matrix without its index column.
X_kbest.to_csv("santander-customer-transaction-prediction/X_kbest.csv", index=False)

In [37]:
# Read the file back to check it round-trips; the result is only displayed.
pd.read_csv("santander-customer-transaction-prediction/X_kbest.csv")

Unnamed: 0,var_2,var_11,var_13,var_15,var_17,var_18,var_21,var_22,var_23,var_34,...,var_168,var_171,var_177,var_179,var_180,var_181,var_187,var_188,var_190,var_197
0,0.653289,0.544017,0.632104,0.482818,0.645402,0.519969,0.523091,0.267472,0.955934,0.859660,...,0.483865,0.535553,0.602423,0.440458,0.382184,0.155034,0.771335,0.803163,0.451991,0.565336
1,0.677029,0.531553,0.422769,0.376011,0.300050,0.748519,0.251143,0.375638,0.462491,0.456293,...,0.302440,0.672559,0.225398,0.700106,0.479931,0.438735,0.352567,0.217055,0.382088,0.542408
2,0.845476,0.653841,0.364663,0.426987,0.377832,0.643604,0.511431,0.246256,0.827465,0.138549,...,0.729471,0.688232,0.727046,0.075255,0.569042,0.371873,0.245195,0.391343,0.727752,0.594924
3,0.198801,0.595995,0.468367,0.471728,0.282762,0.712845,0.562795,0.367903,0.358018,0.768102,...,0.315152,0.410528,0.520807,0.716374,0.503084,0.179391,0.212808,0.315002,0.752879,0.792208
4,0.612766,0.527613,0.448529,0.357401,0.626412,0.756256,0.407597,0.538442,0.468615,0.574351,...,0.684199,0.335712,0.572548,0.198269,0.618986,0.793864,0.655240,0.910755,0.597341,0.386215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1635,0.587817,0.295565,0.682153,0.433794,0.776126,0.482821,0.525354,0.578434,0.493749,0.306212,...,0.376378,0.709685,0.581343,0.214691,0.605687,0.456064,0.827138,0.495641,0.725918,0.694377
1636,0.524419,0.297729,0.348095,0.861415,0.535849,0.842317,0.416072,0.670583,0.196409,0.387376,...,0.413970,0.433390,0.662657,0.194689,0.671004,0.409329,0.636664,0.573869,0.670634,0.460644
1637,0.459239,0.627735,0.402107,0.693478,0.803933,0.571030,0.532583,0.598332,0.254117,0.457746,...,0.530926,0.612216,0.183454,0.434856,0.826271,0.653310,0.186559,0.321623,0.491674,0.498462
1638,0.397813,0.393341,0.272449,0.460021,0.743503,0.585031,0.371359,0.507367,0.268148,0.549034,...,0.684780,0.723972,0.360134,0.293393,0.578818,0.351192,0.496661,0.379103,0.533828,0.722932


#### Selection by Elimination

In [38]:
# Recursive feature elimination around a seeded decision tree: remove up to 5
# features per iteration until 60 remain.
selector = RFE(
    estimator=DecisionTreeClassifier(random_state=0),
    n_features_to_select=60,
    step=5,
)

In [39]:
# Run the elimination on the scaled training features and resampled labels.
selector.fit(X_tr, y_tr)

RFE(estimator=DecisionTreeClassifier(random_state=0), n_features_to_select=60,
    step=5)

In [40]:
# Build the RFE feature frame with the original column labels.
# get_support() returns one boolean per column of X_tr; .loc picks those
# columns directly, without transposing (and copying) the frame twice.
X_selector = X_tr.loc[:, selector.get_support()]

In [41]:
# Preview the frame of features kept by RFE.
X_selector

Unnamed: 0,var_14,var_15,var_18,var_24,var_26,var_27,var_28,var_30,var_32,var_33,...,var_181,var_182,var_186,var_190,var_191,var_193,var_194,var_195,var_197,var_199
0,0.832107,0.482818,0.519969,0.654554,0.382916,0.488062,0.464349,0.259909,0.276313,0.296242,...,0.155034,0.471925,0.340159,0.451991,0.363152,0.591257,0.555826,0.821388,0.565336,0.580937
1,0.260986,0.376011,0.748519,0.393912,0.519113,0.013040,0.456417,0.728356,0.512316,0.512691,...,0.438735,0.366365,0.798617,0.382088,0.734055,0.445951,0.233586,0.275804,0.542408,0.575902
2,0.768836,0.426987,0.643604,0.705376,0.959453,0.478063,0.625453,0.704226,0.730572,0.269379,...,0.371873,0.332814,0.480033,0.727752,0.406143,0.504034,0.454644,0.679569,0.594924,0.503533
3,0.294807,0.471728,0.712845,0.164135,0.542182,0.595127,0.451935,0.574445,0.640107,0.331187,...,0.179391,0.680308,0.156545,0.752879,0.295423,0.658256,0.195710,0.968941,0.792208,0.567863
4,0.505265,0.357401,0.756256,0.128530,0.300051,0.243376,0.365780,0.464631,0.593780,0.650692,...,0.793864,0.625663,0.433713,0.597341,0.078460,0.303130,0.762218,0.766101,0.386215,0.598676
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1635,0.312690,0.433794,0.482821,0.573861,0.377901,0.192910,0.919510,0.784648,0.555195,0.245312,...,0.456064,0.734464,0.432045,0.725918,0.456621,0.493752,0.265741,0.584657,0.694377,0.413051
1636,0.769454,0.861415,0.842317,0.668984,0.221195,0.511442,0.516696,0.522246,0.746499,0.306226,...,0.409329,0.608600,0.324784,0.670634,0.622087,0.447885,0.407704,0.447768,0.460644,0.536302
1637,0.165300,0.693478,0.571030,0.387558,0.463918,0.379172,0.672593,0.455297,0.756914,0.310537,...,0.653310,0.683485,0.682166,0.491674,0.718138,0.490428,0.562601,0.389655,0.498462,0.419733
1638,0.181999,0.460021,0.585031,0.538117,0.489204,0.501078,0.469360,0.658681,0.307196,0.392536,...,0.351192,0.429149,0.500241,0.533828,0.440534,0.359601,0.697474,0.410029,0.722932,0.561837


In [42]:
# Persist the RFE feature matrix without its index column.
X_selector.to_csv("santander-customer-transaction-prediction/X_selector.csv", index=False)

### Comparison

In [43]:
# Features chosen by SelectKBest but not by RFE.
set(X_kbest.columns).difference(X_selector.columns)

{'var_109',
 'var_11',
 'var_112',
 'var_119',
 'var_120',
 'var_122',
 'var_124',
 'var_13',
 'var_131',
 'var_136',
 'var_139',
 'var_141',
 'var_144',
 'var_145',
 'var_147',
 'var_150',
 'var_159',
 'var_168',
 'var_17',
 'var_187',
 'var_188',
 'var_2',
 'var_21',
 'var_22',
 'var_23',
 'var_34',
 'var_38',
 'var_50',
 'var_52',
 'var_55',
 'var_58',
 'var_61',
 'var_65',
 'var_73',
 'var_76',
 'var_81',
 'var_97',
 'var_98'}

In [44]:
# Features chosen by RFE but not by SelectKBest.
set(X_selector.columns).difference(X_kbest.columns)

{'var_110',
 'var_111',
 'var_114',
 'var_14',
 'var_146',
 'var_156',
 'var_163',
 'var_164',
 'var_165',
 'var_167',
 'var_176',
 'var_182',
 'var_186',
 'var_191',
 'var_193',
 'var_194',
 'var_195',
 'var_199',
 'var_24',
 'var_26',
 'var_27',
 'var_28',
 'var_30',
 'var_32',
 'var_33',
 'var_37',
 'var_39',
 'var_40',
 'var_45',
 'var_46',
 'var_54',
 'var_56',
 'var_60',
 'var_63',
 'var_64',
 'var_70',
 'var_85',
 'var_96'}

### Save Column Names

In [45]:
import pickle

In [46]:
# Serialize the labels of the k-best columns (a pandas Index) to disk.
# NOTE(review): unpickling presumably needs a compatible pandas install; a
# plain list would be more portable — check what the consumers expect.
with open("santander-customer-transaction-prediction/kbest_column_names", "wb") as f:
    pickle.dump(X_kbest.columns, f)

In [47]:
# Serialize the labels of the RFE-selected columns (a pandas Index) to disk.
# NOTE(review): unpickling presumably needs a compatible pandas install; a
# plain list would be more portable — check what the consumers expect.
with open("santander-customer-transaction-prediction/selector_column_names", "wb") as f:
    pickle.dump(X_selector.columns, f)

## Baselines

In [48]:
# Three trivial baseline classifiers, all seeded:
# "stratified" draws random predictions following the training class distribution.
stratified_base = DummyClassifier(strategy="stratified", random_state=0)
# "uniform" draws random predictions with equal probability per class.
uniform_base = DummyClassifier(strategy="uniform", random_state=0)
# "most_frequent" always predicts the most frequent training class.
frequent_base = DummyClassifier(strategy="most_frequent", random_state=0)

In [49]:
# Stratified baseline. y_tr is the resampled (balanced) label set, so the class
# distribution it learns is 50/50 rather than the original skewed one.
stratified_base.fit(X_tr, y_tr)
# DummyClassifier ignores the feature values, which is why the untransformed
# X_val can be passed here. A real model would need X_val to go through the
# same column selection and scaling as X_tr.
print(classification_report(y_val, stratified_base.predict(X_val)))
# ROC AUC computed from the predicted probability of class 1.
print(roc_auc_score(y_val, stratified_base.predict_proba(X_val)[:, 1]))

              precision    recall  f1-score   support

           0       0.92      0.49      0.64        91
           1       0.10      0.56      0.17         9

    accuracy                           0.50       100
   macro avg       0.51      0.53      0.40       100
weighted avg       0.84      0.50      0.60       100

0.525030525030525


In [50]:
# Uniform baseline: random predictions with equal probability for each class.
uniform_base.fit(X_tr, y_tr)
# DummyClassifier ignores the feature values, which is why the untransformed
# X_val can be passed here. A real model would need X_val to go through the
# same column selection and scaling as X_tr.
print(classification_report(y_val, uniform_base.predict(X_val)))
# ROC AUC computed from the predicted probability of class 1.
print(roc_auc_score(y_val, uniform_base.predict_proba(X_val)[:, 1]))

              precision    recall  f1-score   support

           0       0.91      0.44      0.59        91
           1       0.09      0.56      0.15         9

    accuracy                           0.45       100
   macro avg       0.50      0.50      0.37       100
weighted avg       0.84      0.45      0.55       100

0.5


In [51]:
# Majority-class baseline. DummyClassifier ignores the feature values, which is
# why the untransformed X_val can be scored directly. A real model would need
# X_val to go through the same column selection and scaling as X_tr.
frequent_base.fit(X_tr, y_tr)
# This baseline predicts a single class for every row, so precision for the
# other class is 0/0. zero_division=0 reports it as 0.0, the same value the
# default falls back to, without emitting an UndefinedMetricWarning.
print(classification_report(y_val, frequent_base.predict(X_val), zero_division=0))
# ROC AUC computed from the predicted probability of class 1.
print(roc_auc_score(y_val, frequent_base.predict_proba(X_val)[:, 1]))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95        91
           1       0.00      0.00      0.00         9

    accuracy                           0.91       100
   macro avg       0.46      0.50      0.48       100
weighted avg       0.83      0.91      0.87       100

0.5


