# Redo data preprocessing

The results from the previous sections were not promising, so here I redo the data preprocessing.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.combine import SMOTETomek
from time import time



## 1. Load data

In [3]:
# Load the raw training set from train.csv (relative to the working directory)
# and print its (rows, columns) shape as a quick sanity check.
df = pd.read_csv("train.csv")
print(df.shape)

(200000, 202)


In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [5]:
# Positional split of the raw frame: every column from position 2 onward is
# kept as feature data, and the column at position 1 is taken as the label.
# Column 0 is left out of both.
df_data, df_label = df[df.columns[2:]], df[df.columns[1]]

In [6]:
# Preview the feature-only frame.
df_data.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [7]:
# Wrap the label column in a DataFrame so it can later be concatenated with
# the feature frame and written out like one.
df_label = pd.DataFrame(df_label)
df_label.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


In [0]:
# df_label.to_csv("data/label_nosmote.csv")

## 2. Preprocessing

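+ Add row-wise sum / min / max / mean / std / skew / kurtosis / median / moving average
    + For moving average, read np.ma.average for more details (note: the moving average is not computed by `basic_preprocessing` below)
+ Variance of each column (also not computed by the code below)
+ Rounded copies of every feature (1 and 2 decimals), added below as `r1_*` / `r2_*` columns
+ There is also a "magic feature"; read https://www.kaggle.com/c/santander-customer-transaction-prediction/discussion/87486#latest-506429 for more details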


In [18]:
# add some basic columns
def basic_preprocessing(df):
    """Append row-wise summary statistics and rounded copies of each column.

    The frame passed in is modified in place and is also returned.

    New columns, in order:
      * 'sum', 'min', 'max', 'mean', 'std', 'skew', 'kurt', 'med' -- each
        computed across the columns that existed when the function was called.
      * for every such column ``c``: 'r1_' + c and 'r2_' + c, holding ``c``
        rounded to 1 and 2 decimals respectively.

    Column labels must be strings, because the rounded-feature names are built
    by string concatenation.
    """
    # Capture the column set up front so that statistics added below never
    # feed into the statistics that follow them.
    source_cols = df.columns

    # (output column name, DataFrame reduction method) pairs, in output order.
    row_reductions = (
        ('sum', 'sum'),
        ('min', 'min'),
        ('max', 'max'),
        ('mean', 'mean'),
        ('std', 'std'),
        ('skew', 'skew'),
        ('kurt', 'kurtosis'),
        ('med', 'median'),
    )
    for out_name, method_name in row_reductions:
        reduce_rows = getattr(df[source_cols], method_name)
        df[out_name] = reduce_rows(axis=1)

    # Rounded features: r1_<col> then r2_<col> for each source column.
    for name in source_cols:
        for digits in (1, 2):
            df['r{}_'.format(digits) + name] = np.round(df[name], digits)

    return df

In [9]:
# basic_preprocessing adds columns to the frame it receives, so hand it a copy
# and leave df_data holding only the raw feature columns.
df_data_prep = basic_preprocessing(df_data.copy())
print(df_data_prep.shape)
df_data_prep.head()

(200000, 608)


Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,r1_var_195,r2_var_195,r1_var_196,r2_var_196,r1_var_197,r2_var_197,r1_var_198,r2_var_198,r1_var_199,r2_var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,-2.4,-2.4,7.9,7.88,8.6,8.56,12.8,12.78,-1.1,-1.09
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,2.0,2.03,8.1,8.13,8.8,8.79,18.4,18.36,2.0,1.95
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,3.1,3.14,-6.5,-6.52,8.3,8.27,14.7,14.72,0.4,0.4
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,-1.3,-1.27,-2.9,-2.93,10.3,10.29,18.0,17.97,-9.0,-9.0
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.5,-1.51,3.9,3.93,9.5,9.5,18.0,18.0,-8.8,-8.81


---
## 3. Split training and validation set

20% validation data

In [15]:
def split_train_val(df, train_path=None, val_path=None, n_train=160000):
    """Split a frame positionally into a training part and a validation part.

    The first ``n_train`` rows become the training set and the remaining rows
    the validation set. No shuffling is done. The two shapes are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    train_path, val_path : str or None
        Where to write each part as CSV. A part whose path is None is not
        written, which avoids serialising a large frame only to discard it.
    n_train : int, default 160000
        Number of leading rows assigned to the training set. The default
        matches the 80/20 split of the 200000-row data used in this notebook.

    Returns
    -------
    (df_train, df_val) : tuple of pandas.DataFrame
    """
    # .iloc makes the positional intent explicit regardless of the index type.
    df_train, df_val = df.iloc[:n_train], df.iloc[n_train:]
    print(df_train.shape, df_val.shape)
    if train_path is not None:
        df_train.to_csv(train_path)
    if val_path is not None:
        df_val.to_csv(val_path)
    return df_train, df_val

In [17]:
# Split the engineered features and write both parts to CSV.
train_path = "train_data.csv"
val_path = "val_data.csv"
train_data, val_data = split_train_val(df_data_prep, train_path, val_path)
# The same two variables are rebound for the label files, so after this cell
# train_path / val_path point at the label CSVs, not the feature CSVs.
train_path = "train_label.csv"
val_path = "val_label.csv"
train_label, val_label = split_train_val(df_label, train_path, val_path)

(160000, 608) (40000, 608)
(160000, 1) (40000, 1)


---

## 4. Resample and augmentation

In [0]:
# this function is modified from https://www.kaggle.com/jiweiliu/lgb-2-leaves-augment
def _shuffle_columns_independently(rows):
    """Return a copy of ``rows`` with every column permuted on its own.

    Uses NumPy's global random state (``np.random.shuffle``). The index array
    is shuffled cumulatively from one column to the next.
    """
    shuffled = rows.copy()
    ids = np.arange(shuffled.shape[0])
    for c in range(shuffled.shape[1]):
        np.random.shuffle(ids)
        # Index only the column being replaced instead of materialising a
        # row-permuted copy of the whole matrix for every column.
        shuffled[:, c] = shuffled[ids, c]
    return shuffled


def augment(df, features, t):
    """Augment a labelled frame with column-shuffled copies of its rows.

    Rows with ``target > 0`` are copied ``t`` times and rows with
    ``target == 0`` are copied ``t // 2`` times. Within every copy each feature
    column is shuffled independently among the rows of that class, so a
    synthetic row mixes feature values taken from different real rows of the
    same class.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a 'target' column.
    features : sequence of str
        Column names for the feature part of the result. There must be one
        name per non-target column of ``df``.
    t : int
        Number of shuffled copies of the positive rows. ``t // 2`` copies of
        the negative rows are added.

    Returns
    -------
    pandas.DataFrame
        Original rows first, then the positive copies, then the negative
        copies. Columns are ``features`` followed by 'target' (uint64). The
        index is a fresh 0..n-1 range.

    Notes
    -----
    Randomness comes from NumPy's global state; call ``np.random.seed``
    beforehand for reproducible output.
    """
    # Select features by name rather than "everything but the last column",
    # so a 'target' column that is not last cannot leak into the features.
    x = df.drop(columns=['target']).values
    y = df['target'].values

    positive_rows = x[y > 0]
    negative_rows = x[y == 0]

    # All positive copies are drawn before any negative copy.
    xs = [_shuffle_columns_independently(positive_rows) for _ in range(t)]
    xn = [_shuffle_columns_independently(negative_rows) for _ in range(t // 2)]

    n_pos = sum(part.shape[0] for part in xs)
    n_neg = sum(part.shape[0] for part in xn)

    # Stack everything in one call: np.vstack raises on an empty list, which
    # used to happen for the negative copies whenever t < 2.
    x = np.vstack([x] + xs + xn)
    y = np.concatenate([y, np.ones(n_pos), np.zeros(n_neg)]).astype(np.uint64)

    features = pd.DataFrame(x, columns=features)
    labels = pd.DataFrame(y, columns=['target'])
    combined = pd.concat([features, labels], axis=1)
    return combined

In [0]:
def resample_and_augment(df_data, df_label, t=3, zero_fraction=0.75):
    """Rebalance a training set: augment the 1s and subsample the 0s.

    Features and labels are passed separately and joined column-wise here
    (aligned on index). Rows with ``target == 1`` go through ``augment``; a
    random ``zero_fraction`` of the rows with ``target == 0`` is kept. The two
    parts are then concatenated and shuffled. Progress counts are printed.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Feature columns only.
    df_label : pandas.DataFrame
        Frame with a 'target' column, sharing its index with ``df_data``.
    t : int, default 3
        Passed to ``augment`` as the number of shuffled copies of the 1s.
    zero_fraction : float, default 0.75
        Fraction of the 0 rows to keep.

    Returns
    -------
    (train_data_aug, train_label_aug)
        Feature DataFrame and 'target' Series of the combined, shuffled set.
        Kept 0 rows retain their original index labels while the augmented
        rows carry a fresh range index, so index labels may repeat.

    Notes
    -----
    The sampling calls are not seeded, so results vary between runs.
    """
    # combine data and target
    features = df_data.columns
    df_all = pd.concat([df_data, df_label], axis=1)
    df_ones = df_all[df_all['target'] == 1]
    df_zeros = df_all[df_all['target'] == 0]
    print("Original 1s {}, 0s {}".format(df_ones.shape[0], df_zeros.shape[0]))
    # augment 1s
    aug_ones = augment(df_ones, features, t)
    print("Now we have {} 1s".format(aug_ones.shape[0]))
    df_zeros_part = df_zeros.sample(frac=zero_fraction)
    print("part of 0s: {}".format(df_zeros_part.shape[0]))
    # combine and shuffle
    df_combine = pd.concat([df_zeros_part, aug_ones]).sample(frac=1)
    print("Combined: {}".format(df_combine.shape))
    # Plain assignment: the previous `train_data_aug, = ...` form unpacked the
    # frame's column labels and raised for any frame with more than one column.
    train_data_aug = df_combine.drop(columns=['target'])
    train_label_aug = df_combine.loc[:, 'target']
    return train_data_aug, train_label_aug

In [0]:
# Build the rebalanced training set from the training split only: the 1s are
# augmented with t=3 and 75% of the 0s are kept.
train_data_aug, train_label_aug = resample_and_augment(train_data, train_label, t=3, zero_fraction=0.75)
train_data_aug.shape

Original 1s 16049, 0s 143951
Now we have 64196 1s
part of 0s: 107963
Combined: (172159, 609)


(172159, 608)

In [0]:
# Wrap the augmented labels in a DataFrame before previewing them.
train_label_aug = pd.DataFrame(train_label_aug)
train_label_aug.head()

Unnamed: 0,target
90426,0.0
36931,0.0
26352,0.0
3559,0.0
59031,1.0


In [0]:
# train_data_aug.to_csv("data/train_data_aug.csv")
# train_label_aug.to_csv("data/train_label_aug.csv")

## 5. PCA
Here I reduce the dimension with PCA; the code below uses its default of 200 components.

In [0]:
def dimension_reduction(df, n_components=200):
    """Project ``df`` onto its principal components.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Numeric data to fit the PCA on and transform.
    n_components : default 200
        Forwarded unchanged to ``PCA(n_components=...)``.

    Returns
    -------
    pandas.DataFrame
        One column per component actually returned by the PCA, named
        'var_pca_0', 'var_pca_1', ... The index is a fresh 0..n-1 range and
        does not carry over the index of ``df``.
    """
    pca = PCA(n_components=n_components)
    projected = pca.fit_transform(df)
    # Name the columns from the width of the result rather than from
    # n_components, so non-integer settings accepted by PCA also work.
    component_names = ['var_pca_{}'.format(i) for i in range(projected.shape[1])]
    df_processed = pd.DataFrame(projected, columns=component_names)
    return df_processed

In [0]:
# Project the engineered features with the function's default n_components.
# NOTE(review): the PCA is fitted on every row of df_data_prep, i.e. before the
# train/validation split in the next cell -- confirm this is intended.
df_data_pca = dimension_reduction(df_data_prep)
print(df_data_pca.shape)
df_data_pca.head()

(200000, 200)


Unnamed: 0,var_pca_0,var_pca_1,var_pca_2,var_pca_3,var_pca_4,var_pca_5,var_pca_6,var_pca_7,var_pca_8,var_pca_9,...,var_pca_190,var_pca_191,var_pca_192,var_pca_193,var_pca_194,var_pca_195,var_pca_196,var_pca_197,var_pca_198,var_pca_199
0,-105.860128,-5.400743,37.417087,3.935585,3.77736,4.637825,-18.165755,14.889708,-29.852264,6.936472,...,-0.087427,0.108608,-0.786297,0.853786,-0.416321,-0.124596,0.560179,0.046222,-0.049892,-0.254745
1,-54.355348,-70.943602,-28.984502,-30.938672,1.802316,-2.775054,2.662385,-21.835452,-10.575077,21.371416,...,0.06977,-0.95839,0.152369,0.402786,0.310333,-0.335566,-0.435642,0.488061,-0.054247,0.210107
2,110.48636,24.541343,3.941811,23.961675,-15.154132,-9.355453,4.965564,35.465815,4.862782,0.919669,...,0.24064,-0.281654,0.025007,0.780489,-0.774717,0.37703,-0.357587,0.188147,0.243865,-0.336344
3,69.503042,-10.319518,-26.484079,22.524163,-37.881464,-15.064502,27.688344,14.940958,14.542261,-4.162862,...,-0.109669,-0.399767,-0.412265,-0.472733,0.074901,-0.754768,-0.589803,-0.404724,-0.30419,-0.307361
4,13.596137,-93.729275,-43.654269,35.223109,44.294176,-10.231406,19.801837,-28.270031,31.285243,-27.998473,...,0.405957,2.634627,-0.538385,-0.123605,-0.193773,0.195125,-0.09121,-0.423971,-0.280581,0.246918


In [0]:
# Re-split using the PCA features. No paths are passed, so train_path and
# val_path are None inside split_train_val. This rebinds train_data, val_data,
# train_label and val_label, replacing the values from the earlier split cell.
train_data, val_data = split_train_val(df_data_pca)
train_label, val_label = split_train_val(df_label)

(160000, 200) (40000, 200)
(160000, 1) (40000, 1)
