# Kaggle Challenge
## Santander Value Prediction Challenge
### Predict the value of transactions for potential customers.
#### https://www.kaggle.com/c/santander-value-prediction-challenge/data

In [65]:
# Import Statements
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
import os

In [3]:
if not os.path.isdir("output"):
    os.mkdir("output")

In [4]:
def rmsle(actual, predicted):
    assert actual.shape == predicted.shape
    return np.sqrt(np.sum(np.square(np.log(actual + 1) - np.log(predicted + 1)), axis=0) / actual.shape[0])

In [5]:
def save(id_col, prediction, filename):
    assert id_col.shape == prediction.shape
    dump = np.asarray(np.transpose(np.vstack((id_col, prediction))))
    np.savetxt("output/%s" % (filename), dump, delimiter=',', fmt="%s", header="ID,target", comments="")

In [56]:
# Data
train = pd.read_csv("data/train.csv")
print("Train Dimensions: ", train.shape)

test = pd.read_csv("data/test.csv")
print("Test Dimensions: ", test.shape)

Train Dimensions:  (4459, 4993)
Test Dimensions:  (49342, 4992)


#### Data Manipulation

In [53]:
train.describe()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [7]:
# Check if any missing values
train.isnull().sum().sort_values(ascending=False) # No missing values

9fc776466    0
083640132    0
26e2c24e3    0
c3726f249    0
a682ef110    0
b452ba57e    0
08448215c    0
fee2d3bf9    0
c2d200f0e    0
f0aa40974    0
51cab733f    0
7af000ac2    0
1930cefda    0
6dae32858    0
c5aa7c575    0
225fa9d61    0
9f69ae59f    0
78c57d7cd    0
2af3668d1    0
293e2698e    0
020a817ab    0
197cb48af    0
2dfea2ff3    0
53a550111    0
de7063efa    0
f52409b3e    0
d0f65188c    0
65119177e    0
2d60e2f7a    0
58ad51def    0
            ..
b87e3036b    0
1af96abeb    0
875ad1c4a    0
22eb11620    0
30992dccd    0
a3c187bb0    0
adf03173b    0
33ed23348    0
6bf90e4f5    0
b0fcfeab8    0
ea3f3029c    0
adbe5fd20    0
21b0cdc34    0
7f0d863ba    0
9847e14d8    0
98d0d2971    0
4ceef6dbd    0
346e3d609    0
fbd6e0a0b    0
69831c049    0
8c8616b62    0
007d71f12    0
f1e0ada11    0
99fc30923    0
4d3fb93d9    0
8e978ee65    0
c30399758    0
e70581bed    0
b22eb2036    0
ID           0
Length: 4993, dtype: int64

In [22]:
# Since most columns have several zeros, check variance.
train_var = train.var().sort_values()
train_0_var_feat = train_var[train_var == 0] # get columns that have zero variance
print(train_0_var_feat.shape) # there are 256 columns that have zero variance
drop_cols = train_0_var_feat.keys() # get those zero variance columns
print(drop_cols)

(256,)
Index(['d9a8615f3', 'a3d5c2c2a', '1bd3a4e92', '611d81daa', 'da35e792b',
       '6d07828ca', '3d7780b1c', '96eb14eaf', '113fd0206', '754c502dd',
       ...
       '00fcf67e4', 'b281a62b9', '6c16efbb8', '643e42fcb', '217cd3838',
       '047ebc242', '6fa0b9dab', 'e5649663e', 'ae846f332', '64d036163'],
      dtype='object', length=256)


In [57]:
# Drop features
train_drop_id = train.drop("ID", axis=1) # drop ID columns
train_drop = train_drop_id.drop(drop_cols, axis=1) # drop zero variance columns
print(train_drop.shape)

x_test_drop_id = test.drop("ID", axis=1)
x_test_drop = x_test_drop_id.drop(drop_cols, axis=1)
print(x_test_drop.shape)

(4459, 633)
(49342, 632)


In [58]:
# Split into x and y
y_train = train_drop["target"]
train_drop_tar = train_drop.drop("target", axis=1)
print(train_drop_tar.shape)

(4459, 632)


In [27]:
# Compute proportion of zeros in each features
train_0_prop_feat = (train_drop_tar == 0).astype(int).sum(axis=0).sort_values(ascending=False) / train_drop_tar.shape[0]
drop_cols = train_0_prop_feat[train_0_prop_feat > 0.9].keys()
print(drop_cols)

Index(['3d390e8b9', 'cde9c35e8', 'f8d75792f', 'aa2e796b4', '969d32625',
       '8225f7e05', '71b637714', '6e598606d', '1530f6138', '29714c47d',
       ...
       '2b2b5187e', '11114a47a', '41bc25fef', '2cff4bf0c', 'cfe749e26',
       'bf59c51c3', '7b672b310', '9886b4d22', 'a3382e205', '009319104'],
      dtype='object', length=4359)


In [29]:
# Drop features
train_drop_2 = train_drop_tar.drop(drop_cols, axis=1)
print(train_drop_2.shape)

x_test_drop_2 = x_test_drop.drop(drop_cols, axis=1)
print(x_test_drop_2.shape)

(4459, 376)
(49342, 376)


In [38]:
# Since almost every column has high range, take logarithm.
train_drop_log = np.log(train_drop_2 + 1)
x_test_drop_log = np.log(x_test_drop_2 + 1)
# train_drop_log.describe()

In [41]:
# Split dataset
x_train = train_drop_log
x_test = x_test_drop_log

#### Machine Learning Models

##### Random Forest

In [51]:
rf = RandomForestRegressor()
rf.fit(x_train, y_train)
print("Training RMSLE: ", rmsle(y_train, rf.predict(x_train)))
y_predict = rf.predict(x_test)
save(test["ID"], y_predict, "asc.csv")

Training RMSLE:  1.0601824414406409


##### AdaBoost

In [82]:
ada = AdaBoostRegressor()
ada.fit(x_train, y_train)
print("Training RMSLE: ", rmsle(y_train, ada.predict(x_train)))
y_predict = ada.predict(x_test)
save(test["ID"], y_predict, "ada_1.csv") # 2.68

Training RMSLE:  2.680091864163244


##### XGBoost

In [9]:
xgb = XGBRegressor()
xgb.fit(x_train, y_train)
print("Training RMSLE: ", rmsle(y_train, xgb.predict(x_train)))
y_predict = xgb.predict(x_test)
save(test["ID"], y_predict, "xgb_1.csv") # 1.91

Training RMSLE:  1.8393296647239583


##### Light GBM

In [19]:
lgbm = LGBMRegressor()
lgbm.fit(x_train, y_train)
print("Training RMSLE: ", rmsle(y_train, lgbm.predict(x_train))) # 1.41 for default, 1.9 for lr=0.01
y_predict = lgbm.predict(x_test)
save(test["ID"], y_predict, "lgbm_1.csv") # 1.91

  This is separate from the ipykernel package so we can avoid doing imports until


Training RMSLE:  1.4119796674962117
