# Kaggle Challenge
## Santander Value Prediction Challenge
### Predict the value of transactions for potential customers.
#### Data source: https://www.kaggle.com/c/santander-value-prediction-challenge/data

In [14]:
# Import Statements
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
import os

In [None]:
# Make sure the "output" directory (where save() writes its CSV files) exists.
# makedirs(exist_ok=True) is a single idempotent call: there is no gap between
# an existence check and the creation, and re-running this cell is harmless.
os.makedirs("output", exist_ok=True)

In [15]:
def rmsle(actual, predicted):
    """Root mean squared logarithmic error between two equally shaped inputs.

    Computes ``sqrt(mean((log(1 + actual) - log(1 + predicted)) ** 2))`` with
    the mean taken over the first axis.

    Parameters
    ----------
    actual, predicted : array-like
        Ground-truth and predicted values (NumPy arrays, pandas Series or
        plain sequences). Both are converted to float NumPy arrays, so the
        comparison is purely positional and pandas index labels are ignored.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for 1-D inputs. For inputs with more dimensions the mean is
        reduced over axis 0 only, so an array is returned.

    Raises
    ------
    AssertionError
        If the two inputs do not have the same shape.

    Notes
    -----
    The logarithm is undefined for values <= -1; such inputs produce
    NaN/inf results (NumPy emits a runtime warning) rather than an exception.
    """
    # Coerce up front: lists have no ``.shape`` and two Series would otherwise
    # be aligned on their index labels instead of being compared row by row.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    assert actual.shape == predicted.shape, (
        "shape mismatch: actual %s vs predicted %s" % (actual.shape, predicted.shape)
    )
    # log1p(x) is log(1 + x) without the precision loss of forming 1 + x
    # when x is very small.
    log_diff = np.log1p(actual) - np.log1p(predicted)
    return np.sqrt(np.mean(np.square(log_diff), axis=0))

In [16]:
def save(id_col, prediction, filename, output_dir="output"):
    """Write predictions to a two-column CSV file with the header ``ID,target``.

    Parameters
    ----------
    id_col : array-like
        Row identifiers, one per prediction.
    prediction : array-like
        Predicted values; must have the same shape as ``id_col``.
    filename : str
        Name of the file to create inside ``output_dir``.
    output_dir : str, optional
        Directory the file is written to (default ``"output"``). It is created
        if it does not exist, so the function does not depend on another cell
        having been run first.

    Raises
    ------
    AssertionError
        If ``id_col`` and ``prediction`` do not have the same shape.
    """
    id_col = np.asarray(id_col)
    prediction = np.asarray(prediction)
    assert id_col.shape == prediction.shape, (
        "shape mismatch: id_col %s vs prediction %s" % (id_col.shape, prediction.shape)
    )
    # An empty output_dir means "current directory"; makedirs("") would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Stack the two 1-D inputs as rows, then transpose to get one (ID, target)
    # pair per output line.
    rows = np.vstack((id_col, prediction)).T
    # fmt="%s" writes every cell via str(); comments="" keeps the header line
    # free of the default "# " prefix.
    np.savetxt(os.path.join(output_dir, filename), rows, delimiter=',', fmt="%s", header="ID,target", comments="")

In [17]:
# Data
# Read the training table from the local data/ folder and report its
# (rows, columns) shape.
train = pd.read_csv("data/train.csv")
print("Train Dimensions: ", train.shape)

# Same for the test table.
test = pd.read_csv("data/test.csv")
print("Test Dimensions: ", test.shape)

Train Dimensions:  (4459, 4993)
Test Dimensions:  (49342, 4992)


#### Data Manipulation

In [18]:
# Split the training table into a feature matrix and a label vector.
# "ID" is removed by rebinding `train` instead of mutating it in place, and
# errors="ignore" tolerates the column already being gone, so this cell can be
# re-run without raising KeyError. `train` still ends up without "ID".
train = train.drop("ID", axis=1, errors="ignore")
x_train = train.drop("target", axis=1)
y_train = train["target"]

# `test` itself keeps its "ID" column; only the feature matrix drops it.
x_test = test.drop("ID", axis=1)

#### Machine Learning Models

##### Random Forest

In [28]:
# Random forest with 5 trees. random_state pins the bootstrap sampling and
# feature selection so the printed score and the submission file are
# reproducible across runs (42 is an arbitrary fixed seed).
rf = RandomForestRegressor(n_estimators=5, random_state=42)
rf.fit(x_train, y_train)
# NOTE: this score is computed on the same rows the model was fitted on.
# Scores recorded from earlier, unseeded runs:
# 1.034 for default, 1.08 for n_estimators=20, 1.02 for n_estimators=5, 1.038 for n_estimators=3,4
print("Training RMSLE: ", rmsle(y_train, rf.predict(x_train)))
y_predict = rf.predict(x_test)
save(test["ID"], y_predict, "rf_5_n_est_4.csv") # 1.74 for default

Training RMSLE:  1.0104716969109326


##### AdaBoost

In [82]:
# AdaBoost with default hyper-parameters. random_state fixes the resampling
# done during boosting so the score and the submission file are reproducible
# across runs (42 is an arbitrary fixed seed).
ada = AdaBoostRegressor(random_state=42)
ada.fit(x_train, y_train)
# NOTE: this score is computed on the same rows the model was fitted on.
print("Training RMSLE: ", rmsle(y_train, ada.predict(x_train)))
y_predict = ada.predict(x_test)
save(test["ID"], y_predict, "ada_1.csv") # 2.68

Training RMSLE:  2.680091864163244


##### XGBoost

In [9]:
# XGBoost regressor with default hyper-parameters; fit() returns the fitted
# estimator, so construction and training are chained.
xgb = XGBRegressor().fit(x_train, y_train)
# Error measured on the rows the model was trained on.
print(
    "Training RMSLE: ",
    rmsle(y_train, xgb.predict(x_train)),
)
# Predict the test set and write the predictions out via save().
y_predict = xgb.predict(x_test)
save(test["ID"], y_predict, "xgb_1.csv") # 1.91

Training RMSLE:  1.8393296647239583


##### Light GBM

In [19]:
# LightGBM regressor with default hyper-parameters; fit() returns the fitted
# estimator, so construction and training are chained.
lgbm = LGBMRegressor().fit(x_train, y_train)
# Error measured on the rows the model was trained on.
# Recorded scores: 1.41 for default, 1.9 for lr=0.01
print(
    "Training RMSLE: ",
    rmsle(y_train, lgbm.predict(x_train)),
)
# Predict the test set and write the predictions out via save().
y_predict = lgbm.predict(x_test)
save(test["ID"], y_predict, "lgbm_1.csv") # 1.91

  This is separate from the ipykernel package so we can avoid doing imports until


Training RMSLE:  1.4119796674962117
