# In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from pylightgbm.models import GBMRegressor
from IPython.display import display
import json
import pickle
import sys

# Column names in the input CSV files.
ID = 'id'
TARGET = 'loss'

# Cross-validation / reproducibility settings.
NFOLDS = 5
SEED = 0

# Row limit handed to pd.read_csv (None reads every row).
NROWS = None

# Input locations, all relative to DATA_DIR.
DATA_DIR = "../../input"
TRAIN_FILE = "/".join((DATA_DIR, "train.csv"))
TEST_FILE = "/".join((DATA_DIR, "test.csv"))
SUBMISSION_FILE = "/".join((DATA_DIR, "sample_submission.csv"))

# When True, the prepared arrays are read from data.pkl instead of being
# rebuilt from the CSV files.
USE_PICKLED = True

# Prepare x_train / x_test / y_train (plus the row counts ntrain / ntest).
# Either rebuild them from the CSV files and cache the result in data.pkl,
# or read the cached arrays back when USE_PICKLED is set.
if not USE_PICKLED:
    print("Loading training data from {}".format(TRAIN_FILE))
    train = pd.read_csv(TRAIN_FILE, nrows=NROWS)
    print("Loading test data from {}".format(TEST_FILE))
    test = pd.read_csv(TEST_FILE, nrows=NROWS)

    # Pull the target out as a plain ndarray before the column is dropped.
    # `.values` is used instead of the deprecated `Series.ravel()`.
    y_train = train[TARGET].values

    train.drop([ID, TARGET], axis=1, inplace=True)
    test.drop([ID], axis=1, inplace=True)

    print("Data shapes: Train = {}, Test = {}".format(train.shape, test.shape))

    ntrain = train.shape[0]
    ntest = test.shape[0]
    # Stack train on top of test so categorical columns are encoded with one
    # shared set of integer codes.
    train_test = pd.concat((train, test)).reset_index(drop=True)

    features = train.columns

    # Every column whose name contains 'cat' is replaced by its integer codes
    # (sort=True factorizes over the sorted unique values).
    cats = [feat for feat in features if 'cat' in feat]
    for feat in cats:
        train_test[feat] = pd.factorize(train_test[feat], sort=True)[0]

    print("Head ( train_test ) : ")
    print(train_test.head())

    # The first ntrain rows are the training set, the remainder the test set.
    x_train = np.array(train_test.iloc[:ntrain, :])
    x_test = np.array(train_test.iloc[ntrain:, :])
    with open('data.pkl', 'wb') as pkl_file:
        pickle.dump((x_train, x_test, y_train), pkl_file)
else:
    # NOTE(security): pickle.load can execute arbitrary code; data.pkl must
    # only ever be a file produced by the branch above.
    with open('data.pkl', 'rb') as pkl_file:
        (x_train, x_test, y_train) = pickle.load(pkl_file)
    # Row counts are derived after the file has been closed.
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]


# Fold iterator over the ntrain training rows; get_oof iterates it to obtain
# (train_index, test_index) pairs. shuffle=False, so rows are not permuted.
# NOTE(review): random_state is passed although shuffle is disabled; it
# presumably has no effect here -- confirm against the sklearn version in use.
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=False, random_state=SEED)


class SklearnWrapper(object):
    """Adapter exposing a ``train``/``predict`` API around an estimator class.

    Args:
        clf: Estimator class (not an instance). It is instantiated as
            ``clf(**params)`` with ``random_state`` set to ``seed``.
        seed: Value stored under the ``random_state`` keyword.
        params: Optional mapping of keyword arguments for ``clf``. It is
            copied, so the caller's dict is never modified. ``None`` means
            no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a private copy: the default params=None must not crash and
        # the injected seed must not leak back into the caller's dict.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)


class XgbWrapper(object):
    """Adapter exposing a ``train``/``predict`` API around ``xgb.train``.

    Args:
        seed: Value stored under the ``'seed'`` key of the booster parameters.
        params: Optional mapping of booster parameters. An ``'nrounds'``
            entry, if present, is taken out and used as the number of
            boosting rounds (default 250). The mapping is copied, so the
            caller's dict is never modified and can be reused for further
            wrappers.
    """

    def __init__(self, seed=0, params=None):
        # Copy first: popping 'nrounds' from a shared dict would make the next
        # wrapper built from it silently fall back to the default round count.
        self.param = dict(params) if params is not None else {}
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a booster on ``x_train`` / ``y_train`` for ``nrounds`` rounds."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))


class LightgbmWrapper(object):
    """Adapter exposing a ``train``/``predict`` API around ``GBMRegressor``.

    When ``params['application']`` equals ``"regression"`` the model is fit on
    ``log1p`` of the target and predictions are mapped back with ``expm1``;
    for any other value the target is used unchanged.

    Args:
        seed: Recorded under ``self.params['seed']`` after the model has been
            built; it is not forwarded to ``GBMRegressor``.
        params: Optional mapping of keyword arguments for ``GBMRegressor``.
            It is copied, so the caller's dict is never modified and can be
            reused for further wrappers.
    """

    def __init__(self, seed=0, params=None):
        # Private copy: writing 'seed' into a shared dict would forward it to
        # GBMRegressor the next time the same dict is used for a wrapper.
        params = dict(params) if params is not None else {}
        # The model is built before the seed is recorded, as in the original
        # ordering. NOTE(review): presumably GBMRegressor takes no 'seed'
        # keyword -- confirm before forwarding it.
        self.clf = GBMRegressor(**params)
        self.params = params
        self.params['seed'] = seed

    def _fits_log_target(self):
        """Return True when the target is modelled in log1p space."""
        return self.params['application'] == "regression"

    def train(self, x_train, y_train):
        """Fit the model, log1p-transforming the target for regression."""
        target = np.log1p(y_train) if self._fits_log_target() else y_train
        self.clf.fit(x_train, target)

    def predict(self, x):
        """Return predictions for ``x`` on the original target scale."""
        raw = self.clf.predict(x)
        return np.expm1(raw) if self._fits_log_target() else raw


def get_oof(clf):
    """Compute out-of-fold predictions for ``clf`` over the folds in ``kf``.

    For every fold the model is retrained on the fold's training rows, then
    used to predict the held-out rows of ``x_train`` and all of ``x_test``.

    Args:
        clf: Object offering ``train(x, y)`` and ``predict(x)``.

    Returns:
        A pair of column vectors: the held-out predictions for the training
        rows, shaped ``(ntrain, 1)``, and the test predictions averaged over
        the folds, shaped ``(ntest, 1)``.
    """
    train_preds = np.zeros((ntrain,))
    # One row of test-set predictions per fold; every row is overwritten
    # inside the loop before the average is taken.
    fold_test_preds = np.empty((NFOLDS, ntest))

    for fold_no, (fit_rows, holdout_rows) in enumerate(kf):
        clf.train(x_train[fit_rows], y_train[fit_rows])

        train_preds[holdout_rows] = clf.predict(x_train[holdout_rows])
        fold_test_preds[fold_no, :] = clf.predict(x_test)

    # Average the per-fold test predictions into a single vector.
    test_preds = np.zeros((ntest,))
    test_preds[:] = fold_test_preds.mean(axis=0)

    return train_preds.reshape(-1, 1), test_preds.reshape(-1, 1)
