In [1]:
import pandas as pd
import numpy as np

import gzip
import cPickle as pickle
import glob
import csv

import lightgbm as lgb
import sklearn

from sklearn.preprocessing import LabelEncoder, StandardScaler, normalize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, StandardScaler
from scipy.sparse import coo_matrix, csr_matrix, load_npz

import calendar
import datetime, time
import sys, os

import matplotlib
import matplotlib.pyplot as plt
from IPython.display import clear_output
% matplotlib inline

In [2]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.autograd import Variable
from torch.utils.data import DataLoader
import torch.optim as optim

import torchsample
from torchsample import TensorDataset
from torchsample.modules import ModuleTrainer
from torchsample.callbacks import EarlyStopping, ReduceLROnPlateau
from torchsample.regularizers import L1Regularizer, L2Regularizer
from torchsample.constraints import UnitNorm
from torchsample.initializers import XavierUniform



In [3]:
in_dir = "/home/data/kaggle-zillow/processed/"

out_dir = "/home/data/kaggle-zillow/submissions/"

if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Load formatted data

In [4]:
# training data

X = load_npz(in_dir + "/X_train.npz").tocsr()
y = np.load(in_dir + "/y_train.npz")['arr_0']

X_test = load_npz(in_dir + "/X_test.npz").tocsr()

with np.load(in_dir + "/features.npz") as data:
    features, features_dum = data['arr_0'][0], data['arr_0'][1]

In [5]:
# only retain important features as identified in prior analysis using gradient boosting
# this is to reduce computational complexity, memory footprint, and variance

RETAIN_TOP_FEATURES = 35

if RETAIN_TOP_FEATURES:
    features_imp = pd.read_csv(out_dir + "/feature_importance.csv")
    features_imp.sort_values("importance", inplace=True, ascending=False)
    features = features_imp['feature'][:RETAIN_TOP_FEATURES]
    idx_sel = [i for i,f in enumerate(features_dum)\
                   if len([x for x in features if x in f])>0]
    features_dum = [features_dum[i] for i in idx_sel]
    X, X_test = X[:,idx_sel], X_test[:,idx_sel]
    
    print "Retaining %d important features" % len(features)

Retaining 35 important features


In [6]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, 
                                        test_size=0.1, random_state=42)

print X.shape, X_train.shape, X_eval.shape

(90275, 1443) (81247, 1443) (9028, 1443)


In [7]:
ulimit = np.percentile(y_train, 99.5)
llimit = np.percentile(y_train, 0.5)
idx = (y_train > llimit) & (y_train < ulimit)
X_train = X_train[idx,:]
y_train = y_train[idx]

In [8]:
X_train.shape, X_test.shape

((80430, 1443), (2985217, 1443))

In [9]:
# standardize by dividing by standard deviation

std_scaler = StandardScaler(with_mean=False)
X_train = std_scaler.fit_transform(X_train)
X_eval = std_scaler.transform(X_eval)

# X_train = normalize(X_train, norm='l2', axis=1)

# Train a MLP in Pytorch

#### Set up data loader

In [10]:
batch_size = 256

X_train_tn = torch.from_numpy(X_train.todense()).double()
y_train_tn = torch.from_numpy(y_train).double()
X_eval_tn  = torch.from_numpy(X_eval.todense()).double()
y_eval_tn  = torch.from_numpy(y_eval).double()

train_dataset = TensorDataset(X_train_tn, y_train_tn)
train_loader = DataLoader(train_dataset, 
                          batch_size=batch_size, 
                          shuffle=True)

eval_dataset = TensorDataset(X_eval_tn, y_eval_tn)
eval_loader = DataLoader(eval_dataset, 
                         batch_size=batch_size, 
                         shuffle=True)

#### Define model and optimization

In [11]:
def define_mlp(input_size, hidden_sizes=[256,256]):
    D = nn.Sequential(
    nn.Linear(input_size, 128),
    nn.LeakyReLU(0.2),
#     nn.Linear(128, 256),
#     nn.LeakyReLU(0.2),
#     nn.Linear(256, 128),
#     nn.LeakyReLU(0.2),
#     nn.Linear(128, 128),
#     nn.LeakyReLU(0.2),
    nn.Dropout(0.2),
    nn.Linear(128, 1))
    return D
    
    
input_size = X_test.shape[1]
learning_rate = 0.001
    
model = define_mlp(input_size)
model = model.double()

criterion = nn.L1Loss()

#### Train model

In [12]:
if torch.has_cudnn:
    print("Transferring to GPU")
    X_train_tn = X_train_tn.cuda()
    y_train_tn = y_train_tn.cuda()
    X_eval_tn = X_eval_tn.cuda()
    y_eval_tn = y_eval_tn.cuda()
    model = model.cuda()
    criterion = criterion.cuda()


Transferring to GPU


In [14]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in xrange(20):
    # trainning
    for batch_idx, (x, target) in enumerate(train_loader):
        optimizer.zero_grad()
        x, target = Variable(x.cuda()), Variable(target.cuda())
        score = model(x)
        loss = criterion(score, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            # clear_output(wait=True)
            msg = '>>> epoch: {}, batch {}, train error: {:.6f}\r'.format(epoch, batch_idx, loss.data[0])
            print msg
    # testing
    ave_err = 0
    for batch_idx, (x, target) in enumerate(eval_loader):
        x, target = Variable(x.cuda(), volatile=True), Variable(target.cuda(), volatile=True)
        score = model(x)
        err = criterion(score, target)
        ave_err += err.data[0]
    ave_err /= len(eval_loader)
    msg = '>>> epoch: {}, test error: {:.6f}\r'.format(epoch, ave_err)
    print msg



>>> epoch: 0, batch 0, train error: 1.102054
>>> epoch: 0, batch 100, train error: 0.111045
>>> epoch: 0, batch 200, train error: 0.082850
>>> epoch: 0, batch 300, train error: 0.070353
>>> epoch: 0, test error: 0.086160
>>> epoch: 1, batch 0, train error: 0.073622
>>> epoch: 1, batch 100, train error: 0.062475
>>> epoch: 1, batch 200, train error: 0.063458
>>> epoch: 1, batch 300, train error: 0.075653
>>> epoch: 1, test error: 0.073471
>>> epoch: 2, batch 0, train error: 0.061452
>>> epoch: 2, batch 100, train error: 0.054480
>>> epoch: 2, batch 200, train error: 0.056791
>>> epoch: 2, batch 300, train error: 0.055529
>>> epoch: 2, test error: 0.071338
>>> epoch: 3, batch 0, train error: 0.062262
>>> epoch: 3, batch 100, train error: 0.063259
>>> epoch: 3, batch 200, train error: 0.053788
>>> epoch: 3, batch 300, train error: 0.053160
>>> epoch: 3, test error: 0.070481
>>> epoch: 4, batch 0, train error: 0.056448
>>> epoch: 4, batch 100, train error: 0.055212
>>> epoch: 4, batch 200,

# Make predictions & format submission

In [None]:
i_nonz, j_nonz = X_test.nonzero()

In [None]:
v_nonz = torch.DoubleTensor([X_test[i,j] for i,j in zip(i_nonz, j_nonz)])

In [None]:
X_test_sp = torch.sparse.DoubleTensor(X_test.size())

# Experiments using torchsample

In [None]:
test_dataset = TensorDataset(X_eval_tn, y_eval_tn)
test_loader = DataLoader(eval_dataset, 
                         batch_size=batch_size, 
                         shuffle=False)


In [None]:
model(X_test)

In [None]:
trainer = ModuleTrainer(model)


callbacks = [EarlyStopping(patience=10),
             ReduceLROnPlateau(factor=0.5, patience=5)]
# regularizers = [L1Regularizer(scale=1e-3, module_filter='*'),
#                 L2Regularizer(scale=1e-5, module_filter='*')]
constraints = [UnitNorm(frequency=3, unit='batch', module_filter='*')]
initializers = [XavierUniform(bias=False, module_filter='*')]
# metrics = [nn.L1Loss()]

trainer.compile(loss=criterion,
                optimizer='adam',
                regularizers=regularizers,
                # constraints=constraints,
                # metrics=metrics,
                initializers=initializers,
                callbacks=callbacks)

trainer.fit_loader(train_loader, eval_loader, num_epoch=20, verbose=1)