In [40]:
import numpy as np
import pandas as pd
import pywt
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

### Wavelet Transform Class.
`DictT.dot` applies the forward discrete wavelet transform (DWT) to a 1-D signal, or to each column of a matrix.

In [41]:
class DictT(object):
    """Forward (analysis) discrete wavelet transform exposed as a ``dot`` operator.

    Parameters
    ----------
    name : str
        Wavelet name forwarded to ``pywt.wavedec`` (e.g. ``'db1'``).
    level : int or None
        Decomposition level forwarded to ``pywt.wavedec``.

    Attributes
    ----------
    sizes : list of list of int
        One entry per transformed signal, holding the length of each
        coefficient array returned by ``pywt.wavedec``. Entries accumulate
        across successive calls to ``dot``.
    """

    def __init__(self, name, level):
        self.name = name
        self.level = level
        self.sizes = []

    def dot(self, mat):
        """Apply the wavelet decomposition to a signal or to each column of a matrix.

        Parameters
        ----------
        mat : array-like
            Either a 1-D signal, or a 2-D array whose columns are signals.

        Returns
        -------
        numpy.ndarray
            For 1-D input, the concatenated coefficients. For 2-D input, an
            array whose column ``i`` holds the concatenated coefficients of
            ``mat[:, i]``.
        """
        mat = np.asarray(mat)

        # Dispatch on dimensionality rather than on `shape[0] == size`, so an
        # (n, 1) column matrix is handled column-wise like any other matrix.
        if mat.ndim == 1:
            return self._transform(mat)

        # `range` (not `xrange`) keeps the loop valid on Python 2 and Python 3.
        columns = [self._transform(mat[:, i]) for i in range(mat.shape[1])]
        return np.asarray(columns).T

    def _transform(self, signal):
        """Decompose one 1-D signal, record its band sizes, return flat coefficients."""
        coeffs = pywt.wavedec(signal, self.name, level=self.level)
        # Store a real list: on Python 3 `map(len, ...)` would be a one-shot iterator.
        self.sizes.append([len(band) for band in coeffs])
        return np.concatenate(coeffs)

### Import data.

In [42]:
# Load the raw competition files (paths are relative to the working directory).
# Note that the output file is semicolon-separated, unlike the two input files.
training = pd.read_csv('Data/training_input.csv', sep=',')
output = pd.read_csv('Data/training_output.csv', sep=';')
testing = pd.read_csv('Data/testing_input.csv', sep=',')

TODO: Should the training input also be interpolated, rather than dropping the rows that contain missing values?

In [43]:
# Fill missing values in the test set by interpolating along each row (axis=1),
# presumably so that every test row can be scored instead of being discarded.
# NOTE(review): the identifier columns (ID, date, product_id) are still part of
# the frame at this point -- confirm pandas leaves them untouched -- and gaps at
# the start of a row may remain NaN after interpolation; verify before predicting.
# NOTE(review): `testing` is overwritten, so the raw frame is no longer available.
testing = testing.interpolate(axis=1)

In [44]:
# Keep only the fully observed training rows. `dropna()` removes every row that
# contains at least one missing value; it replaces the former
# `drop(isnull(...).any(1).nonzero()[0])` chain, which relied on pandas APIs that
# have since been removed and on the index being the default 0..n-1 range.
training = training.dropna().reset_index(drop=True)

# Attach the target to each remaining row; the inner join keeps only the IDs
# present in both frames.
# NOTE(review): `training` is overwritten, so re-running this cell without
# reloading the data merges `output` a second time.
training = pd.merge(training, output, on='ID', how='inner')

# Feature matrix: every column except the identifiers and the target.
x = training.drop(["ID", "date", "product_id", "TARGET"], axis=1).values
y = training["TARGET"].values

In [45]:
# A single pre-formatted string prints the same text under Python 2 and Python 3
# (the former print statement is a SyntaxError on Python 3).
print("Full labelled set :\n%s \n%s" % (x.shape, y.shape))

Full labelled set :
(513947, 54) 
(513947,)


### Create wavelet dictionary.
NB: 'db1' is the Haar wavelet (the simplest Daubechies wavelet), not a Fourier transform.

In [46]:
# Wavelet family and decomposition depth handed to the transform operator.
# NOTE(review): with level=None the depth is chosen by pywt from the signal and
# filter lengths; for 54-sample rows and a long filter such as 'db17' this may
# resolve to level 0 (no decomposition at all) -- confirm with pywt.dwt_max_level.
wave_name = 'db17'
wave_level = None
wavelet_operator_t = DictT(name=wave_name, level=wave_level)

# Transforming the identity matrix column by column yields the matrix of the
# operator itself; each column is then rescaled to unit Euclidean norm.
basis_t = wavelet_operator_t.dot(np.eye(x.shape[1]))
basis_t = basis_t / np.sqrt(np.square(basis_t).sum(axis=0))

# Transposed so that `x.dot(basis)` transforms the rows of a data matrix.
basis = basis_t.transpose()

### Selecting the best model.
Split the transformed (Fourier/wavelet) data into training and validation sets.

In [51]:
# Project the features onto the wavelet basis, then hold out 20% of the rows
# for validation. The fixed seed keeps the partition identical between runs.
x_train, x_val, y_train, y_val = train_test_split(
    np.dot(x, basis), y, test_size=0.2, random_state=0)

In [52]:
# Pre-formatted strings print the same text under Python 2 and Python 3
# (the former print statements are a SyntaxError on Python 3).
print("Training set :\n%s \n%s" % (x_train.shape, y_train.shape))
print("Validation set :\n%s \n%s" % (x_val.shape, y_val.shape))

Training set :
(411157, 54) 
(411157,)
Validation set :
(102790, 54) 
(102790,)


Supervised learning method.    
_TODO:_ Use GridSearchCV for the best parameters.

In [53]:
%%time

"""
reg_grid = {
    "max_depth": [10, 20, 40, None],
    "max_features": [20, 30, 40, 'auto'],
    "min_samples_split": [1, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "bootstrap": [True, False]}

reg = GridSearchCV(RandomForestRegressor(n_estimators=50, n_jobs=-1),
                   param_grid=reg_grid, n_jobs=-1, verbose=5)
"""

reg = RandomForestRegressor(n_estimators=15, 
                            max_features=20, 
                            min_samples_split=1, 
                            bootstrap=True, 
                            max_depth=20, 
                            min_samples_leaf=1,
                            n_jobs=-1,
                            verbose=5)
reg.fit(x_train, y_train)
y_val_pred = reg.predict(x_val)

# params = reg.best_params_
# print params

[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.4min finished
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.3s finished


building tree 1 of 15
building tree 2 of 15
building tree 3 of 15
building tree 4 of 15
building tree 5 of 15
building tree 6 of 15
building tree 7 of 15
building tree 8 of 15
building tree 9 of 15
building tree 10 of 15
building tree 11 of 15
building tree 12 of 15
building tree 13 of 15
building tree 14 of 15
building tree 15 of 15
CPU times: user 4min 9s, sys: 1.21 s, total: 4min 10s
Wall time: 1min 26s


### Mean Absolute Percentage Error.

In [55]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Reference values. Zeros are not guarded against: they yield ``inf``
        or ``nan`` following numpy's division rules.
    y_pred : array-like
        Predicted values. May be a scalar, or an array that broadcasts
        against ``y_true``.

    Returns
    -------
    float
        The MAPE expressed as a percentage.
    """
    # Cast to float arrays: accepts lists and avoids integer (floor) division
    # on Python 2 when both inputs hold integers.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Mixed vector layouts, e.g. (n,) against (n, 1), would otherwise broadcast
    # to an (n, n) matrix and silently average over every pair of elements.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        flat_true, flat_pred = np.squeeze(y_true), np.squeeze(y_pred)
        if flat_true.ndim <= 1 and flat_pred.ndim <= 1:
            y_true, y_pred = flat_true.ravel(), flat_pred.ravel()

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# A pre-formatted string prints the same text under Python 2 and Python 3
# (the former print statement is a SyntaxError on Python 3).
print("MAPE : %s" % mean_absolute_percentage_error(y_val, y_val_pred))

MAPE : 30.9492386531


### Score the test dataset.

In [None]:
# Test features: every column except the identifiers.
x_test = testing.drop(["ID", "date", "product_id"], axis=1).values

# Refit on the full labelled set (training + validation rows) before predicting.
reg.fit(x.dot(basis), y)
y_test = reg.predict(x_test.dot(basis))

# Build the submission frame. The original cell created `df_test` but then used
# an undefined `df`, which raised a NameError before anything was written.
# NOTE(review): IDs are generated as 1..n in row order, as the original intended;
# confirm this matches the `ID` column of `testing`.
df_test = pd.DataFrame(
    {"ID": np.arange(1, len(y_test) + 1), "TARGET": y_test},
    columns=["ID", "TARGET"])
df_test.to_csv("Submission.csv", index=False)