In [1]:
import numpy as np
import pandas as pd
import pywt
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

### Wavelet Transform Class.
DictT.dot is a method corresponding to the DWT operator.

In [17]:
class DictT(object):
    """Discrete wavelet transform (DWT) operator built on ``pywt.wavedec``.

    ``name`` and ``level`` are forwarded unchanged to ``pywt.wavedec``.
    ``sizes`` collects, for every signal decomposed so far, the list of
    coefficient-band lengths returned by ``pywt.wavedec``; it keeps growing
    across calls to ``dot`` and is never cleared.
    """

    def __init__(self, name, level):
        self.name = name
        self.level = level
        self.sizes = []

    def dot(self, mat):
        """Apply the wavelet decomposition to a vector or to each column of a matrix.

        Parameters
        ----------
        mat : numpy.ndarray
            A 1-D signal, or a 2-D array whose columns are independent signals.

        Returns
        -------
        numpy.ndarray
            For 1-D input, the concatenated coefficient bands. For 2-D input,
            a matrix whose column ``i`` holds the concatenated coefficients of
            ``mat[:, i]``.
        """
        # Dispatch on the number of dimensions rather than comparing
        # shape[0] with size: that comparison mistook an (n, 1) column
        # matrix for a 1-D signal and decomposed it along its length-1 axis.
        if mat.ndim > 1:
            # range / list comprehension (instead of xrange / map) behave the
            # same on Python 2 and keep the class usable on Python 3.
            columns = [self._decompose(mat[:, i]) for i in range(mat.shape[1])]
            return np.asarray(columns).T
        return self._decompose(mat)

    def _decompose(self, signal):
        """Decompose one signal, record its band sizes, return flat coefficients."""
        coeffs = pywt.wavedec(signal, self.name, level=self.level)
        self.sizes.append([len(band) for band in coeffs])
        return np.concatenate(coeffs)

### Import data.

In [18]:
# Load the raw CSV files. The training inputs and the test inputs are
# comma-separated, whereas the training output file is semicolon-separated.
train = pd.read_csv('Data/training_input.csv', sep=',')
output = pd.read_csv('Data/training_output.csv', sep=';')
test = pd.read_csv('Data/testing_input.csv', sep=',')

Split the rows into those with no missing entry and those with at least one missing entry.

In [19]:
# Attach the outputs to the training inputs (inner join on ID).
# NOTE: this rebinds `train`, so the cell is not idempotent -- re-running it
# without reloading the data merges `output` a second time.
train = pd.merge(train, output, on='ID', how='inner')

# Boolean mask of rows with at least one missing entry. The previous
# `.any(1).nonzero()[0]` idiom relied on Series.nonzero (gone from recent
# pandas) and passed positions to the label-based `drop`.
has_missing = train.isnull().any(axis=1)

# Complete rows, re-indexed from 0; incomplete rows keep their original index.
train_filled = train[~has_missing].reset_index(drop=True)
train_missing = train[has_missing]

Training dataset for our supervised learning method.

In [20]:
# Feature matrix: every column of the complete rows except the identifiers
# and the target itself. Target vector: the TARGET column.
non_feature_columns = ["ID", "date", "product_id", "TARGET"]
x = train_filled.drop(non_feature_columns, axis=1).values
y = train_filled["TARGET"].values

In [21]:
# Sanity check: shapes of the feature matrix and of the target vector built
# from the complete training rows (Python 2 print statement).
print "Full labelled set :\n", x.shape, "\n", y.shape

Full labelled set :
(513947, 54) 
(513947,)


Average target per product — this will be used when controlling the error of predictions for missing data.

In [22]:
# Mean TARGET per product, computed over all merged training rows (rows with
# missing features included). Selecting TARGET *before* aggregating avoids
# averaging every other column, which is wasted work and fails on non-numeric
# columns with recent pandas versions.
average_target_per_product = (
    train.groupby("product_id")["TARGET"].mean().reset_index()
)
average_target_per_product.head()

Unnamed: 0,product_id,TARGET
0,236,24157732.04392
1,238,5825229.92577
2,242,19649724.759512
3,243,8249545.655585
4,261,30402004.119214


### Create wavelet dictionary.
NB: 'db1' is the Haar wavelet (the simplest Daubechies wavelet), not a Fourier transform.

In [23]:
# Wavelet family and decomposition level; both are handed to pywt.wavedec
# through DictT without modification.
wave_name = 'db20'
wave_level = None
wavelet_operator_t = DictT(name=wave_name, level=wave_level)

# Decomposing the identity matrix column by column gives the matrix of the
# transform: column i is the decomposition of the i-th unit vector.
# NOTE(review): with only x.shape[1] samples per signal and a long filter,
# pywt may pick a very shallow decomposition -- confirm that the resulting
# transform is not (close to) the identity.
identity = np.identity(x.shape[1])
basis_t = wavelet_operator_t.dot(identity)

# Scale every column to unit Euclidean norm.
column_norms = np.sqrt(np.sum(basis_t ** 2, axis=0))
basis_t /= column_norms

# Features are projected later as x.dot(basis).
basis = basis_t.T

### Selecting the best model.
Split the transformed data (Fourier/wavelet coefficients) into training and validation sets.

In [129]:
# 70 % / 30 % split of the transformed features. The seed is fixed so that
# the split -- and every score derived from it -- is the same on each run.
SPLIT_SEED = 0
x_train, x_val, y_train, y_val = train_test_split(
    x.dot(basis), y, test_size=0.3, random_state=SPLIT_SEED)

In [130]:
# Sanity check: shapes of the training and validation partitions
# (Python 2 print statements).
print "Training set :\n", x_train.shape, "\n", y_train.shape
print "Validation set :\n", x_val.shape, "\n", y_val.shape

Training set :
(359762, 54) 
(359762,)
Validation set :
(154185, 54) 
(154185,)


Supervised learning method.    
_TODO:_ Use GridSearchCV for the best parameters.

In [16]:
%%time

"""
Hyper-parameter search kept for the GridSearchCV TODO. This bare string is
never executed; min_samples_split starts at 2 because 1 is not a valid value
in current scikit-learn.

reg_grid = {
    "max_depth": [10, 20, 40, None],
    "max_features": [20, 30, 40, 'auto'],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "bootstrap": [True, False]}

reg = GridSearchCV(RandomForestRegressor(n_estimators=50, n_jobs=-1),
                   param_grid=reg_grid, n_jobs=-1, verbose=5)
"""

# min_samples_split=2 replaces the former value 1: a node holding a single
# sample cannot be split anyway, and current scikit-learn rejects 1.
# random_state pins the bootstrap samples and the feature sub-sampling so the
# validation score is reproducible.
reg = RandomForestRegressor(n_estimators=15,
                            max_features=20,
                            min_samples_split=2,
                            bootstrap=True,
                            max_depth=20,
                            min_samples_leaf=1,
                            n_jobs=-1,
                            random_state=0,
                            verbose=5)
reg.fit(x_train, y_train)

# Predictions on the held-out validation set, scored further down.
y_val_pred = reg.predict(x_val)

# params = reg.best_params_
# print params

CPU times: user 43 µs, sys: 3 µs, total: 46 µs
Wall time: 47 µs


### Mean Absolute Percentage Error.

In [71]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Reference values. A zero entry makes the result ``inf`` or ``nan``.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        The MAPE expressed as a percentage.
    """
    # Accept lists as well as arrays, and force floating-point arithmetic so
    # that integer inputs are not floor-divided under Python 2.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Mixed 1-D / column-vector inputs, e.g. (n,) against (n, 1), would
    # otherwise broadcast into an (n, n) matrix and return a meaningless
    # score. Same element count but different shapes: flatten both.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Validation error of the fitted model, in percent (Python 2 print statement).
print "MAPE :", mean_absolute_percentage_error(y_val, y_val_pred)

### Score the test dataset.

If a test row contains no missing entry, use the supervised learning method `reg`.    
Otherwise, simply output the average target per product.

In [44]:
# Boolean mask of test rows with at least one missing entry. The previous
# `.any(1).nonzero()[0]` idiom relied on Series.nonzero (gone from recent
# pandas) and passed positions to the label-based `drop`.
test_has_missing = test.isnull().any(axis=1)

# Complete rows, re-indexed from 0; incomplete rows keep their original index.
test_filled = test[~test_has_missing].reset_index(drop=True)
test_missing = test[test_has_missing]

In [104]:
# Refit the regressor on the whole labelled set (training + validation rows)
# before scoring the test data.
reg.fit(np.dot(x, basis), y)

For all test rows without any missing values, predict the actual data.

In [123]:
def predict_filled_values(filled_dataframe, reg, dictionary=None):
    """Predict TARGET for rows that have no missing feature.

    Parameters
    ----------
    filled_dataframe : pandas.DataFrame
        Rows without missing entries. Must contain the columns "ID", "date"
        and "product_id"; every other column is used as a feature.
    reg : object
        Fitted estimator exposing ``predict``.
    dictionary : numpy.ndarray, optional
        Matrix the features are multiplied by before prediction. Defaults to
        the notebook-level ``basis``.

    Returns
    -------
    pandas.DataFrame
        Columns ["ID", "TARGET"], one row per input row, in input order.
    """
    projection = basis if dictionary is None else dictionary

    features = filled_dataframe.drop(["ID", "date", "product_id"], axis=1).values
    predictions = np.asarray(reg.predict(features.dot(projection))).ravel()

    # Take the IDs positionally (.values). Assigning the Series itself aligns
    # on index labels and yields wrong or NaN IDs whenever the input frame
    # does not carry a fresh 0..n-1 index.
    return pd.DataFrame({"ID": filled_dataframe["ID"].values,
                         "TARGET": predictions},
                        columns=["ID", "TARGET"])

For test rows with missing values, just set the output to the average target per product.

In [112]:
def predict_missing_values(missing_dataframe, averages=None):
    """Predict TARGET for incomplete rows as the average target of their product.

    Parameters
    ----------
    missing_dataframe : pandas.DataFrame
        Rows with missing entries. Must contain the columns "ID" and
        "product_id".
    averages : pandas.DataFrame, optional
        Lookup table with columns "product_id" and "TARGET". Defaults to the
        notebook-level ``average_target_per_product``.

    Returns
    -------
    pandas.DataFrame
        Columns ["ID", "TARGET"], one row per input row, in input order.
        TARGET is NaN for a product that is absent from the lookup table.
    """
    lookup = average_target_per_product if averages is None else averages

    # Left join: an inner join silently dropped every row whose product had
    # no known average, so those IDs vanished from the final result. Only ID
    # and product_id are taken from the input so that an existing TARGET
    # column cannot collide with the looked-up one.
    merged = pd.merge(missing_dataframe[["ID", "product_id"]],
                      lookup[["product_id", "TARGET"]],
                      on="product_id", how="left")
    return merged[["ID", "TARGET"]].reset_index(drop=True)

Concatenate predictions to obtain the final result.

In [113]:
def predict_all_values(filled, missing, reg):
    """Combine model and fallback predictions into one frame ordered by ID.

    ``filled`` rows are scored by ``predict_filled_values`` with ``reg``;
    ``missing`` rows are scored by ``predict_missing_values``. The two results
    are stacked row-wise and sorted by "ID"; index labels are left as produced
    by the concatenation.
    """
    model_predictions = predict_filled_values(filled, reg)
    fallback_predictions = predict_missing_values(missing)

    stacked = pd.concat([model_predictions, fallback_predictions], axis=0)
    return stacked.sort_values(by="ID")

In [114]:
# Score every test row: model predictions for complete rows, per-product
# averages for incomplete ones.
y_test = predict_all_values(filled=test_filled, missing=test_missing, reg=reg)
y_test.head()

[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    1.5s finished


Unnamed: 0,ID,TARGET
0,618557,20148047.066664
1,618558,4388666.535405
2,618559,7456074.494341
3,618560,5233019.985268
4,618561,15595865.176292


In [None]:
# y_test.to_csv("Submission.csv", sep=";", index=False)