In [1]:
import numpy as np
import pandas as pd
import pywt
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV

### Wavelet Transform Class.
DictT.dot is a method corresponding to the DWT operator.

In [2]:
class DictT(object):

    def __init__(self, name, level):
            self.name = name
            self.level = level
            self.sizes = []

    def dot(self, mat):

        m = []

        if mat.shape[0] != mat.size:
            for i in xrange(mat.shape[1]):
                c = pywt.wavedec(mat[:, i], self.name, level=self.level)
                self.sizes.append(map(len, c))
                c = np.concatenate(c)
                m.append(c)
            return np.asarray(m).T
        else:
            c = pywt.wavedec(mat, self.name, level=self.level)
            self.sizes.append(map(len, c))
            return np.concatenate(c)

### Import data.

In [5]:
training = pd.read_csv('Data/training_input.csv', delimiter=',')
output = pd.read_csv('Data/training_output.csv', delimiter=';')
testing = pd.read_csv('Data/testing_input.csv', delimiter=',')

In [6]:
training_filled = training.drop(pd.isnull(training).any(1).nonzero()[0]).reset_index(drop=True)
training_filled = pd.merge(training_filled, output, on='ID', how='inner')

x = training_filled.drop(["ID", "date", "product_id", "TARGET"], axis=1).values
y = training_filled["TARGET"].values

In [7]:
print "Full labelled set :\n", x.shape, "\n", y.shape

Full labelled set :
(513947L, 54L) 
(513947L,)


### Create wavelet dictionary.
NB: 'db1' is the classical DFT operator.

In [8]:
wave_name = 'db20'
wave_level = None
wavelet_operator_t = DictT(level=wave_level, name=wave_name)

basis_t = wavelet_operator_t.dot(np.identity(x.shape[1]))
basis_t /= np.sqrt(np.sum(basis_t ** 2, axis=0))
basis = basis_t.T

### Selecting the best model.
Training, validation split the transformed data, using Fourier/Wavelet.

In [9]:
x_train, x_val, y_train, y_val = train_test_split(
    x.dot(basis), y, test_size=0.3)

In [10]:
print "Training set :\n", x_train.shape, "\n", y_train.shape
print "Validation set :\n", x_val.shape, "\n", y_val.shape

Training set :
(359762L, 54L) 
(359762L,)
Validation set :
(154185L, 54L) 
(154185L,)


Supervised learning method.    
_TODO:_ Use GridSearchCV for the best parameters.

In [11]:
%%time

"""
reg_grid = {
    "max_depth": [10, 20, 40, None],
    "max_features": [20, 30, 40, 'auto'],
    "min_samples_split": [1, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "bootstrap": [True, False]}

reg = GridSearchCV(RandomForestRegressor(n_estimators=50, n_jobs=-1),
                   param_grid=reg_grid, n_jobs=-1, verbose=5)
"""

reg = RandomForestRegressor(n_estimators=15, 
                            max_features=20, 
                            min_samples_split=1, 
                            bootstrap=True, 
                            max_depth=20, 
                            min_samples_leaf=1,
                            n_jobs=-1,
                            verbose=5)
reg.fit(x_train, y_train)
y_val_pred = reg.predict(x_val)

# params = reg.best_params_
# print params

[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   31.1s finished
[Parallel(n_jobs=8)]: Done  15 out of  15 | elapsed:    0.3s finished


building tree 1 of 15
building tree 2 of 15
building tree 3 of 15
building tree 4 of 15
building tree 5 of 15
building tree 6 of 15
building tree 7 of 15
building tree 8 of 15
building tree 9 of 15
building tree 10 of 15
building tree 11 of 15
building tree 12 of 15
building tree 13 of 15
building tree 14 of 15
building tree 15 of 15
Wall time: 31.8 s


### Mean Absolute Percentage Error.

In [13]:
def mean_absolute_percentage_error(y_true, y_pred): 

    """
    Note: does not handle mix 1d representation
    if _is_1d(y_true): 
        y_true, y_pred = _check_1d_array(y_true, y_pred)
    """

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

print "MAPE on non missing:", mean_absolute_percentage_error(y_val, y_val_pred)

MAPE : 30.5255365345


### Fill missing values.

In [130]:
#Les donnes manquantes a predire
training_missing = training_temp[~training_temp["ID"].isin(training_filled["ID"].tolist())]
print ('Shape training_missing')
print training_missing.shape

## MEAN OF EACH LABEL GROUPED BY PROD ID ##
training_temp = pd.merge(training, output, on='ID', how='inner')
Mean=training_temp.groupby("product_id").mean()["TARGET"].reset_index()
print Mean.shape
Mean.head()

(352, 2)


Unnamed: 0,product_id,TARGET
0,236,24157732.04392
1,238,5825229.92577
2,242,19649724.759512
3,243,8249545.655585
4,261,30402004.119214


In [125]:
average_target_per_product = training_temp.groupby("product_id").mean()["TARGET"].reset_index()
average_target_per_product.head()
df_pred_missing = pd.merge(training_missing[["product_id","ID"]], 

                           average_target_per_product, on='product_id', how='inner')
df_pred_missing=df_pred_missing.drop(["product_id"],axis=1)
print('Shape df_pred_missing')
df_pred_missing.shape

Shape df_pred_missing


(99273, 2)

### Training Score


In [126]:
df_predicted = pd.DataFrame(y_val_pred)
df_predicted.columns = ["TARGET"]
df_predicted["ID"] = training_filled["ID"]
cols = df_predicted.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_predicted = df_predicted[cols]
df_predicted.head()
df_predicted=df_predicted.append(df_pred_missing)
df_predicted.sort_values(by=["ID"],axis=0,inplace=True)
df_predicted.shape

(253458, 2)

In [120]:
y_pred=df_predicted["TARGET"].values
print y_pred.shape
y_val=output["TARGET"].values
print "MAPE on non missing:",  mean_absolute_percentage_error(y_val, y_pred)

 (253458L,)
MAPE on non missing:

ValueError: operands could not be broadcast together with shapes (613220,) (253458,) 

### Testing Score

In [78]:
testing_filled = testing.drop(pd.isnull(testing).any(1).nonzero()[0]).reset_index(drop=True)
testing_missing = testing[~testing["ID"].isin(testing_filled["ID"].tolist())]
testing_missing.head()

Unnamed: 0,ID,date,product_id,09:30:00,09:35:00,09:40:00,09:45:00,09:50:00,09:55:00,10:00:00,...,13:10:00,13:15:00,13:20:00,13:25:00,13:30:00,13:35:00,13:40:00,13:45:00,13:50:00,13:55:00
8,618565,1885,305,217974.0102,85480.004,83343.0039,59808.0028,64050.0,34095.9984,115182.0,...,46750.0,,65906.0,38268.0,36142.0,29764.0,106450.005,97888.0046,6384.0003,38304.0018
13,618570,1885,389,,,,4623800.0,11895392.4016,4787485.8394,3363008.0,...,253700.0,463934.9843,417501.0141,1034086.9651,1193136.0,1289775.0,573017.0193,905616.0304,1164240.0392,476000.0
26,618583,1885,435,,1893935.9472,1350027.9624,502320.014,1309547.9634,1758900.0,1521672.0426,...,143899.994,252104.9965,352996.0049,129492.0036,208742.0087,136819.0038,281150.9844,375232.0208,1229100.051,600339.0166
28,618585,1885,447,,303665.9893,136607.9952,227840.0,73970.0026,136704.0,51210.0018,...,25686.0009,423280.0,180243.0063,88691.0031,31482.0011,232308.0,129285.0,134890.0047,89000.9969,140924.0
59,618616,1885,839,242339.994,246256.9939,478608.0118,283290.007,97176.0048,230393.9886,161440.004,...,167916.0,87956.0,63952.0016,,31976.0008,15983.9996,151924.0,63984.0032,99950.0,75962.0


For all test rows without any missing values, predict the actual data.

In [17]:
x_test = testing_filled.drop(["ID", "date", "product_id"], axis=1).values
reg.fit(x.dot(basis), y)
y_test = reg.predict(x_test.dot(basis))

[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   48.4s finished
[Parallel(n_jobs=8)]: Done  15 out of  15 | elapsed:    0.7s finished


building tree 1 of 15
building tree 2 of 15
building tree 3 of 15
building tree 4 of 15
building tree 5 of 15
building tree 6 of 15
building tree 7 of 15
building tree 8 of 15
building tree 9 of 15
building tree 10 of 15
building tree 11 of 15
building tree 12 of 15
building tree 13 of 15
building tree 14 of 15
building tree 15 of 15


In [53]:
df_test = pd.DataFrame(y_test)
df_test.columns = ["TARGET"]
df_test["ID"] = testing_filled["ID"]
cols = df_test.columns.tolist()
cols = cols[-1:] + cols[:-1]
df_test = df_test[cols]
df_test.head()
# df_test.to_csv("Submission.csv", sep=";", index=False)

Unnamed: 0,ID,TARGET
0,618557,17557246.736756
1,618558,5377269.929739
2,618559,7799878.91536
3,618560,5163657.08265
4,618561,18556182.437674


For test rows with missing values, just set the output to the average target per product.

In [97]:
df_test_missing = pd.merge(testing_missing[["product_id","ID"]], 

                           average_target_per_product, on='product_id', how='inner')
df_test_missing=df_test_missing.drop(["product_id"],axis=1)
df_test_missing.head()

Unnamed: 0,ID,TARGET
0,618565,5328192.005324
1,618917,5328192.005324
2,621528,5328192.005324
3,622481,5328192.005324
4,623724,5328192.005324


In [76]:
df_submission=df_test.append(df_test_missing)
df_submission.sort_values(by=["ID"],axis=0,inplace=True)
df_submission[1:10]

Unnamed: 0,ID,TARGET
1,618558,5377269.929739
2,618559,7799878.91536
3,618560,5163657.08265
4,618561,18556182.437674
5,618562,6326199.772999
6,618563,35825234.065468
7,618564,52795442.439919
0,618565,4450988.227835
8,618566,22945040.609989
