In [1]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv("train.tsv")
test = pd.read_csv("test.tsv")
sample_submission = pd.read_csv("sample_submission.tsv")

In [24]:
frac = 1

train_part = train.sample(frac=frac, random_state=42)
train_part = train_part.sort_values(['year', 'week'], ascending=[1, 1])

X = train_part.drop(['Num', 'y'], axis=1)
y = train_part['y']

In [4]:
def smape(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / (abs(y_true) + abs(y_pred)))) * 200

In [5]:
def score_model(model, X, y):
    tscv = TimeSeriesSplit(n_splits=5)
    cross_val_score = []
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        cross_val_score.append(smape(y_test, preds))
    cross_val_score = np.array(cross_val_score)
    score = cross_val_score.mean()
    return score

In [25]:
X = X[['f' + str(i) for i in range(21, 31)] + ['year', 'week', 'shift', 'item_id']]

In [18]:
n_estimators = [100, 150, 200, 250]
min_samples_leaf = [2, 3, 4, 5]
min_samples_split = [2, 3, 4]

In [19]:
for es in n_estimators:
    for msl in min_samples_leaf:
        for mss in min_samples_split:
            print(es, msl, mss, score_model(RandomForestRegressor(n_estimators=es, 
                                                                  min_samples_leaf=msl, 
                                                                  min_samples_split=mss), X, y))

100 2 2 29.5829252452
100 2 3 29.5265758624
100 2 4 29.5707088857
100 3 2 29.7331111605
100 3 3 29.6190686033
100 3 4 29.5095409984
100 4 2 29.5954945389
100 4 3 29.5769662138
100 4 4 29.560059956
100 5 2 29.4308822994
100 5 3 29.5974847543
100 5 4 29.5380366453
150 2 2 29.5580318802
150 2 3 29.5565060387
150 2 4 29.6937005442
150 3 2 29.561633144
150 3 3 29.5053672268


KeyboardInterrupt: 

In [13]:
X_test = test[['f' + str(i) for i in range(21, 31)] + ['year', 'week', 'shift', 'item_id']]
X_test.head()

Unnamed: 0,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,year,week,shift,item_id
0,1510.0,580.0,969.0,1635.0,895.0,2140.0,1182.0,1020.0,1293.0,1290.0,2015,3,3,20447918
1,22055.0,14235.0,21195.0,18280.0,18270.0,15851.0,16920.0,18320.0,24116.0,21307.0,2015,3,3,20447902
2,302165.0,162232.0,221622.0,256605.0,240047.0,236630.0,206697.0,245652.0,286179.0,285904.0,2015,3,3,20447732
3,39055.0,14445.0,22450.0,22093.0,31175.0,23355.0,15358.0,18930.0,29643.0,33970.0,2015,3,3,20443951
4,120.0,130.0,60.0,30.0,50.0,20.0,20.0,30.0,0.0,0.0,2015,3,3,20443944


In [26]:
model = RandomForestRegressor(n_estimators=300, min_samples_leaf=5, min_samples_split=2)
model.fit(X, y)
preds = model.predict(X_test)

In [30]:
sample_submission['y'] = preds
sample_submission.head(5)

Unnamed: 0,Num,y
0,348622,1935.971867
1,348623,26301.814167
2,348624,337065.350598
3,348625,30689.245286
4,348626,334.685418


In [31]:
sample_submission['y'] = sample_submission['y'].map(lambda x: x if x > 0 else 0.0)

In [32]:
sample_submission.to_csv("garkavyy_big_random_forest_submission.tsv", sep=',', index=False)