In [21]:
import pandas as pd
import numpy as np

In [22]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [23]:
# Load the prepared training frame from a pickle stored under ./input.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
train_master = pd.read_pickle('./input/train_2.pkl')

In [24]:
# Show the index range, per-column dtypes and memory footprint of the loaded frame.
train_master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34516800 entries, 3 to 37752749
Data columns (total 10 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   shop_id                 int8   
 1   item_id                 int16  
 2   date_block_num          int8   
 3   month                   int8   
 4   item_cnt_month          float64
 5   item_cat                int8   
 6   item_cnt_minus_1        float64
 7   mnth_med_price_minus_1  float64
 8   item_cnt_mean           float64
 9   mnth_med_price_mean     float64
dtypes: float64(5), int16(1), int8(4)
memory usage: 1.7 GB


## Train a random forest regressor and measure baseline performance

### Declare training, validation, and test sets

In [25]:
# Boolean row masks keyed on date_block_num: block 33 is held out for
# validation and block 34 for test.
mask_valid = train_master['date_block_num'].eq(33)
mask_test = train_master['date_block_num'].eq(34)

In [26]:
# Rows outside both hold-out blocks form the training frame. Each split is
# copied so that it is independent of train_master.
xset = train_master.loc[~mask_valid & ~mask_test].copy()
vset = train_master.loc[mask_valid].copy()
tset = train_master.loc[mask_test].copy()

In [27]:
def getDataAndLabel(df, label_col='item_cnt_month', clip_min=0, clip_max=20):
    """Split a frame into a feature frame and a clipped target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the target column.
    label_col : str, default 'item_cnt_month'
        Name of the target column to separate from the features.
    clip_min, clip_max : scalar, default 0 and 20
        Bounds the target is clamped to before it is returned.

    Returns
    -------
    (data, label) : tuple
        ``data`` is a new frame with ``label_col`` removed; ``label`` is the
        target series clamped to ``[clip_min, clip_max]``. ``df`` itself is
        not modified.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of ``df``.
    """
    label = df[label_col].clip(lower=clip_min, upper=clip_max)
    # drop() returns a new frame, so the caller's frame keeps its target column.
    data = df.drop(columns=[label_col])
    return (data, label)

In [28]:
# Separate features from the target for the training and validation frames.
# NOTE(review): tset is not split here; presumably it is handled elsewhere — confirm.
xs, y = getDataAndLabel(xset)
valid_xs, valid_y = getDataAndLabel(vset)

In [29]:
# Sanity check: validation features and labels should have matching row counts.
valid_xs.shape, valid_y.shape

((1078650, 9), (1078650,))

In [30]:
# Sanity check: training features and labels should have matching row counts.
xs.shape, y.shape

((32359500, 9), (32359500,))

## Initialize, fit, and evaluate the random forest regressor

In [41]:
# Baseline forest of 100 trees. max_samples caps the bootstrap sample drawn
# for each tree at 2,000,000 rows instead of the full training set, and
# oob_score=True requests an out-of-bag score during fit.
# random_state is pinned so the bootstrap draws, and therefore the scores
# reported below, are reproducible across kernel restarts.
rfr = RandomForestRegressor(n_estimators=100, max_samples=2000000, min_samples_leaf=2,
                            oob_score=True, n_jobs=-1, verbose=1, random_state=42)

In [42]:
# Fit on the training split only. Long-running: the recorded run took about
# 34 minutes with 4 workers.
rfr.fit(xs, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 33.9min finished


RandomForestRegressor(max_samples=2000000, min_samples_leaf=2, n_jobs=-1,
                      oob_score=True, verbose=1)

In [43]:
# Out-of-bag score from the fit (available because oob_score=True was passed
# to the constructor).
rfr.oob_score_

0.5237640024994314

In [46]:
# Predict the validation split and clamp to [0, 20], the same range that
# getDataAndLabel applies to the labels.
pred = rfr.predict(valid_xs).clip(0, 20)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    4.3s finished


In [47]:
# Root-mean-squared error between the validation labels and the clipped predictions.
rmse = np.sqrt(mean_squared_error(valid_y, pred))

In [48]:
# Display the validation RMSE.
rmse

0.43410374375738836

In [49]:
# Predict the training split as well, clamped to the same [0, 20] range,
# so the training error can be compared with the validation error.
pred_train = rfr.predict(xs).clip(0, 20)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  2.8min finished


In [50]:
# RMSE on the training split, for comparison with the validation RMSE.
np.sqrt(mean_squared_error(y, pred_train))

0.4421604508053528

In [51]:
def rf_feature_importance(m, df):
    """Tabulate the importance a fitted model assigns to each column of ``df``.

    Parameters
    ----------
    m : object
        Fitted estimator exposing ``feature_importances_``.
    df : pandas.DataFrame
        Frame whose columns correspond, in order, to the entries of
        ``m.feature_importances_``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``cols`` and ``imp``, sorted by ``imp`` in descending
        order. The index keeps each feature's original column position.
    """
    ranking = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return ranking.sort_values(by='imp', ascending=False)

In [52]:
# Rank the training features by the fitted forest's importance scores.
rf_feature_importance(rfr, xs)

Unnamed: 0,cols,imp
5,item_cnt_minus_1,0.362025
1,item_id,0.2096
7,item_cnt_mean,0.114618
0,shop_id,0.076659
2,date_block_num,0.063681
8,mnth_med_price_mean,0.053228
3,month,0.048341
4,item_cat,0.041756
6,mnth_med_price_minus_1,0.030092
