# Prepare data for random forest

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the prepared inputs. The embedding tables are loaded too because the
# "Drop shop_id_month" and "Merge embeddings" cells further down reference
# shops_em / items_em and fail with NameError when these reads are skipped.
base = pd.read_csv('./input/basestructuredf.csv')
st = pd.read_csv('./input/cleanedmonthly.csv')
shops_em = pd.read_csv('./input/shop_embeddings.csv')
items_em = pd.read_csv('./input/item_embeddings.csv')
shops_meta = pd.read_csv('./input/shops_meta.csv')
items_meta = pd.read_csv('./input/items_meta.csv')

In [3]:
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12368580 entries, 0 to 12368579
Data columns (total 5 columns):
 #   Column          Dtype
---  ------          -----
 0   shop_id         int64
 1   item_id         int64
 2   date_block_num  int64
 3   month           int64
 4   year            int64
dtypes: int64(5)
memory usage: 471.8 MB


In [4]:
st.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 853499 entries, 0 to 853498
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   year                  853499 non-null  int64  
 1   month                 853499 non-null  int64  
 2   shop_id               853499 non-null  int64  
 3   item_id               853499 non-null  int64  
 4   date_block_num        853499 non-null  int64  
 5   item_cnt_month        853499 non-null  float64
 6   median_monthly_price  853499 non-null  float64
dtypes: float64(2), int64(5)
memory usage: 45.6 MB


### Drop the shop_id_month column from shops_em

In [102]:
# Remove the shop_id_month column from the shop embeddings, in place.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
shops_em.drop(columns=['shop_id_month'], inplace=True)

In [103]:
shops_em.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 624 entries, 0 to 623
Data columns (total 12 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   se_1     624 non-null    float64
 1   se_2     624 non-null    float64
 2   se_3     624 non-null    float64
 3   se_4     624 non-null    float64
 4   se_5     624 non-null    float64
 5   se_6     624 non-null    float64
 6   se_7     624 non-null    float64
 7   se_8     624 non-null    float64
 8   se_9     624 non-null    float64
 9   se_10    624 non-null    float64
 10  shop_id  624 non-null    float64
 11  month    624 non-null    float64
dtypes: float64(12)
memory usage: 58.6 KB


## Merge sales data onto base, save value as sales

In [5]:
# Attach the monthly sales columns of `st` to the `base` grid with an outer
# merge on the five key columns both frames share.
# NOTE(review): how='left' may state the "merge onto base" intent more
# directly - confirm that `st` never holds keys missing from `base`.
sales = pd.merge(
    base,
    st,
    how='outer',
    on=['year', 'date_block_num', 'month', 'shop_id', 'item_id'],
)

In [6]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12368580 entries, 0 to 12368579
Data columns (total 7 columns):
 #   Column                Dtype  
---  ------                -----  
 0   shop_id               int64  
 1   item_id               int64  
 2   date_block_num        int64  
 3   month                 int64  
 4   year                  int64  
 5   item_cnt_month        float64
 6   median_monthly_price  float64
dtypes: float64(2), int64(5)
memory usage: 754.9 MB


In [7]:
# Replace every NaN left by the merge with 0 (presumably the shop/item/month
# combinations that have no row in `st` - verify against the data).
sales = sales.fillna(0)

In [8]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12368580 entries, 0 to 12368579
Data columns (total 7 columns):
 #   Column                Dtype  
---  ------                -----  
 0   shop_id               int64  
 1   item_id               int64  
 2   date_block_num        int64  
 3   month                 int64  
 4   year                  int64  
 5   item_cnt_month        float64
 6   median_monthly_price  float64
dtypes: float64(2), int64(5)
memory usage: 754.9 MB


## Add lagging values onto the dataframe

### Lagging values for item count

In [9]:
# Lagged targets: item_cnt_minus_k holds item_cnt_month from k rows earlier
# within the same (shop_id, item_id) pair. shift() is positional inside each
# group, so this assumes rows of a pair are ordered by date_block_num with no
# missing months - TODO confirm for any new base grid.
# The grouping does not depend on the lag, so it is built once and reused.
item_cnt_by_pair = sales.groupby(['shop_id', 'item_id']).item_cnt_month
for lag in range(1, 4):
    sales[f'item_cnt_minus_{lag}'] = item_cnt_by_pair.shift(lag)

### Lagging median price value

In [10]:
# One-row lag of the median monthly price within each (shop_id, item_id) pair.
price_by_pair = sales.groupby(['shop_id', 'item_id'])['median_monthly_price']
sales['mnth_med_price_minus_1'] = price_by_pair.shift(1)

### Drop Monthly median price column since it will not be available on test

In [11]:
# Remove the same-month price column in place; only its lagged copy is kept.
del sales['median_monthly_price']

### Remove date_block_num 0, 1 and 2, because their lag features contain NaN values

In [12]:
# Boolean Series that is True for rows whose date_block_num is 0, 1 or 2.
mask = sales.date_block_num.isin([0,1,2])

In [13]:
# Drop the masked rows in place. drop() expects index *labels*, so the boolean
# mask is translated through sales.index rather than through np.where(), whose
# positions only coincide with the labels while the index is exactly 0..n-1.
sales.drop(index=sales.index[mask.to_numpy()], inplace=True)

In [14]:
sales

Unnamed: 0,shop_id,item_id,date_block_num,month,year,item_cnt_month,item_cnt_minus_1,item_cnt_minus_2,item_cnt_minus_3,mnth_med_price_minus_1
3,2,30,3,4,2013,0.0,1.0,0.0,0.0,359.0
4,2,30,4,5,2013,0.0,0.0,1.0,0.0,0.0
5,2,30,5,6,2013,1.0,0.0,0.0,1.0,0.0
6,2,30,6,7,2013,0.0,1.0,0.0,0.0,399.0
7,2,30,7,8,2013,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
12368575,59,22167,30,7,2015,0.0,0.0,0.0,0.0,0.0
12368576,59,22167,31,8,2015,0.0,0.0,0.0,0.0,0.0
12368577,59,22167,32,9,2015,0.0,0.0,0.0,0.0,0.0
12368578,59,22167,33,10,2015,0.0,0.0,0.0,0.0,0.0


In [15]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11308416 entries, 3 to 12368579
Data columns (total 10 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   shop_id                 int64  
 1   item_id                 int64  
 2   date_block_num          int64  
 3   month                   int64  
 4   year                    int64  
 5   item_cnt_month          float64
 6   item_cnt_minus_1        float64
 7   item_cnt_minus_2        float64
 8   item_cnt_minus_3        float64
 9   mnth_med_price_minus_1  float64
dtypes: float64(5), int64(5)
memory usage: 949.0 MB


## Merge metadata for shops and items onto sales

In [17]:
# Left-join the per-shop metadata onto sales by shop_id.
# validate='many_to_one' makes pandas raise MergeError if shops_meta contains a
# repeated shop_id, instead of silently duplicating sales rows.
sales = sales.merge(shops_meta, how='left', on='shop_id', validate='many_to_one')

In [18]:
sales.isna().sum()

shop_id                   0
item_id                   0
date_block_num            0
month                     0
year                      0
item_cnt_month            0
item_cnt_minus_1          0
item_cnt_minus_2          0
item_cnt_minus_3          0
mnth_med_price_minus_1    0
Unnamed: 0                0
months_since_open         0
dtype: int64

In [19]:
# Keep only the item metadata columns that are merged into sales below.
unused_item_columns = ['num_months', 'first_month', 'last_month', 'item_trend']
items_meta = items_meta.drop(columns=unused_item_columns)

In [20]:
# Left-join the per-item metadata onto sales by item_id. Items without a
# metadata row keep NaN in the merged columns.
# validate='many_to_one' makes pandas raise MergeError if items_meta contains a
# repeated item_id, instead of silently duplicating sales rows.
sales = sales.merge(items_meta, how='left', on='item_id', validate='many_to_one')

In [22]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11308416 entries, 0 to 11308415
Data columns (total 16 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   shop_id                 int64  
 1   item_id                 int64  
 2   date_block_num          int64  
 3   month                   int64  
 4   year                    int64  
 5   item_cnt_month          float64
 6   item_cnt_minus_1        float64
 7   item_cnt_minus_2        float64
 8   item_cnt_minus_3        float64
 9   mnth_med_price_minus_1  float64
 10  Unnamed: 0_x            int64  
 11  months_since_open       int64  
 12  Unnamed: 0_y            float64
 13  months_since_last       float64
 14  months_since_on         float64
 15  item_category_id        float64
dtypes: float64(9), int64(7)
memory usage: 1.4 GB


In [23]:
# Prefix shop-level columns with s_ and item-level columns with i_. The two
# 'Unnamed: 0' columns brought in by the metadata merges are renamed to
# d1 / d2 so the next cell can drop them.
column_renames = {
    'months_since_open': 's_months_since_open',
    'months_since_last': 'i_months_since_last',
    'months_since_on': 'i_months_since_on',
    'Unnamed: 0_x': 'd1',
    'Unnamed: 0_y': 'd2',
}
sales = sales.rename(columns=column_renames)

In [24]:
# Discard the two placeholder columns d1 and d2.
sales = sales.drop(columns=['d1', 'd2'])

In [25]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11308416 entries, 0 to 11308415
Data columns (total 14 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   shop_id                 int64  
 1   item_id                 int64  
 2   date_block_num          int64  
 3   month                   int64  
 4   year                    int64  
 5   item_cnt_month          float64
 6   item_cnt_minus_1        float64
 7   item_cnt_minus_2        float64
 8   item_cnt_minus_3        float64
 9   mnth_med_price_minus_1  float64
 10  s_months_since_open     int64  
 11  i_months_since_last     float64
 12  i_months_since_on       float64
 13  item_category_id        float64
dtypes: float64(8), int64(6)
memory usage: 1.3 GB


In [26]:
sales.isnull().sum()

shop_id                        0
item_id                        0
date_block_num                 0
month                          0
year                           0
item_cnt_month                 0
item_cnt_minus_1               0
item_cnt_minus_2               0
item_cnt_minus_3               0
mnth_med_price_minus_1         0
s_months_since_open            0
i_months_since_last       516096
i_months_since_on         516096
item_category_id          516096
dtype: int64

In [28]:
sales.loc[sales.i_months_since_last.isnull()].item_id.nunique()

384

## Merge embeddings

### Merge items_em onto sales, save as sales

In [114]:
# Left-join the item embedding columns onto sales by item_id.
# NOTE(review): if items_em can repeat an item_id this multiplies sales rows -
# confirm key uniqueness.
sales = sales.merge(items_em, how='left', on='item_id')

### Merge shops_em onto sales, save as sales

In [115]:
# Left-join the shop embedding columns onto sales by (shop_id, month).
# NOTE(review): shops_em.info() reports shop_id and month as float64 while
# sales holds them as int64 - confirm the keys match as intended.
sales = sales.merge(shops_em, how='left', on=['shop_id', 'month'])

In [116]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39720532 entries, 0 to 39720531
Data columns (total 30 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   shop_id                 int64  
 1   item_id                 int64  
 2   date_block_num          int64  
 3   month                   int64  
 4   year                    int64  
 5   item_cnt_month          float64
 6   item_cnt_minus_1        float64
 7   item_cnt_minus_2        float64
 8   item_cnt_minus_3        float64
 9   mnth_med_price_minus_1  float64
 10  ie_1                    float64
 11  ie_2                    float64
 12  ie_3                    float64
 13  ie_4                    float64
 14  ie_5                    float64
 15  ie_6                    float64
 16  ie_7                    float64
 17  ie_8                    float64
 18  ie_9                    float64
 19  ie_10                   float64
 20  se_1                    float64
 21  se_2                    float

In [117]:
# Replace every remaining NaN with 0 (presumably rows whose shop or item had
# no match in the merged tables - verify).
sales = sales.fillna(value=0)

## Reduce size for training: convert all columns to float32

In [118]:
# Cast every column, identifier columns included, to 32-bit floats.
sales = sales.astype('float32')

In [119]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39720532 entries, 0 to 39720531
Data columns (total 30 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   shop_id                 float32
 1   item_id                 float32
 2   date_block_num          float32
 3   month                   float32
 4   year                    float32
 5   item_cnt_month          float32
 6   item_cnt_minus_1        float32
 7   item_cnt_minus_2        float32
 8   item_cnt_minus_3        float32
 9   mnth_med_price_minus_1  float32
 10  ie_1                    float32
 11  ie_2                    float32
 12  ie_3                    float32
 13  ie_4                    float32
 14  ie_5                    float32
 15  ie_6                    float32
 16  ie_7                    float32
 17  ie_8                    float32
 18  ie_9                    float32
 19  ie_10                   float32
 20  se_1                    float32
 21  se_2                    float

# Random Forest

In [120]:
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import gc

In [136]:
def create_submission(tset, testdf, preds, path='./submission.csv'):
    """Write a submission CSV with the columns ``ID`` and ``item_cnt_month``.

    Parameters
    ----------
    tset : pandas.DataFrame
        Rows the predictions were made for; must contain ``shop_id`` and
        ``item_id``. It is not modified.
    testdf : pandas.DataFrame
        Frame containing ``ID``, ``shop_id`` and ``item_id``.
    preds : array-like
        One prediction per row of ``tset``, in the same row order.
    path : str, default './submission.csv'
        Destination of the CSV file.

    Notes
    -----
    The join is an inner merge on (shop_id, item_id), so rows of ``testdf``
    without a matching row in ``tset`` do not appear in the output.
    """
    # Build the keyed predictions on a copy so the caller's feature frame does
    # not gain an 'item_cnt_month' column as a side effect.
    keyed_preds = tset[['shop_id', 'item_id']].assign(item_cnt_month=preds)
    sub = testdf.merge(keyed_preds, how='inner', on=['shop_id', 'item_id'])
    sub = sub.loc[:, ['ID', 'item_cnt_month']]
    sub.to_csv(path, index=False)

def fi(model, df):
    """Tabulate a fitted model's feature importances, largest first.

    Parameters
    ----------
    model : object exposing ``feature_importances_``
    df : pandas.DataFrame
        Frame whose columns name the features, in the order the model saw them.

    Returns
    -------
    pandas.DataFrame
        Columns ``Features`` and ``Importance``, sorted by ``Importance`` in
        descending order with a fresh 0..n-1 index.
    """
    importance_table = pd.DataFrame(
        {'Features': df.columns, 'Importance': model.feature_importances_}
    )
    return importance_table.sort_values(
        by='Importance', ascending=False, ignore_index=True
    )

def train_rf_regressor(X, y,
                       n_estimators=100, max_samples=5000000,
                       min_samples_leaf=2, oob_score=False,
                       n_jobs=-1, verbose=2, max_features=0.5,
                       random_state=None):
    """Fit and return a ``RandomForestRegressor`` on ``(X, y)``.

    Every keyword argument is forwarded unchanged to the
    ``RandomForestRegressor`` constructor.

    Parameters
    ----------
    X, y : training features and target, passed to ``fit``.
    n_estimators, max_samples, min_samples_leaf, oob_score, n_jobs, verbose,
    max_features : forwarded to ``RandomForestRegressor``.
    random_state : int or None, default None
        Seed forwarded to the forest; set it to make a run repeatable.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator.
    """
    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_samples=max_samples,
        min_samples_leaf=min_samples_leaf,
        oob_score=oob_score,
        n_jobs=n_jobs,  # honour the caller's value instead of always using -1
        verbose=verbose,
        max_features=max_features,
        random_state=random_state,
    )
    return forest.fit(X, y)

def validate_rf_regressor(model, valid_x, valid_y):
    """Print the model's out-of-bag score (when present) and validation RMSE.

    Predictions are clipped to the range [0, 20] before scoring.

    Parameters
    ----------
    model : fitted regressor with ``predict``; ``oob_score_`` is optional.
    valid_x : validation features.
    valid_y : validation targets.

    Returns
    -------
    float
        RMSE between ``valid_y`` and the clipped predictions.
    """
    # oob_score_ only exists when the forest was fit with oob_score=True;
    # reading it unconditionally raised AttributeError for the default setup.
    oob = getattr(model, 'oob_score_', None)
    if oob is None:
        print('Out of bag score is not available (model has no oob_score_)')
    else:
        print(f'Out of bag score is {oob}')
    pred = np.clip(model.predict(valid_x), 0, 20)
    rmse_score = np.sqrt(mean_squared_error(valid_y, pred))
    print(f'RMSE score on validation set is {rmse_score}')
    return rmse_score


In [122]:
test = pd.read_csv('./input/test.csv')

## Declare training, validation and test sets

In [123]:
# Row positions of the prediction month (block 34) and of all earlier months.
is_test_month = (sales.date_block_num == 34).to_numpy()
is_train_month = (sales.date_block_num <= 33).to_numpy()
test_idxs = np.flatnonzero(is_test_month)
# valid_idxs = np.where(sales.date_block_num == 33)[0]
train_idxs = np.flatnonzero(is_train_month)

In [124]:
# test_idxs / train_idxs hold row *positions* (they come from np.where), so
# select with the positional indexer; .loc would treat them as index labels
# and only agrees with this while the index is exactly 0..n-1.
test_set = sales.iloc[test_idxs]
# valid_set = sales.iloc[valid_idxs]
train_set = sales.iloc[train_idxs]

In [125]:
del sales
gc.collect()

7412

In [126]:
train_set

Unnamed: 0,shop_id,item_id,date_block_num,month,year,item_cnt_month,item_cnt_minus_1,item_cnt_minus_2,item_cnt_minus_3,mnth_med_price_minus_1,...,se_1,se_2,se_3,se_4,se_5,se_6,se_7,se_8,se_9,se_10
0,2.0,0.0,3.0,4.0,2013.0,0.0,0.0,0.0,0.0,0.0,...,0.875029,0.000000,0.0,0.0,0.173911,0.0,0.331640,0.000000,3.121851,0.000000e+00
1,2.0,0.0,4.0,5.0,2013.0,0.0,0.0,0.0,0.0,0.0,...,0.937297,0.310035,0.0,0.0,0.593119,0.0,0.005625,0.074036,0.281494,4.613876e-06
2,2.0,0.0,5.0,6.0,2013.0,0.0,0.0,0.0,0.0,0.0,...,1.280416,0.095130,0.0,0.0,0.551135,0.0,0.030893,0.111134,0.286407,1.326328e-06
3,2.0,0.0,6.0,7.0,2013.0,0.0,0.0,0.0,0.0,0.0,...,0.932013,0.023892,0.0,0.0,0.501091,0.0,0.158260,0.157712,0.134353,2.885396e-07
4,2.0,0.0,7.0,8.0,2013.0,0.0,0.0,0.0,0.0,0.0,...,0.708386,0.022656,0.0,0.0,0.608331,0.0,0.241996,0.260690,0.308020,4.746925e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39720526,59.0,22169.0,29.0,6.0,2015.0,0.0,0.0,0.0,0.0,0.0,...,2.011285,0.028887,0.0,0.0,0.751535,0.0,0.000000,0.111709,0.296507,5.827133e-07
39720527,59.0,22169.0,30.0,7.0,2015.0,0.0,0.0,0.0,0.0,0.0,...,1.805772,0.000000,0.0,0.0,0.701925,0.0,0.129080,0.151716,0.536274,2.747553e-08
39720528,59.0,22169.0,31.0,8.0,2015.0,0.0,0.0,0.0,0.0,0.0,...,1.725836,0.000000,0.0,0.0,0.915292,0.0,0.198155,0.282312,0.620570,0.000000e+00
39720529,59.0,22169.0,32.0,9.0,2015.0,0.0,0.0,0.0,0.0,0.0,...,1.502945,0.000000,0.0,0.0,0.398154,0.0,0.184675,2.899171,0.013480,0.000000e+00


## Get Labels

In [127]:
# Target for training: the item count of the current month.
train_y = train_set['item_cnt_month']
# valid_y = valid_set['item_cnt_month']

In [128]:
# Strip the target and the month counter from both feature frames, in place.
# valid_set.drop(columns='item_cnt_month', inplace=True)
non_feature_columns = ['item_cnt_month', 'date_block_num']
for feature_frame in (train_set, test_set):
    feature_frame.drop(columns=non_feature_columns, inplace=True)

In [129]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38479292 entries, 0 to 39720530
Data columns (total 28 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   shop_id                 float32
 1   item_id                 float32
 2   month                   float32
 3   year                    float32
 4   item_cnt_minus_1        float32
 5   item_cnt_minus_2        float32
 6   item_cnt_minus_3        float32
 7   mnth_med_price_minus_1  float32
 8   ie_1                    float32
 9   ie_2                    float32
 10  ie_3                    float32
 11  ie_4                    float32
 12  ie_5                    float32
 13  ie_6                    float32
 14  ie_7                    float32
 15  ie_8                    float32
 16  ie_9                    float32
 17  ie_10                   float32
 18  se_1                    float32
 19  se_2                    float32
 20  se_3                    float32
 21  se_4                    float

In [130]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1241240 entries, 31 to 39720531
Data columns (total 28 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   shop_id                 1241240 non-null  float32
 1   item_id                 1241240 non-null  float32
 2   month                   1241240 non-null  float32
 3   year                    1241240 non-null  float32
 4   item_cnt_minus_1        1241240 non-null  float32
 5   item_cnt_minus_2        1241240 non-null  float32
 6   item_cnt_minus_3        1241240 non-null  float32
 7   mnth_med_price_minus_1  1241240 non-null  float32
 8   ie_1                    1241240 non-null  float32
 9   ie_2                    1241240 non-null  float32
 10  ie_3                    1241240 non-null  float32
 11  ie_4                    1241240 non-null  float32
 12  ie_5                    1241240 non-null  float32
 13  ie_6                    1241240 non-null  float32
 14  

In [144]:
# Fit a 150-tree forest; each tree is grown on 500,000 sampled rows.
rfr = train_rf_regressor(
    X=train_set,
    y=train_y,
    n_estimators=150,
    max_samples=500_000,
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 150building tree 2 of 150
building tree 3 of 150
building tree 4 of 150
building tree 5 of 150
building tree 6 of 150
building tree 7 of 150
building tree 8 of 150

building tree 9 of 150
building tree 10 of 150
building tree 11 of 150
building tree 12 of 150
building tree 13 of 150
building tree 14 of 150
building tree 15 of 150
building tree 16 of 150
building tree 17 of 150
building tree 18 of 150
building tree 19 of 150
building tree 20 of 150
building tree 21 of 150
building tree 22 of 150
building tree 23 of 150
building tree 24 of 150
building tree 25 of 150
building tree 26 of 150
building tree 27 of 150
building tree 28 of 150
building tree 29 of 150
building tree 30 of 150
building tree 31 of 150
building tree 32 of 150


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min


building tree 33 of 150
building tree 34 of 150
building tree 35 of 150
building tree 36 of 150
building tree 37 of 150
building tree 38 of 150
building tree 39 of 150
building tree 40 of 150
building tree 41 of 150
building tree 42 of 150
building tree 43 of 150
building tree 44 of 150
building tree 45 of 150
building tree 46 of 150
building tree 47 of 150
building tree 48 of 150
building tree 49 of 150
building tree 50 of 150
building tree 51 of 150
building tree 52 of 150building tree 53 of 150

building tree 54 of 150
building tree 55 of 150
building tree 56 of 150
building tree 57 of 150
building tree 58 of 150
building tree 59 of 150
building tree 60 of 150
building tree 61 of 150
building tree 62 of 150
building tree 63 of 150
building tree 64 of 150
building tree 65 of 150
building tree 66 of 150
building tree 67 of 150
building tree 68 of 150
building tree 69 of 150
building tree 70 of 150
building tree 71 of 150
building tree 72 of 150
building tree 73 of 150
building tree 74

[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.5min finished


In [139]:
# rfr.oob_score_

In [148]:
fi(rfr, train_set)

Unnamed: 0,Features,Importance
0,item_cnt_minus_1,0.303966
1,item_cnt_minus_2,0.140656
2,item_cnt_minus_3,0.106372
3,se_1,0.057401
4,ie_1,0.045773
5,ie_7,0.03682
6,shop_id,0.028466
7,se_5,0.028419
8,se_8,0.027615
9,se_9,0.024552


In [145]:
# Make sure test_set holds no 'item_cnt_month' column before predicting.
# The column is only present when an earlier create_submission() call added it;
# on a fresh top-to-bottom run it has already been dropped, so a missing
# column must not raise KeyError.
test_set.drop(columns=['item_cnt_month'], inplace=True, errors='ignore')

In [146]:
# Predict the test month and clip the predictions to the range [0, 20].
raw_preds = rfr.predict(test_set)
preds = raw_preds.clip(0, 20)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    4.5s finished


In [147]:
create_submission(test_set, test, preds)

In [90]:
sub = pd.read_csv('./submission.csv')

In [81]:
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ID              214200 non-null  int64  
 1   item_cnt_month  214200 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 3.3 MB


In [82]:
sample_sub = pd.read_csv('./input/sample_submission.csv')

In [83]:
sample_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214200 entries, 0 to 214199
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ID              214200 non-null  int64  
 1   item_cnt_month  214200 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 3.3 MB
