In [7]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
import lightgbm 


In [11]:
# Read the four raw CSV tables from the current working directory.
sales, shops, items, item_cats = (
    pd.read_csv(csv_path)
    for csv_path in (
        './sales_train.csv.gz',
        './shops.csv',
        './items.csv',
        './item_categories.csv',
    )
)

In [13]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
%matplotlib inline 

pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 50)

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from tqdm import tqdm_notebook

from itertools import product


def downcast_dtypes(df):
    '''
        Narrow the 64-bit numeric columns of `df` to 32 bits:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left untouched. The frame is
        modified in place and the same object is returned.
    '''

    def columns_of(dtype_name):
        # Names of the columns whose dtype matches `dtype_name`
        return [name for name in df.columns if df[name].dtype == dtype_name]

    # Both selections are made before any cast is applied
    wide_floats = columns_of("float64")
    wide_ints = columns_of("int64")

    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    return df

In [15]:
from itertools import product
# Key columns identifying one row of the monthly shop x item grid
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month, build the cartesian product of the shops and the items
# that appear in `sales` during that month.
grid = []
for block_num in sales['date_block_num'].unique():
    in_month = sales['date_block_num'] == block_num  # computed once, used for both lookups
    cur_shops = sales.loc[in_month, 'shop_id'].unique()
    cur_items = sales.loc[in_month, 'item_id'].unique()
    grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

# Stack the per-month blocks into one dataframe
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Monthly sums of `item_cnt_day` at three granularities (shop-item, shop, item),
# each left-joined onto the grid; combinations without sales get 0.
# The column is selected and renamed explicitly instead of using the nested-dict
# form `.agg({'item_cnt_day': {'target': 'sum'}})`, which pandas >= 1.0 rejects
# with SpecificationError ("nested renamer is not supported").
all_data = grid
for group_keys, target_name in [
    (index_cols, 'target'),
    (['shop_id', 'date_block_num'], 'target_shop'),
    (['item_id', 'date_block_num'], 'target_item'),
]:
    gb = (sales.groupby(group_keys, as_index=False)['item_cnt_day'].sum()
               .rename(columns={'item_cnt_day': target_name}))
    all_data = pd.merge(all_data, gb, how='left', on=group_keys).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
# `grid` is deliberately not deleted here: it is merged again when `sales_m` is built.
gc.collect();

In [17]:
# Drop the merged aggregate frame built in the previous cell to release its memory.
del all_data

In [19]:
# Force a garbage-collection pass; the value displayed below is gc.collect()'s return value.
gc.collect()

115

In [21]:
# Keep only rows priced below 100000 whose daily count is at most 1000.
sales = sales[(sales['item_price'] < 100000) & (sales['item_cnt_day'] <= 1000)]

In [23]:
# Monthly shop-item aggregates: total units sold and mean price.
sales_m = (sales.groupby(['date_block_num', 'shop_id', 'item_id'])
                .agg({'item_cnt_day': 'sum', 'item_price': 'mean'})
                .reset_index())
# Put the aggregates on the full grid; shop-item-month combinations without sales get 0.
# NOTE(review): this also sets item_price to 0 for those rows, so the
# *_avg_item_price features below average those zeros in -- confirm this is intended.
sales_m = pd.merge(grid, sales_m, on=['date_block_num', 'shop_id', 'item_id'], how='left').fillna(0)
# adding the category id too
sales_m = pd.merge(sales_m, items, on=['item_id'], how='left')

# Per-month aggregate features for each entity type (item, shop, item category).
for type_id in ['item_id', 'shop_id', 'item_category_id']:
    for column_id, aggregator, aggtype in [('item_price', 'mean', 'avg'),
                                           ('item_cnt_day', 'sum', 'sum'),
                                           ('item_cnt_day', 'mean', 'avg')]:
        feature_name = type_id + '_' + aggtype + '_' + column_id

        # Select the single column BEFORE aggregating. Aggregating the whole frame
        # also reduces the string column `item_name` and every feature added so far,
        # which is wasteful and fails on recent pandas (mean of strings).
        mean_df = (sales_m.groupby([type_id, 'date_block_num'])[column_id]
                          .agg(aggregator)
                          .rename(feature_name)
                          .reset_index())

        sales_m = pd.merge(sales_m, mean_df, on=['date_block_num', type_id], how='left')
        


In [25]:
# Narrow float64/int64 columns to 32 bits before the lag merges below enlarge the frame.
sales_m = downcast_dtypes(sales_m)


In [27]:
# Columns to lag: every aggregate feature (all columns after the first 7, i.e. after
# shop_id, item_id, date_block_num, item_price, item_cnt_day, item_name,
# item_category_id) plus the monthly count itself.
# NOTE: this relies on column position, so run the cell only once per fresh `sales_m`;
# a second run would also pick up the *_lag_* columns.
lag_variables = list(sales_m.columns[7:]) + ['item_cnt_day']
lags = [1, 2, 3]  # ,4, 5, 12]
lag_keys = ['date_block_num', 'shop_id', 'item_id']
for lag in lags:
    # Copy only the key and lag columns instead of the whole frame, which grows by
    # len(lag_variables) columns per iteration.
    sales_new_df = sales_m[lag_keys + lag_variables].copy()
    # Shifting the month forward makes month t's values line up with month t + lag.
    sales_new_df['date_block_num'] += lag
    sales_new_df.columns = lag_keys + [lag_feat + '_lag_' + str(lag) for lag_feat in lag_variables]
    sales_m = pd.merge(sales_m, sales_new_df, on=lag_keys, how='left')

In [32]:
# `sales_means` is a second name for the same DataFrame object (no copy is made),
# so the in-place fills applied to `sales_means` below also change `sales_m`.
sales_means = sales_m

In [33]:
# Fill gaps column by column: count-like columns get 0, price-like columns get
# their own median; columns matching neither name pattern are left as they are.
for column in sales_means.columns:
    if 'item_cnt' in column:
        fill_value = 0
    elif 'item_price' in column:
        fill_value = sales_means[column].median()
    else:
        continue
    sales_means[column] = sales_means[column].fillna(fill_value)

In [34]:
# Drop the same-month aggregate features (only their lagged versions are kept),
# plus the item name and the same-month price.
# NOTE(review): `lag_variables[:-1]` leaves out the trailing 'item_cnt_day', so the
# target column itself is NOT dropped and stays in the frames built from this list.
cols_to_drop = lag_variables[:-1] + ['item_name','item_price']

In [35]:
# Split by month: blocks before 33 go to X_train, block 33 goes to X_cv.
# The columns listed in `cols_to_drop` are removed from both.
X_train, X_cv = (
    sales_means.loc[in_split].drop(cols_to_drop, axis=1)
    for in_split in (sales_means['date_block_num'] < 33,
                     sales_means['date_block_num'] == 33)
)

In [37]:

def clip(x):
    """Limit `x` to the range [0, 40].

    Returns 40 when `x` is above 40, 0 when `x` is below 0, and `x` itself
    otherwise (including values for which both comparisons are false).
    """
    return 40 if x > 40 else (0 if x < 0 else x)
# Limit the monthly count to [0, 40] in both frames.
# Vectorised Series.clip replaces the row-wise DataFrame.apply(axis=1), which called a
# Python function once per row and built a Series for every row of the frame. The
# resulting values are the same; the column keeps its existing dtype.
X_train['item_cnt_day'] = X_train['item_cnt_day'].clip(lower=0, upper=40)
X_cv['item_cnt_day'] = X_cv['item_cnt_day'].clip(lower=0, upper=40)

In [47]:
# Display the column index of the feature frame (raw columns, aggregates and their lags).
sales_means.columns

Index([u'shop_id', u'item_id', u'date_block_num', u'item_price',
       u'item_cnt_day', u'item_name', u'item_category_id',
       u'item_id_avg_item_price', u'item_id_sum_item_cnt_day',
       u'item_id_avg_item_cnt_day', u'shop_id_avg_item_price',
       u'shop_id_sum_item_cnt_day', u'shop_id_avg_item_cnt_day',
       u'item_category_id_avg_item_price',
       u'item_category_id_sum_item_cnt_day',
       u'item_category_id_avg_item_cnt_day', u'item_id_avg_item_price_lag_1',
       u'item_id_sum_item_cnt_day_lag_1', u'item_id_avg_item_cnt_day_lag_1',
       u'shop_id_avg_item_price_lag_1', u'shop_id_sum_item_cnt_day_lag_1',
       u'shop_id_avg_item_cnt_day_lag_1',
       u'item_category_id_avg_item_price_lag_1',
       u'item_category_id_sum_item_cnt_day_lag_1',
       u'item_category_id_avg_item_cnt_day_lag_1', u'item_cnt_day_lag_1',
       u'item_id_avg_item_price_lag_2', u'item_id_sum_item_cnt_day_lag_2',
       u'item_id_avg_item_cnt_day_lag_2', u'shop_id_avg_item_price_lag_2',
 

In [48]:
# Training target: monthly count for blocks before 33, limited to [0, 40].
# It is read from `sales_means`, which never received the [0, 40] clipping applied to
# the copies inside X_train / X_cv, so the same bounds are applied here.
y_train = sales_means.loc[sales_means['date_block_num'] < 33, 'item_cnt_day'].clip(lower=0, upper=40)

# 'item_cnt_day' is the target and is still a column of the feature frames (it is not in
# `cols_to_drop`). Remove it from both so the model cannot read the answer from its
# inputs. errors='ignore' keeps this cell safe to re-run.
X_train = X_train.drop('item_cnt_day', axis=1, errors='ignore')
X_cv = X_cv.drop('item_cnt_day', axis=1, errors='ignore')

In [49]:
# Validation target: monthly count for block 33, limited to [0, 40].
# It is read from `sales_means`, which never received the [0, 40] clipping applied to
# the copy inside X_cv, so the same bounds are applied here.
y_cv = sales_means.loc[sales_means['date_block_num'] == 33, 'item_cnt_day'].clip(lower=0, upper=40)

In [51]:
#import pickle
#pickle.dump(X_train, open('X_train.pickle', 'wb'))

In [54]:
from xgboost import XGBRegressor

# The target (`item_cnt_day` summed per month) is a numeric quantity, so this is a
# regression problem. A classifier would treat every distinct count as a separate class.
model = XGBRegressor()


In [None]:
# Fit on the training months (date_block_num < 33); block 33 (X_cv / y_cv) is not used here.
model.fit(X_train, y_train)