In [1]:
import numpy as np
import pandas as pd
from time import time
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

%matplotlib inline
np.random.seed(42)

In [2]:
from datetime import datetime,timedelta
def report(desc,train,pred,val_score,hours_diff=0,output_path='output/'):
    '''
        Writes a run report and the matching prediction file.

        Two files sharing one timestamped base name are created:
                `<output_path><timestamp>.rpt`  description, sorted feature
                                                list, train row count and
                                                validation score
                `<output_path><timestamp>.csv`  `pred` saved without its index

        desc        -- free text placed at the top of the report
        train       -- DataFrame used for training (only `columns` and
                       `shape` are read)
        pred        -- DataFrame with the predictions to save
        val_score   -- validation score, written with 4 decimals
        hours_diff  -- hours added to `datetime.now()` before the timestamp
                       is formatted
        output_path -- prefix prepended verbatim to the timestamp; keep the
                       trailing separator when it denotes a directory

        The directory part of the target path is created when it is missing,
        so a finished run is not lost to a FileNotFoundError.
    '''
    import os  # local import keeps the helper self-contained in the notebook

    r = desc + '\n\n'
    features=sorted(train.columns)
    r += 'features %d: %s\n' % (train.shape[1],features)
    r += 'train records: %s\n' % train.shape[0]
    r += 'val score %.4f\n' % val_score

    f_name=(datetime.now()+timedelta(hours=hours_diff)).strftime('%Y-%m-%d-%H-%M-%S')
    r_file='%s%s.rpt'%(output_path,f_name)
    o_file='%s%s.csv'%(output_path,f_name)

    # Make sure the destination exists before anything is written.
    # dirname is '' when output_path is a bare file-name prefix.
    out_dir=os.path.dirname(r_file)
    if out_dir:
        os.makedirs(out_dir,exist_ok=True)

    with open(r_file,'w') as f:
        f.write(r)

    pred.to_csv(o_file,index=False)
        
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe (in place) and returns it: 
                
                `float64` type to `float32`
                `int64`   type to `int32`

        An `int64` column whose values do not fit into `int32` is left
        untouched, because the cast would otherwise wrap around silently.
        Columns of any other dtype are never modified.
    '''
    int32_range = np.iinfo(np.int32)

    def fits_int32(col):
        # An empty column has no min/max; it is trivially safe to cast.
        if col.empty:
            return True
        return col.min() >= int32_range.min and col.max() <= int32_range.max

    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64" and fits_int32(df[c])]

    # Downcast column by column: replaces each column on the same frame
    # object and is a plain no-op when nothing was selected.
    for c in float_cols:
        df[c] = df[c].astype(np.float32)
    for c in int_cols:
        df[c] = df[c].astype(np.int32)

    return df

In [3]:
%%time
from feature import prepare,df_crossjoin,clip_y

# Load the raw tables. `items` is merged into the feature frame in a later
# cell; the shops table is currently not used.
sales_train=pd.read_csv('raw_data/sales_train.csv.gz')
sales_test=pd.read_csv('raw_data/test.csv.gz')
items=pd.read_csv('raw_data/items.csv')
# shops=pd.read_csv('raw_data/shops.csv')

# Name of the target column built below (sum of `item_cnt_day` per block).
label_col='item_cnt_month'
# Shorter name for the month index column; mutates `sales_train` in place.
sales_train.rename(columns={'date_block_num':'block'},inplace=True)
# Highest block present in the training data; the test rows get the next one.
val_block=np.max(sales_train.block)
# sales_train['time']=pd.to_datetime(sales_train.date,format='%d.%m.%Y')
# sales_train['year']=sales_train.time.apply(lambda t:t.year).astype('int16')
# sales_train['month']=sales_train.time.apply(lambda t:t.month).astype('int8')


# For every block build the full cross product of the shops and the items
# that appear in that block, then attach the summed daily counts.
# Shop/item pairs without a sale in the block keep NaN as label here; they
# are filled with 0 at the end of the cell.
sales_cross=[]
index_cols=['shop_id','item_id']
for d in sorted(sales_train.block.unique()):
    m_sales=sales_train[sales_train.block==d]
    m_si=pd.MultiIndex.from_product([m_sales['shop_id'].unique(),m_sales['item_id'].unique()],
                                          names=index_cols).to_frame(index=False)
    
    m_si=m_si.merge(m_sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':'sum'}),
                    on=index_cols,how='left').rename(columns={'item_cnt_day':label_col})
    m_si['block']=d
    sales_cross.append(m_si)


# The test pairs are appended as one extra block right after the last
# training block; they carry no label column.
sales_test['block']=val_block+1
# sales_test['month']=sales_train.iloc[-1].month+1
# sales_test['year']=sales_train.iloc[-1].year


sales_cross.append(sales_test.drop('ID',axis=1))
# Stack all blocks and keep only rows with block > 12. The index is not
# reset, so index values repeat across blocks.
sales=pd.concat(sales_cross)[lambda df:df.block>12]
# Printed before the fill so the report shows how many labels are missing.
# NOTE(review): `null_counts` is the spelling of older pandas releases;
# newer ones name this argument `show_counts` - confirm against the
# installed version.
sales.info(null_counts=True)
# Missing labels (unsold pairs and all test rows) become 0.
sales.fillna(0,inplace=True)



<class 'pandas.core.frame.DataFrame'>
Int64Index: 6291948 entries, 0 to 214199
Data columns (total 4 columns):
block             6291948 non-null int64
item_cnt_month    868080 non-null float64
item_id           6291948 non-null int64
shop_id           6291948 non-null int64
dtypes: float64(1), int64(3)
memory usage: 240.0 MB
CPU times: user 7.34 s, sys: 424 ms, total: 7.77 s
Wall time: 7.77 s


In [4]:
%%time
import gc
# Work on a copy so the grid built in the previous cell stays untouched.
sales_p = sales.copy()

# Per shop and block: total units sold divided by the number of distinct
# items sold, i.e. the average monthly count per item of that shop.
shop_month_sales = sales_train.groupby(['shop_id', 'block'], as_index=False).agg({
    'item_id': 'nunique',
    'item_cnt_day': 'sum',
})
shop_month_sales['shop_item_avg_cnt_month'] = (
    shop_month_sales['item_cnt_day'] / shop_month_sales['item_id']
)
# Only the keys and the ratio are kept; the two raw aggregates are dropped.
shop_month_sales.drop(['item_cnt_day', 'item_id'], axis=1, inplace=True)

sales_p = sales_p.merge(shop_month_sales, how='left', on=['shop_id', 'block'])
# Names of the shop-level feature columns (everything except the merge keys).
shop_month_cols = shop_month_sales.columns.difference(['shop_id', 'block'])

# Per item and block: number of distinct shops selling the item, total units
# sold and the mean price. The built-in 'mean' aggregation replaces the
# former `lambda s:np.mean(s)`, which forced a Python call per group; the
# values are the same up to floating-point summation order.
item_month_sales=sales_train.groupby(['item_id','block'],as_index=False).agg({
    'shop_id':'nunique',
    'item_cnt_day':'sum',
    'item_price':'mean'
}).rename(columns={
    'shop_id':'item_shop_cnt_month',
    'item_cnt_day':'item_shop_sale_month'
})
# Average monthly count per shop that sold the item.
item_month_sales['item_shop_avg_cnt_month']=item_month_sales['item_shop_sale_month']/item_month_sales['item_shop_cnt_month']
# Keep only the keys, the mean price and the ratio.
item_month_sales.drop(['item_shop_cnt_month','item_shop_sale_month'],axis=1,inplace=True)
sales_p=sales_p.merge(item_month_sales,on=['item_id','block'],how='left')
# Names of the item-level feature columns (everything except the merge keys).
item_month_cols=item_month_sales.columns.difference(['item_id','block'])

# item_cat_sales=sales_train.join(items,on=['item_id'],rsuffix='_o').groupby(['item_category_id','block']).agg({
#     'item_cnt_day':'sum'
# }).rename(columns={'item_cnt_day':'item_cat_sales_month'})

# shop_sales=sales_train.groupby(['shop_id']).agg({
#     'item_id':'nunique'
# }).rename(columns={
#     'item_id':'shop_item_id_cnt'
# })
# sales_p=sales_p.join(shop_sales,on=['shop_id'],rsuffix='_other')

# month_sales=sales_train.groupby(['block']).agg({
#     'item_cnt_day':'sum'
# }).rename(columns={'item_cnt_day':'month_sale'})
# sales_p=sales_p.join(month_sales,on=['block'])


# Lag features: for each lag i, shift a copy of the base columns forward by
# i blocks and join it back on (shop, item, block). Each row then carries
# its own values from i blocks earlier as `prev_<column>_<i>`.
index_cols = ['shop_id', 'item_id', 'block']
merge_cols = sales_p.columns.difference(index_cols)
for i in (1, 2, 3):  # lags 4, 5 and 12 are currently switched off
    lag_names = {c: 'prev_%s_%d' % (c, i) for c in merge_cols}
    # `merge_cols` was fixed before the loop, so only the base columns are
    # lagged, never the `prev_*` columns added by an earlier iteration.
    prev = sales_p[merge_cols.union(index_cols)].copy()
    prev['block'] += i
    prev.rename(columns=lag_names, inplace=True)
    sales_p = sales_p.merge(prev, how='left', on=index_cols)

# Attach the item attributes except the free-text name.
sales_p = sales_p.merge(
    items.drop(['item_name'], axis=1),
    how='left',
    on='item_id',
)

# Number of items listed in each category.
item_cats = (
    items
    .groupby(['item_category_id'], as_index=False)
    .agg({'item_id': 'count'})
    .rename(columns={'item_id': 'item_cat_items_cnt'})
)
sales_p = sales_p.merge(item_cats, how='left', on='item_category_id')

# Shrink the frame, print the null counts while the gaps are still visible,
# then fill every remaining gap with 0.
sales_p = downcast_dtypes(sales_p)
sales_p.info(null_counts=True)
sales_p.fillna(0, inplace=True)

# Columns that are not fed to the model: the label itself and the
# same-month aggregates (only their lagged `prev_*` copies remain).
drop_cols=set([label_col])
ext_cols=[shop_month_cols,item_month_cols]
for ec in ext_cols:
    drop_cols = drop_cols|set(ec)

# The highest block in `sales_p` is the test block, so validation uses the
# block right before it and training everything earlier.
val_block=np.max(sales_p.block)-1

# Drop the excluded columns once instead of copying the full frame for
# each of the three splits, and select rows with shared boolean masks.
features=sales_p.drop(list(drop_cols),axis=1)
is_train=sales_p.block<val_block
is_val=sales_p.block==val_block
is_test=sales_p.block==val_block+1

X_train,X_val,X_test=features[is_train],features[is_val],features[is_test]
y_train,y_val,y_test=(sales_p.loc[mask,label_col] for mask in (is_train,is_val,is_test))
del features  # the splits are copies; release the intermediate frame

cat_cols=['shop_id','item_id','item_category_id']

# Reference score: RMSE of predicting a constant 0.5 for every validation row.
print('base predict score %.4f\n'%(mean_squared_error(clip_y(y_val),np.ones(y_val.shape[0])*0.5)**0.5))
gc.collect();

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6291948 entries, 0 to 6291947
Data columns (total 21 columns):
block                             6291948 non-null int32
item_cnt_month                    6291948 non-null float32
item_id                           6291948 non-null int32
shop_id                           6291948 non-null int32
shop_item_avg_cnt_month           6077748 non-null float32
item_price                        6077748 non-null float32
item_shop_avg_cnt_month           6077748 non-null float32
prev_item_cnt_month_1             4665824 non-null float32
prev_item_price_1                 4665824 non-null float32
prev_item_shop_avg_cnt_month_1    4665824 non-null float32
prev_shop_item_avg_cnt_month_1    4665824 non-null float32
prev_item_cnt_month_2             4220256 non-null float32
prev_item_price_2                 4220256 non-null float32
prev_item_shop_avg_cnt_month_2    4220256 non-null float32
prev_shop_item_avg_cnt_month_2    4220256 non-null float32
prev_ite

In [8]:
%%time
from lightgbm import LGBMRegressor


def clipped_rmse(y_true,y_hat):
    '''Root mean squared error after passing both arguments through `clip_y`.'''
    return mean_squared_error(clip_y(y_true),clip_y(y_hat))**0.5


lgb=LGBMRegressor(n_jobs=8,learning_rate=0.1)
# The custom metric is reported as 'error'; the trailing False marks it as
# lower-is-better.
# NOTE(review): early_stopping_rounds=1 stops at the first round without
# improvement - confirm this very small patience is intended.
lgb.fit(
    X_train,y_train,
    # categorical_feature=cat_cols,
    eval_set=(X_val,y_val),
    early_stopping_rounds=1,
    eval_metric=lambda y_t,y_p:('error',clipped_rmse(y_t,y_p),False),
)

# Validation score on the held-out block.
y_val_pred=lgb.predict(X_val)
val_score=clipped_rmse(y_val,y_val_pred)

# Test predictions are attached to the test IDs by position, then ordered
# by ID.
y_pred=clip_y(lgb.predict(X_test))
results=sales_test[['ID']].copy()
results[label_col]=y_pred
results.sort_values('ID',inplace=True)

# Feature importances, most important first.
ranked_importances=sorted(
    zip(X_train.columns,lgb.feature_importances_),
    key=lambda x:x[1],
    reverse=True,
)
for fi in ranked_importances:
    print(fi)
    

[1]	valid_0's l2: 27.3224	valid_0's error: 1.08529
Training until validation scores don't improve for 1 rounds.
[2]	valid_0's l2: 26.575	valid_0's error: 1.06044
[3]	valid_0's l2: 25.851	valid_0's error: 1.04122
[4]	valid_0's l2: 25.2804	valid_0's error: 1.02098
[5]	valid_0's l2: 24.7926	valid_0's error: 1.00896
[6]	valid_0's l2: 24.3514	valid_0's error: 1.00095
[7]	valid_0's l2: 23.822	valid_0's error: 0.995163
[8]	valid_0's l2: 23.4114	valid_0's error: 0.990774
[9]	valid_0's l2: 23.0768	valid_0's error: 0.987877
[10]	valid_0's l2: 22.8798	valid_0's error: 0.985598
[11]	valid_0's l2: 22.5772	valid_0's error: 0.98253
[12]	valid_0's l2: 22.3281	valid_0's error: 0.980715
[13]	valid_0's l2: 22.0275	valid_0's error: 0.979693
[14]	valid_0's l2: 21.875	valid_0's error: 0.978453
[15]	valid_0's l2: 21.7787	valid_0's error: 0.97839
[16]	valid_0's l2: 21.601	valid_0's error: 0.974522
[17]	valid_0's l2: 21.3076	valid_0's error: 0.974082
[18]	valid_0's l2: 21.1469	valid_0's error: 0.972996
[19]	va

In [9]:
# Save the validation labels, validation predictions and test predictions
# as header-less single-column CSV files.
for values,csv_path in (
    (y_val,'ensemble/y_val.csv'),
    (y_val_pred,'ensemble/gbm_label_val.csv'),
    (y_pred,'ensemble/gbm_label_tst.csv'),
):
    pd.DataFrame(values).to_csv(csv_path,index=False,header=False)

In [6]:
# Write the report/prediction file pair for the fitted model; the model's
# string representation is appended to the description and the timestamp in
# the file names is shifted by 12 hours.
# NOTE(review): this cell's recorded execution count (6) is lower than the
# training cell's (8), so the saved files may come from an earlier model -
# re-run the notebook top to bottom before trusting them.
report('use id labels\n'+str(lgb),X_train,results,val_score,hours_diff=12)

In [None]:
# from catboost import CatBoostRegressor

# cat=CatBoostRegressor(od_type='IncToDec',od_pval=1e-4,n_estimators=20)
# cat.fit(X_train,y_train,cat_features=[list(cat_cols).index(c) for c in cat_cols],use_best_model=True,eval_set=(X_val,clip_y(y_val)))
# print(mean_squared_error(clip_y(y_val),clip_y(cat.predict(X_val)))**0.5)

# y_pred=clip_y(cat.predict(X_test))
# sales_test['item_cnt_month']=y_pred
# sales_test[['ID','item_cnt_month']].to_csv('output/cat.csv',index=False)
# sorted(zip(X_train.columns,cat.feature_importances_),key=lambda x:x[1],reverse=True)