In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from sklearn import preprocessing, metrics
import lightgbm as lgb
import gc
import random
import time

In [2]:
pd.set_option("max_columns", 200)
pd.set_option("max_rows", 200)

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns: #columns毎に処理
        col_type = df[col].dtypes
        if col_type in numerics: #numericsのデータ型の範囲内のときに処理を実行. データの最大最小値を元にデータ型を効率的なものに変更
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def read_data():
    df_calendar = pd.read_csv(f'{data_path}/calendar.csv')
    df_calendar = reduce_mem_usage(df_calendar)
    df_train = pd.read_csv(f'{data_path}/sales_train_validation.csv')
    df_train = reduce_mem_usage(df_train)
    df_submission = pd.read_csv(f'{data_path}/sample_submission.csv')
    df_submission = reduce_mem_usage(df_submission)
    df_sell_prices = pd.read_csv(f'{data_path}/sell_prices.csv')
    df_sell_prices = reduce_mem_usage(df_sell_prices)
    return df_calendar, df_sell_prices, df_train, df_submission

def encode_categorical(df, cols, Encoder=None):
    
#     if type(Encoder) == type(None):
#         Encoder = ce.OrdinalEncoder
#     encoder = ce.BackwardDifferenceEncoder
#     encoder = ce.BaseNEncoder
#     encoder = ce.BinaryEncoder
#     encoder = ce.CatBoostEncoder
#     encoder = ce.HashingEncoder
#     encoder = ce.HelmertEncoder
#     encoder = ce.JamesSteinEncoder
#     encoder = ce.LeaveOneOutEncoder
#     encoder = ce.MEstimateEncoder
#     encoder = ce.OneHotEncoder
#     encoder = ce.OrdinalEncoder
#     encoder = ce.SumEncoder
#     encoder = ce.PolynomialEncoder
#     encoder = ce.TargetEncoder
#     encoder = ce.WOEEncoder
    for col in cols:
        le = LabelEncoder()
        df[col] = df[col].fillna('nan')
        encoded_values = le.fit_transform(df[col].values)#.values[:,0]
        df[col] = pd.Series(encoded_values, index=df.index)
    return df

def prepare_data(sales_train_val, submission, calendar, sell_prices):
    
    NUM_ITEMS = sales_train_val.shape[0]  # 30490
    DAYS_PRED = submission.shape[1] - 1  # 28
    nrows = 365 * 2 * NUM_ITEMS

    # sales_train_valからidの詳細部分(itemやdepartmentなどのid)を重複なく一意に取得しておく。(extract a detail of id columns)
    product = sales_train_val[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()
    d_name = [f'd_{i+1}' for i in range(1913)]
    sales_train_val_values = sales_train_val[d_name].values

    # calculate the start position(first non-zero demand observed date) for each item / 商品の最初の売上日
    # 1-1914のdayの数列のうち, 売上が存在しない日を一旦0にし、0を9999に置換。そのうえでminimum numberを計算
    tmp = np.tile(np.arange(1,1914),(sales_train_val_values.shape[0],1))
    df_tmp = ((sales_train_val_values>0) * tmp)
    start_no = np.min(np.where(df_tmp==0,9999,df_tmp),axis=1)-1

    flag = np.dot(np.diag(1/(start_no+1)) , tmp)<1
    sales_train_val_values = np.where(flag,np.nan,sales_train_val_values)
    sales_train_val[d_name] = sales_train_val_values
    sales_train_val = pd.melt(sales_train_val, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name = 'day', value_name = 'demand')

    sales_train_val = sales_train_val.iloc[-nrows:,:]
    sales_train_val = sales_train_val[~sales_train_val.demand.isnull()]

    # submission fileのidのvalidation部分と, ealuation部分の名前を取得
    test1_rows = [row for row in submission['id'] if 'validation' in row]
    test2_rows = [row for row in submission['id'] if 'evaluation' in row]

    # submission fileのvalidation部分をtest1, ealuation部分をtest2として取得
    test1 = submission[submission['id'].isin(test1_rows)]
    test2 = submission[submission['id'].isin(test2_rows)]

    # test1, test2の列名の"F_X"の箇所をd_XXX"の形式に変更
    test1.columns = ["id"] + [f"d_{d}" for d in range(1914, 1914 + DAYS_PRED)]
    test2.columns = ["id"] + [f"d_{d}" for d in range(1942, 1942 + DAYS_PRED)]

    # test2のidの'_evaluation'を置換
    #test1['id'] = test1['id'].str.replace('_validation','')
    test2['id'] = test2['id'].str.replace('_evaluation','_validation')

    # idをキーにして, idの詳細部分をtest1, test2に結合する.
    test1 = test1.merge(product, how = 'left', on = 'id')
    test2 = test2.merge(product, how = 'left', on = 'id')

    # test1, test2をともにmelt処理する.（売上数量:demandは0）
    test1 = pd.melt(test1, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],  var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],  var_name = 'day', value_name = 'demand')

    # validation部分と, evaluation部分がわかるようにpartという列を作り、 test1,test2のラベルを付ける。
    sales_train_val['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'

    # sales_train_valとtest1, test2の縦結合.
    data = pd.concat([sales_train_val, test1, test2], axis = 0)

    # delete test2 for now(6/1以前は, validation部分のみ提出のため.)
    data = data[data['part'] != 'test2']

    #calendarの結合
    # drop some calendar features(不要な変数の削除:weekdayやwdayなどはdatetime変数から後ほど作成できる。)
    calendar.drop(['weekday', 'wday', 'month', 'year'],   inplace = True, axis = 1)
    # notebook crash with the entire dataset (maybee use tensorflow, dask, pyspark xD)(dayとdをキーにdataに結合)
    data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
    data.drop(['d', 'day'], inplace = True, axis = 1)

    #sell priceの結合
    # get the sell price data (this feature should be very important)
    data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
    
    return data, product

def simple_fe(data):
    
    # demand features(過去の数量から変数生成)
    
    for diff in [0, 1, 2]:
        shift = DAYS_PRED + diff
        data[f"shift_t{shift}"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(shift))
    '''
    for size in [7, 30, 60, 90, 180]:
        data[f"rolling_std_t{size}"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).std())
    '''
    for size in [7, 30, 60, 90, 180]:
        data[f"rolling_mean_t{size}"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).mean())
    '''
    data["rolling_skew_t30"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(30).skew())
    data["rolling_kurt_t30"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(30).kurt())
    '''
    # price features
    # priceの動きと特徴量化（価格の変化率、過去1年間の最大価格との比など）
    
    data["shift_price_t1"] = data.groupby(["id"])["sell_price"].transform(lambda x: x.shift(1))
    data["price_change_t1"] = (data["shift_price_t1"] - data["sell_price"]) / (data["shift_price_t1"])
    data["rolling_price_max_t365"] = data.groupby(["id"])["sell_price"].transform(lambda x: x.shift(1).rolling(365).max())
    data["price_change_t365"] = (data["rolling_price_max_t365"] - data["sell_price"]) / (data["rolling_price_max_t365"])
    data["rolling_price_std_t7"] = data.groupby(["id"])["sell_price"].transform(lambda x: x.rolling(7).std())
    data["rolling_price_std_t30"] = data.groupby(["id"])["sell_price"].transform(lambda x: x.rolling(30).std())
    
    # time features
    # 日付に関するデータ
    dt_col = "date"
    data[dt_col] = pd.to_datetime(data[dt_col])
    
    attrs = [
        "year",
        "quarter",
        "month",
        "week",
        "day",
        "dayofweek",
        "is_year_end",
        "is_year_start",
        "is_quarter_end",
        "is_quarter_start",
        "is_month_end",
        "is_month_start",
    ]

    for attr in attrs:
        dtype = np.int16 if attr == "year" else np.int8
        data[attr] = getattr(data[dt_col].dt, attr).astype(dtype)

    data["is_weekend"] = data["dayofweek"].isin([5, 6]).astype(np.int8)
    
    return data


def weight_calc(df_feat, product):
    
    # calculate the denominator of RMSSE, and calculate the weight base on sales amount

    sales_train_val = pd.read_csv(f'{data_path}/sales_train_validation.csv')

    d_name = ['d_' + str(i+1) for i in range(1913)]

    sales_train_val = weight_mat_csr * sales_train_val[d_name].values

    # calculate the start position(first non-zero demand observed date) for each item / 商品の最初の売上日
    # 1-1914のdayの数列のうち, 売上が存在しない日を一旦0にし、0を9999に置換。そのうえでminimum numberを計算
    df_tmp = ((sales_train_val>0) * np.tile(np.arange(1,1914),(weight_mat_csr.shape[0],1)))

    start_no = np.min(np.where(df_tmp==0,9999,df_tmp),axis=1)-1

    flag = np.dot(np.diag(1/(start_no+1)) , np.tile(np.arange(1,1914),(weight_mat_csr.shape[0],1)))<1

    sales_train_val = np.where(flag,np.nan,sales_train_val)

    # denominator of RMSSE / RMSSEの分母
    weight1 = np.nansum(np.diff(sales_train_val,axis=1)**2,axis=1)/(1913-start_no)

    # calculate the sales amount for each item/level
    df_tmp = df_feat[(df_feat['date'] > '2016-03-27') & (df_feat['date'] <= '2016-04-24')]
    df_tmp['amount'] = df_tmp['demand'] * df_tmp['sell_price']
    df_tmp =df_tmp.groupby(['id'])['amount'].apply(np.sum)
    df_tmp = df_tmp[product.id].values
    
    weight2 = weight_mat_csr * df_tmp 

    weight2 = weight2/np.sum(weight2)

    del sales_train_val
    gc.collect()
    
    return weight1, weight2



def wrmsse(preds, data):
    
    # this function is calculate for last 28 days to consider the non-zero demand period
    
    # actual obserbed values / 正解ラベル
    y_true = data.get_label()
    
    score = wrmsse_score(preds, y_true)
    
    return 'wrmsse', score, False

def wrmsse_score(preds, y_true):
    y_true = y_true[-(NUM_ITEMS * DAYS_PRED):]
    preds = preds[-(NUM_ITEMS * DAYS_PRED):]
    # number of columns
    num_col = DAYS_PRED
    
    # reshape data to original array((NUM_ITEMS*num_col,1)->(NUM_ITEMS, num_col) ) / 推論の結果が 1 次元の配列になっているので直す
    reshaped_preds = preds.reshape(num_col, NUM_ITEMS).T
    reshaped_true = y_true.reshape(num_col, NUM_ITEMS).T
    
          
    train = weight_mat_csr*np.c_[reshaped_preds, reshaped_true]
    
    score = np.sum(
                np.sqrt(
                    np.mean(
                        np.square(
                            train[:,:num_col] - train[:,num_col:])
                        ,axis=1) / weight1) * weight2)
    return score
    

def wrmsse_simple(preds, data):
    
    # actual obserbed values / 正解ラベル
    y_true = data.get_label()
    
    y_true = y_true[-(NUM_ITEMS * DAYS_PRED):]
    preds = preds[-(NUM_ITEMS * DAYS_PRED):]
    # number of columns
    num_col = DAYS_PRED
    
    # reshape data to original array((NUM_ITEMS*num_col,1)->(NUM_ITEMS, num_col) ) / 推論の結果が 1 次元の配列になっているので直す
    reshaped_preds = preds.reshape(num_col, NUM_ITEMS).T
    reshaped_true = y_true.reshape(num_col, NUM_ITEMS).T
          
    train = np.c_[reshaped_preds, reshaped_true]
    
    weight2_2 = weight2[:NUM_ITEMS]
    weight2_2 = weight2_2/np.sum(weight2_2)
    
    score = np.sum(
                np.sqrt(
                    np.mean(
                        np.square(
                            train[:,:num_col] - train[:,num_col:])
                        ,axis=1) /  weight1[:NUM_ITEMS])*weight2_2)
    
    return 'wrmsse', score, False

def weight_mat_csr_(product):
    weight_mat = np.c_[np.ones([NUM_ITEMS,1]).astype(np.int8), # level 1
                       pd.get_dummies(product.state_id.astype(str),drop_first=False).astype('int8').values,
                       pd.get_dummies(product.store_id.astype(str),drop_first=False).astype('int8').values,
                       pd.get_dummies(product.cat_id.astype(str),drop_first=False).astype('int8').values,
                       pd.get_dummies(product.dept_id.astype(str),drop_first=False).astype('int8').values,
                       pd.get_dummies(product.state_id.astype(str) + product.cat_id.astype(str),drop_first=False).astype('int8').values,
                       pd.get_dummies(product.state_id.astype(str) + product.dept_id.astype(str),drop_first=False).astype('int8').values,
                       pd.get_dummies(product.store_id.astype(str) + product.cat_id.astype(str),drop_first=False).astype('int8').values,
                       pd.get_dummies(product.store_id.astype(str) + product.dept_id.astype(str),drop_first=False).astype('int8').values,
                       pd.get_dummies(product.item_id.astype(str),drop_first=False).astype('int8').values,
                       pd.get_dummies(product.state_id.astype(str) + product.item_id.astype(str),drop_first=False).astype('int8').values,
                       np.identity(NUM_ITEMS).astype(np.int8) #item :level 12
                       ].T
    weight_mat_csr = csr_matrix(weight_mat)
    return weight_mat_csr

def submission_(test, submission, y_pred):
    test['demand'] = y_pred
    predictions = test[['id', 'date', 'demand']]
    predictions = pd.pivot(predictions, index = 'id', columns = 'date', values = 'demand').reset_index()
    predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    evaluation_rows = [row for row in submission['id'] if 'evaluation' in row] 
    evaluation = submission[submission['id'].isin(evaluation_rows)]
    validation = submission[['id']].merge(predictions, on = 'id')
    final = pd.concat([validation, evaluation])
#     final.to_csv('submission.csv', index = False)
    return final


def process(trial, df_feats, features=None, params=None):

    # going to evaluate with the last 28 days
    x_train = df_feats[df_feats['date'] <= '2016-03-27']
    y_train = x_train['demand']
    x_val = df_feats[(df_feats['date'] > '2016-03-27') & (df_feats['date'] <= '2016-04-24')]
    y_val = x_val['demand']
    test = df_feats[(df_feats['date'] > '2016-04-24')]

    if type(features) == type(None):
        features = x_train.columns.drop(['id','demand','part','date']).tolist()

    if type(params) == type(None):
        params = {
            'boosting_type': 'gbdt',
            'metric': 'custom',
            'objective': 'poisson',
            'n_jobs': -1,
            'seed': 236,
            'learning_rate': 0.1,
            'bagging_fraction': 0.75,
            'bagging_freq': 10, 
            'colsample_bytree': 0.75}
    
    train_set = lgb.Dataset(x_train[features], y_train)
    val_set = lgb.Dataset(x_val[features], y_val)

    # model estimation
    model = lgb.train(params, train_set, num_boost_round = 2500, early_stopping_rounds = 50, valid_sets = [train_set, val_set], verbose_eval = 100, feval= wrmsse)

    trn_pred = model.predict(x_train[features])
    val_pred = model.predict(x_val[features])
    y_pred = model.predict(test[features])

    val_rmse_score = np.sqrt(metrics.mean_squared_error(val_pred, y_val))
    val_wrmsse_score = wrmsse_score(val_pred, y_val.values)
    trn_rmse_score = np.sqrt(metrics.mean_squared_error(trn_pred, y_train))
    trn_wrmsse_score = wrmsse_score(trn_pred, y_train.values)

    df_feature_import = pd.DataFrame({'feature':model.feature_name(), 'importance':model.feature_importance()})
    df_feature_import.sort_values(by=['importance'], ascending=False, inplace=True)
    df_feature_import.reset_index(drop=True,inplace=True)

    trial.append({
        'params':params, 'val_rmse_score':val_rmse_score,
        'val_wrmsse_score':val_wrmsse_score, 'trn_rmse_score':trn_rmse_score,
        'trn_wrmsse_score':trn_wrmsse_score, 'y_pred':y_pred,'features':features,
        'feature_importance':df_feature_import,
    })
    
    return


In [4]:
data_path = '../input/m5-forecasting-accuracy'

In [5]:
calendar, sell_prices, sales_train_val, submission = read_data()
# 予測期間とitem数の定義 / number of items, and number of prediction period
NUM_ITEMS = sales_train_val.shape[0]  # 30490
DAYS_PRED = submission.shape[1] - 1  # 28

# calendar = encode_categorical(calendar, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]).pipe(reduce_mem_usage)

for col in ['event_name_1', 'event_type_1','event_name_2','event_type_2']:
    calendar = pd.concat([calendar, pd.get_dummies(calendar[col],  prefix=''.join([t[0] for t in col.split('_')]))],axis=1)
    calendar.drop([col], axis=1, inplace=True)
calendar.columns = [''.join([c for c in col if c.isalnum() or c == '_']) for col in calendar.columns]

sales_train_val = encode_categorical(sales_train_val, ["item_id", "dept_id", "cat_id", "store_id", "state_id"]).pipe(reduce_mem_usage)
sell_prices = encode_categorical(sell_prices, ["item_id", "store_id"]).pipe(reduce_mem_usage)
df_data, product = prepare_data(sales_train_val, submission, calendar, sell_prices)

Mem. usage decreased to  0.12 Mb (41.9% reduction)
Mem. usage decreased to 95.00 Mb (78.7% reduction)
Mem. usage decreased to  2.09 Mb (84.5% reduction)
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Mem. usage decreased to 94.01 Mb (0.4% reduction)
Mem. usage decreased to 45.67 Mb (41.7% reduction)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [92]:
def fft_peak(series, top_no):
    fft_amp = np.abs(np.fft.fft(series))/series.shape[0]*2
    sorted_fft_amp_idx = np.argsort(fft_amp)
    return fft_amp[sorted_fft_amp_idx[-top_no]]

def fft_peak_freq(series, top_no):
    fft_amp = np.abs(np.fft.fft(series))/series.shape[0]*2
    sorted_fft_amp_idx = np.argsort(fft_amp)
    return sorted_fft_amp_idx[-top_no]

In [129]:
def simple_fe(data):
    data = data.copy()
    stopwatch = [time.time()]
    # demand features(過去の数量から変数生成)
    for diff in [0, 1, 2]:
        shift = DAYS_PRED + diff
        data[f"shift_t{shift}"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(shift))
    stopwatch.append(time.time())
    print('lag',stopwatch[-1]-stopwatch[-2])

    for size in [7, 30, 60, 90, 180]:
        data[f"rolling_mean_t{size}"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).mean())
        data[f"rolling_std_t{size}"] = data.groupby(["id"])["demand"].transform(lambda x: x.shift(DAYS_PRED).rolling(size).std())
    stopwatch.append(time.time())
    print('rolling', stopwatch[-1]-stopwatch[-2])

    for size in [7, 30, 60, 90, 180]:
        data[f"sum_latest{size}_demand"] = data.groupby(["id"])["demand"].transform(lambda x: np.sum(x.shift(DAYS_PRED).dropna()[-size:]))
    stopwatch.append(time.time())
    print('sum latest', stopwatch[-1]-stopwatch[-2])
    
    for top_no_ in range(1,4):
        data[f'top{top_no_}_amp']=data.groupby(["id"])["demand"].transform(lambda x: fft_peak(x.shift(DAYS_PRED).dropna(),top_no_))
    stopwatch.append(time.time())
    print('top peak', stopwatch[-1]-stopwatch[-2])
    for top_no_ in range(1,4):
        data[f'top{top_no_}_amp_freq']=data.groupby(["id"])["demand"].transform(lambda x: fft_peak_freq(x.shift(DAYS_PRED).dropna(),top_no_))
    stopwatch.append(time.time())
    print('top peak freq', stopwatch[-1]-stopwatch[-2])
    
    # price features
    # priceの動きと特徴量化（価格の変化率、過去1年間の最大価格との比など）
    data["shift_price_t1"] = data.groupby(["id"])["sell_price"].transform(lambda x: x.shift(1))
    data["price_change_t1"] = (data["shift_price_t1"] - data["sell_price"]) / (data["shift_price_t1"])
    data["rolling_price_max_t365"] = data.groupby(["id"])["sell_price"].transform(lambda x: x.shift(1).rolling(365).max())
    data["price_change_t365"] = (data["rolling_price_max_t365"] - data["sell_price"]) / (data["rolling_price_max_t365"])
    data["rolling_price_std_t7"] = data.groupby(["id"])["sell_price"].transform(lambda x: x.rolling(7).std())
    data["rolling_price_std_t30"] = data.groupby(["id"])["sell_price"].transform(lambda x: x.rolling(30).std())
    stopwatch.append(time.time())
    print('price', stopwatch[-1]-stopwatch[-2])
    
    # time features
    # 日付に関するデータ
    dt_col = "date"
    data[dt_col] = pd.to_datetime(data[dt_col])
    attrs = [
        "year",
        "quarter",
        "month",
        "week",
        "day",
        "dayofweek",
        "is_year_end",
        "is_year_start",
        "is_quarter_end",
        "is_quarter_start",
        "is_month_end",
        "is_month_start",
    ]
    for attr in attrs:
        dtype = np.int16 if attr == "year" else np.int8
        data[attr] = getattr(data[dt_col].dt, attr).astype(dtype)
    data["is_weekend"] = data["dayofweek"].isin([5, 6]).astype(np.int8)
    stopwatch.append(time.time())
    print('datetime', stopwatch[-1]-stopwatch[-2])
    return data

df_feats = simple_fe(df_data)
df_feats = reduce_mem_usage(df_feats)
weight_mat_csr = weight_mat_csr_(product)
weight1, weight2 = weight_calc(df_feats, product)

lag 42.63405108451843
rolling 216.5279619693756
sum latest 214.61716890335083
top peak 122.67562246322632
top peak freq 123.13594770431519
price 73.8044273853302
datetime 14.61186146736145
Mem. usage decreased to 3466.55 Mb (48.2% reduction)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [130]:
lgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.5,
                    'subsample_freq': 1,
                    'learning_rate': 0.03,
                    'num_leaves': 2**11-1,
                    'min_data_in_leaf': 2**12-1,
                    'feature_fraction': 0.5,
                    'max_bin': 100,
                    'n_estimators': 1400,
                    'boost_from_average': False,
#                     'verbose': -1,
                } 

In [131]:
topN_columns = ['year',
 'en1_Thanksgiving',
 'sell_price',
 'store_id',
 'snap_TX',
 'is_month_end',
 'price_change_t1',
 'en1_MemorialDay',
 'top1_amp',
 'rolling_mean_t180',
 'et1_National',
 'is_year_end',
 'is_month_start',
 'wm_yr_wk',
 'rolling_mean_t7',
 'rolling_mean_t60',
 'is_quarter_start',
 'et1_Religious',
 'et1_Sporting',
 'dept_id',
 'top3_amp_freq',
 'week',
 'sum_latest30_demand',
 'rolling_std_t90',
 'et1_Cultural',
 'en1_LaborDay',
 'price_change_t365',
 'quarter',
 'en1_IndependenceDay',
 'shift_t30',
 'en1_Christmas',
 'state_id',
 'snap_CA',
 'en1_PresidentsDay',
 'rolling_mean_t90',
 'snap_WI',
 'shift_price_t1',
 'rolling_price_std_t7',
 'rolling_price_std_t30',
 'is_quarter_end',
 'rolling_std_t60',
 'rolling_std_t7',
 'shift_t29',
 'en1_SuperBowl',
 'day',
 'top2_amp_freq',
 'item_id',
 'sum_latest60_demand',
 'top3_amp',
 'rolling_mean_t30',
 'rolling_std_t180',
 'top2_amp',
 'sum_latest90_demand',
 'sum_latest7_demand',
 'rolling_price_max_t365',
 'month',
 'shift_t28',
 'sum_latest180_demand',
 'rolling_std_t30',
 'is_weekend',
 'en1_Mothersday',
 'dayofweek',
 'cat_id']

In [132]:
process(trial, df_feats, features=topN_columns, params=lgb_params)



Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 2.3058	training's wrmsse: 0.669598	valid_1's rmse: 2.09734	valid_1's wrmsse: 0.659146
[200]	training's rmse: 2.22132	training's wrmsse: 0.557263	valid_1's rmse: 2.05108	valid_1's wrmsse: 0.556102
[300]	training's rmse: 2.18253	training's wrmsse: 0.498003	valid_1's rmse: 2.03846	valid_1's wrmsse: 0.530158
Early stopping, best iteration is:
[310]	training's rmse: 2.17986	training's wrmsse: 0.492491	valid_1's rmse: 2.03766	valid_1's wrmsse: 0.528663


In [133]:
df_trial = pd.DataFrame(trial)

In [134]:
df_trial[['val_rmse_score','val_wrmsse_score','trn_rmse_score','trn_wrmsse_score']]

Unnamed: 0,val_rmse_score,val_wrmsse_score,trn_rmse_score,trn_wrmsse_score
0,2.022183,0.521345,2.256584,0.526063
1,2.085802,0.540371,2.270014,0.616651
2,2.020039,0.542924,2.257233,0.512387
3,2.024083,0.514462,2.249112,0.521665
4,2.022183,0.521345,2.256584,0.526063
5,2.036266,0.521296,2.193072,0.494677
6,2.047,0.528371,2.18624,0.49724
7,2.03929,0.541267,2.180233,0.482441
8,2.040266,0.533232,2.173968,0.482171
9,2.040074,0.538082,2.177678,0.478772


In [136]:
sub = submission_(df_feats[(df_feats['date'] > '2016-04-24')], submission, df_trial.iloc[-1]['y_pred'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [137]:
sub.to_csv('submission.csv', index=False)