In [1]:
# Model parameters ("random" in the author's note — presumably found by random search; TODO confirm).
# Author's note: ~0.71179 (best variant) — presumably the resulting score; TODO confirm.
# NOTE(review): run_lgb never reads this module-level dict; it builds its own defaults.
params = {
    # task / booster
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'verbosity': -1,
    # tree shape
    'max_depth': -1,
    'min_data_in_leaf': 106,
    'min_child_weight': 0.03454472573214212,
    # learning rate and regularisation
    'learning_rate': 0.006883242363721497,
    'reg_alpha': 0.3899927210061127,
    'reg_lambda': 0.6485237330340494,
    # row / column sampling
    'feature_fraction': 0.3797454081646243,
    'bagging_fraction': 0.4181193142567742,
    # seeds
    'bagging_seed': 11,
    'random_state': 47,
}
# Initial variant ~45
#     params = {
#         'metric': 'rmse',
#         'objective': 'poisson',
#         'seed': 200,
#         'force_row_wise' : True,
#         'learning_rate' : 0.1,
#         'lambda': 0.1,
#         'num_leaves': 45,
#         'sub_row' : 0.7,
#         'bagging_freq' : 1,
#         'colsample_bytree': 0.80
#     }
# def lgbm_val(X_train, X_val, y_train, y_val):
#     # create dataset
#     train = lgb.Dataset(X_train, label = y_train)
#     valid = lgb.Dataset(X_val, label = y_val)
    
#     # fitting
#     lgbm = lgb.train(params, 
#                     train, 
#                     num_boost_round = 300, 
#                     valid_sets = [valid], 
#                     early_stopping_rounds = 20,
#                     verbose_eval = 5)
    
#     # plot feature importance by gain
#     lgb.plot_importance(lgbm, importance_type="gain", precision=0, figsize=(6, 10));
    
#     # MSE and R2 prediction
#     pred_train = lgbm.predict(X_train)
#     pred_val = lgbm.predict(X_val)
    
#     return pred_train, pred_val, lgbm

In [2]:
# Import libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import preprocessing, metrics
import gc
import os


In [3]:
# List every file available under the Kaggle input directory
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

/kaggle/input/m5-forecasting-accuracy/sample_submission.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv
/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv


In [4]:
# Read the raw competition tables.
# The input directory is named once so it can be changed in a single place.
INPUT_DIR = '/kaggle/input/m5-forecasting-accuracy'
calendar = pd.read_csv(f'{INPUT_DIR}/calendar.csv')
sell_prices = pd.read_csv(f'{INPUT_DIR}/sell_prices.csv')
sales_train_validation = pd.read_csv(f'{INPUT_DIR}/sales_train_validation.csv')
submission = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')


In [5]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    touched; every other column (strings, int8, unsigned ints, ...) is left as is.
    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    Float columns are narrowed based on their value *range* only, so a column
    that fits the float16 range is stored as float16 and loses precision
    (float16 keeps roughly three significant decimal digits).
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type).startswith('int'):
            # Bounds are inclusive: a value equal to the dtype limit is still
            # representable, so it must not force the next wider type.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (e.g. all-NaN column).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df




In [6]:
# Memory optimisation: downcast the calendar first, then the price table
calendar, sell_prices = reduce_mem_usage(calendar), reduce_mem_usage(sell_prices)

Mem. usage decreased to  0.12 Mb (41.9% reduction)
Mem. usage decreased to 130.48 Mb (37.5% reduction)


In [7]:
# Data preparation
def melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 55000000, merge = False):
    """Build the long-format modelling table from the raw M5 frames.

    Steps:
      1. Melt the wide sales table into one row per (id, day) with a ``demand`` column.
      2. Turn the ``validation`` rows of the submission template into the same
         long format for days d_1914..d_1941 and tag them ``part == 'test1'``.
      3. Stack train and test1, skip the first ``nrows`` rows, then left-join
         the calendar (on day) and the sell prices (on store, item, week).

    Parameters
    ----------
    calendar, sell_prices, sales_train_validation, submission : pandas.DataFrame
        Raw input tables. ``submission`` must have an ``id`` column followed by
        28 forecast columns.
    nrows : int, default 55000000
        Number of leading rows (by position) of the stacked table to skip.
    merge : bool, default False
        Currently unused: the calendar and price joins always run. Kept so
        existing calls that pass it continue to work.

    Returns
    -------
    pandas.DataFrame
        The merged long-format table with ``part`` set to 'train' or 'test1'.
    """
    id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # Wide -> long: one row per (series, day)
    sales_train_validation = pd.melt(sales_train_validation, id_vars = id_cols, var_name = 'day', value_name = 'demand')

    # Downcast numeric columns
    sales_train_validation = reduce_mem_usage(sales_train_validation)

    # Validation rows of the submission template. Copy before renaming so the
    # caller's `submission` frame is not touched through a slice.
    # The 'evaluation' rows are not materialised: they carry no product
    # attributes and were always dropped from the result.
    test1 = submission[submission['id'].str.contains('validation', regex = False, na = False)].copy()

    # Rename F1..F28 to the day labels that follow the training period
    test1.columns = ['id'] + ['d_{}'.format(day) for day in range(1914, 1942)]

    # Unique products with their hierarchy attributes
    product = sales_train_validation[id_cols].drop_duplicates()

    # Attach product attributes, then melt to long format
    test1 = test1.merge(product, how = 'left', on = 'id')
    test1 = pd.melt(test1, id_vars = id_cols, var_name = 'day', value_name = 'demand')

    sales_train_validation['part'] = 'train'
    test1['part'] = 'test1'

    # ignore_index gives a unique 0..N-1 index, so the positional slice below
    # is well defined for any nrows (duplicate labels made .loc[nrows:] raise
    # KeyError whenever nrows was smaller than the test block).
    data = pd.concat([sales_train_validation, test1], axis = 0, ignore_index = True)

    del sales_train_validation, test1

    data = data.iloc[nrows:]

    # Join the calendar on the day label (memory heavy on the full data set)
    data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
    data = data.drop('d', axis = 1)
    # Join weekly prices per store / item
    data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
    print('Final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

    return data
        

In [8]:
# Build the modelling table, skipping the first 45M stacked rows
data = melt_and_merge(
    calendar=calendar,
    sell_prices=sell_prices,
    sales_train_validation=sales_train_validation,
    submission=submission,
    nrows=45_000_000,
    merge=True,
)
data.sample(5)

Mem. usage decreased to 3226.27 Mb (9.4% reduction)
Final dataset to train has 14181090 rows and 23 columns


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part,date,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
347943,HOBBIES_1_171_CA_4_validation,HOBBIES_1_171,HOBBIES_1,HOBBIES,CA_4,CA,d_1488,1,train,2015-02-24,...,2,2015,,,,,0,0,0,2.880859
3043447,HOBBIES_1_363_WI_1_validation,HOBBIES_1_363,HOBBIES_1,HOBBIES,WI_1,WI,d_1576,1,train,2015-05-23,...,5,2015,,,,,0,0,0,2.880859
5534707,HOUSEHOLD_1_017_TX_1_validation,HOUSEHOLD_1_017,HOUSEHOLD_1,HOUSEHOLD,TX_1,TX,d_1658,0,train,2015-08-13,...,8,2015,,,,,0,1,0,4.941406
13552499,FOODS_3_328_CA_4_validation,FOODS_3_328,FOODS_3,FOODS,CA_4,CA,d_1921,0,test1,2016-05-02,...,5,2016,,,,,1,0,1,3.880859
13302364,FOODS_3_211_CA_2_validation,FOODS_3_211,FOODS_3,FOODS,CA_2,CA,d_1913,3,train,2016-04-24,...,4,2016,,,,,0,0,0,4.578125


In [9]:
# Prepare the table for training
def transform(data):
    """Fill missing event labels and label-encode the categorical columns.

    The four event columns get 'unknown' in place of NaN, a new ``id_encode``
    column holds the label-encoded ``id``, and every column listed in ``cat``
    is replaced by its integer label encoding. ``data`` is modified in place
    and returned.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the result back: `data[feature].fillna(..., inplace=True)` acts
        # on a temporary Series and does not update `data` under Copy-on-Write.
        data[feature] = data[feature].fillna('unknown')

    data['id_encode'] = preprocessing.LabelEncoder().fit_transform(data['id'])

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'day', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        # A fresh encoder per column; the fitted encoders are not kept.
        data[feature] = preprocessing.LabelEncoder().fit_transform(data[feature])

    return data



def simple_fe(data):
    """Add per-series lag and rolling-window demand features.

    All features are computed within each ``id`` group, in the row order of
    ``data`` (rows of a series are assumed to already be in chronological
    order — TODO confirm against the caller). Rolling statistics are taken
    over demand shifted by 28 rows, so every feature uses values at least
    28 rows in the past.

    Adds, in this order: ``lag_t28``, ``lag_t29``, ``lag_t30``,
    ``rolling_{mean,sum,std,min,max}_t7`` and ``rolling_mean_t{30,90,180}``.
    ``data`` is modified in place and returned.
    """
    # Build the grouping once instead of once per feature.
    demand_by_id = data.groupby(['id'])['demand']

    def shifted_rolling(window, stat):
        # Rolling `stat` over `window` rows of the 28-row-shifted demand.
        return demand_by_id.transform(lambda s: getattr(s.shift(28).rolling(window), stat)())

    new_columns = {}
    for lag in (28, 29, 30):
        new_columns['lag_t{}'.format(lag)] = demand_by_id.shift(lag)
    for stat in ('mean', 'sum', 'std', 'min', 'max'):
        new_columns['rolling_{}_t7'.format(stat)] = shifted_rolling(7, stat)
    for window in (30, 90, 180):
        new_columns['rolling_mean_t{}'.format(window)] = shifted_rolling(window, 'mean')

    # Attach after everything is computed so column order matches the list above.
    for name, values in new_columns.items():
        data[name] = values

    return data

# Write the 28-day forecast to a submission CSV
def predict(test, submission, output_path = 'submission.csv'):
    """Reshape long-format predictions into the submission layout and save them.

    Parameters
    ----------
    test : pandas.DataFrame
        Long-format predictions with ``id``, ``date`` and ``demand`` columns,
        covering exactly 28 distinct dates.
    submission : pandas.DataFrame
        Submission template with an ``id`` column plus forecast columns.
    output_path : str, default 'submission.csv'
        Where the CSV is written.

    The output contains the template's ids that have predictions (in template
    order, columns F1..F28), followed by the template's 'evaluation' rows
    unchanged.
    """
    horizon = 28

    # One row per id, one column per date (pivot sorts the date columns)
    predictions = pd.pivot(test[['id', 'date', 'demand']], index = 'id', columns = 'date', values = 'demand').reset_index()
    # Raises if the number of dates differs from the horizon
    predictions.columns = ['id'] + ['F' + str(day) for day in range(1, horizon + 1)]

    # Evaluation rows are passed through from the template as they are
    evaluation = submission[submission['id'].str.contains('evaluation', regex = False, na = False)]

    # Inner join keeps template order and only ids that were predicted
    validation = submission[['id']].merge(predictions, on = 'id')
    final = pd.concat([validation, evaluation])
    final.to_csv(output_path, index = False)

# Model input columns, in the order they are passed to LightGBM
features = (
    # product / store hierarchy
    ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    # calendar
    + ['day', 'wday', 'month']
    + ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    + ['snap_CA', 'snap_TX', 'snap_WI']
    # price
    + ['sell_price']
    # lagged demand
    + ['lag_t28', 'lag_t29', 'lag_t30']
    # rolling statistics of lagged demand
    + ['rolling_mean_t7', 'rolling_sum_t7', 'rolling_std_t7', 'rolling_min_t7', 'rolling_max_t7']
    + ['rolling_mean_t30', 'rolling_mean_t90', 'rolling_mean_t180']
)





In [10]:
# Parse dates, then encode categoricals and add lag / rolling features
data['date'] = pd.to_datetime(data['date'])
data = simple_fe(transform(data))
data.sample(5)
    

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,part,date,...,lag_t29,lag_t30,rolling_mean_t7,rolling_sum_t7,rolling_std_t7,rolling_min_t7,rolling_max_t7,rolling_mean_t30,rolling_mean_t90,rolling_mean_t180
9231081,HOUSEHOLD_2_472_TX_3_validation,3004,6,2,6,1,303,1,train,2015-12-12,...,0.0,0.0,0.142857,1.0,0.377964,0.0,1.0,0.433333,0.311111,0.205556
9849216,HOUSEHOLD_1_195_WI_3_validation,2192,5,2,9,2,323,1,train,2016-01-01,...,0.0,0.0,0.142857,1.0,0.377964,0.0,1.0,0.1,0.177778,0.15
1324722,HOUSEHOLD_2_169_CA_4_validation,2702,6,2,3,0,44,0,train,2015-03-28,...,0.0,1.0,0.428571,3.0,0.534522,0.0,1.0,,,
9479811,HOBBIES_1_287_WI_2_validation,1716,3,1,8,2,311,0,train,2015-12-20,...,0.0,0.0,0.285714,2.0,0.48795,0.0,1.0,0.233333,0.4,0.377778
10809294,HOBBIES_1_407_TX_1_validation,1835,3,1,4,1,355,0,train,2016-02-02,...,0.0,2.0,0.428571,3.0,0.786796,0.0,2.0,0.4,0.622222,0.622222


In [24]:
def run_lgb(data, params = None):
    """Train LightGBM on a date-based split and predict the test period.

    Split by ``data['date']``:
      * train:      date <= 2016-03-27
      * validation: 2016-03-27 < date <= 2016-04-24
      * test:       date > 2016-04-24

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``date``, ``demand`` and every column named in the
        module-level ``features`` list.
    params : dict, optional
        LightGBM parameters. When omitted, the defaults defined below are used;
        when given, the dict is passed to LightGBM as is.

    Returns
    -------
    pandas.DataFrame
        A copy of the test rows with ``demand`` replaced by the predictions.
    """
    train_mask = data['date'] <= '2016-03-27'
    val_mask = (data['date'] > '2016-03-27') & (data['date'] <= '2016-04-24')

    x_train = data[train_mask]
    y_train = x_train['demand']
    x_val = data[val_mask]
    y_val = x_val['demand']
    # Copy so writing predictions below never targets a slice of `data`.
    test = data[data['date'] > '2016-04-24'].copy()

    if params is None:
        # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq
        # is non-zero; bagging_freq is not set here. Left unchanged so results
        # stay the same — confirm whether bagging was intended.
        params = {
            'min_child_weight': 0.03,
            'feature_fraction': 0.37,
            'bagging_fraction': 0.41,
            'min_data_in_leaf': 100,
            'objective': 'regression',
            'max_depth': -1,
            'learning_rate': 0.06,
            "boosting_type": "gbdt",
            "bagging_seed": 11,
            "metric": 'rmse',
            "verbosity": -1,
            'reg_alpha': 0.2,
            'reg_lambda': 0.5,
            'random_state': 47
        }

    train_set = lgb.Dataset(x_train[features], y_train)
    val_set = lgb.Dataset(x_val[features], y_val)

    # Early stopping (200 rounds) and logging (every 100 rounds). Newer
    # LightGBM releases take these as callbacks and no longer accept the
    # keyword arguments; older releases only know the keyword arguments.
    if hasattr(lgb, 'early_stopping') and hasattr(lgb, 'log_evaluation'):
        train_kwargs = {'callbacks': [lgb.early_stopping(stopping_rounds = 200), lgb.log_evaluation(period = 100)]}
    else:
        train_kwargs = {'early_stopping_rounds': 200, 'verbose_eval': 100}

    model = lgb.train(params, train_set, num_boost_round = 2500, valid_sets = [train_set, val_set], **train_kwargs)

    val_pred = model.predict(x_val[features])
    # Arguments in (y_true, y_pred) order; the value is the same either way.
    val_score = np.sqrt(metrics.mean_squared_error(y_val, val_pred))
    print(f'Our val rmse score is {val_score}')

    test['demand'] = model.predict(test[features])
    return test

In [25]:
# Train, forecast the test period, and write the submission file
test = run_lgb(data=data)
predict(test=test, submission=submission)

Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 2.40745	valid_1's rmse: 2.18422
[200]	training's rmse: 2.35534	valid_1's rmse: 2.16595
[300]	training's rmse: 2.33049	valid_1's rmse: 2.17209
Early stopping, best iteration is:
[205]	training's rmse: 2.35389	valid_1's rmse: 2.1657
Our val rmse score is 2.1657041289541863
