In [13]:
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from itertools import product
import gc
import optuna

In [14]:
def read_data(path: str, files: list):
    """Load several CSV files that live under a common path prefix.

    Args:
        path: Prefix prepended verbatim to every file name (plain string
            concatenation, so it must already end with a path separator).
        files: File names to read, in the order the frames should be returned.

    Returns:
        A list of DataFrames, one per entry of ``files``, in the same order.
    """
    return [pd.read_csv(path + file_name) for file_name in files]


# Location of the competition CSV files; read_data concatenates this prefix
# with each file name, so the trailing slash is required.
path = '../input/competitive-data-science-predict-future-sales/'
files = [
    'sales_train.csv',
    'items.csv',
    'shops.csv',
    'item_categories.csv',
    'test.csv',
]
# Unpacking order must match the order of `files`.
sales_train, items, shops, item_categories, test = read_data(path=path, files=files)
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [15]:
def remove_outliers(df: pd.DataFrame, max_price: int, max_cnt: int):
    """Keep only rows whose price and daily count lie in an open interval.

    A row survives when ``0 < item_price < max_price`` and
    ``0 < item_cnt_day < max_cnt`` (all bounds exclusive), so rows with
    non-positive prices or counts are discarded as well.

    Args:
        df: Frame with ``item_price`` and ``item_cnt_day`` columns.
        max_price: Exclusive upper bound for ``item_price``.
        max_cnt: Exclusive upper bound for ``item_cnt_day``.

    Returns:
        The filtered frame; ``df`` itself is not modified.
    """
    price = df['item_price']
    count = df['item_cnt_day']
    price_ok = (price > 0) & (price < max_price)
    count_ok = (count > 0) & (count < max_cnt)
    return df[price_ok & count_ok]


# Drop rows with non-positive or extreme prices / daily counts (bounds exclusive).
sales_train = remove_outliers(sales_train, max_price=50000, max_cnt=1000)

In [16]:
def remove_shop_duplicates(df_train: pd.DataFrame, df_test: pd.DataFrame, shop_dups: dict):
    """Rewrite duplicate shop ids to their canonical id, in place.

    Args:
        df_train: Frame with a ``shop_id`` column; modified in place.
        df_test: Frame with a ``shop_id`` column; modified in place.
        shop_dups: Mapping ``duplicate_id -> canonical_id``. Entries are
            applied one after another in dictionary order.

    Returns:
        None. Both frames are updated in place.
    """
    for frame in (df_train, df_test):
        for duplicate_id, canonical_id in shop_dups.items():
            is_duplicate = frame['shop_id'] == duplicate_id
            frame.loc[is_duplicate, 'shop_id'] = canonical_id
        

# duplicate shop id -> shop id it is merged into
shop_dups = {
    0: 57,
    1: 58,
    10: 11,
    39: 40,
}
remove_shop_duplicates(df_train=sales_train, df_test=test, shop_dups=shop_dups)

In [17]:
def create_city_feature(shops_df: pd.DataFrame, corrections: dict):
    """Add an integer-encoded ``city`` column to ``shops_df``, in place.

    The city token is the first match of ``(\\w+\\.*\\w*)`` in ``shop_name``:
    a run of word characters, optionally followed by dots and more word
    characters. Tokens listed in ``corrections`` are then replaced by their
    corrected name, and the resulting strings are label-encoded.

    Args:
        shops_df: Frame with a ``shop_name`` column; receives the ``city`` column.
        corrections: Mapping ``extracted_token -> corrected_city_name``,
            applied one entry after another.

    Returns:
        None. ``shops_df`` is modified in place.
    """
    # expand=False yields a Series for the single capture group.
    shops_df['city'] = shops_df['shop_name'].str.extract(r'(\w+\.*\w*)', expand=False)
    for city1, city2 in corrections.items():
        shops_df.loc[shops_df['city'] == city1, 'city'] = city2
    # Encode the frame that was passed in. The previous version read the
    # module-level `shops` here, so it only worked when called on that global.
    shops_df['city'] = LabelEncoder().fit_transform(shops_df['city'])
    
    
# extracted first token -> city name to use instead
corrections = {
    'Выездная': 'Выездная торговля',
    'РостовНаДону': 'Ростов-На-Дону',
    'Сергиев': 'Сергиев Посад',
    'Цифровой': 'Интернет',
}
create_city_feature(shops_df=shops, corrections=corrections)
shops.head()

Unnamed: 0,shop_name,shop_id,city
0,"!Якутск Орджоникидзе, 56 фран",0,28
1,"!Якутск ТЦ ""Центральный"" фран",1,28
2,"Адыгея ТЦ ""Мега""",2,0
3,"Балашиха ТРК ""Октябрь-Киномир""",3,1
4,"Волжский ТЦ ""Волга Молл""",4,2


In [19]:
def create_train(month_range: list, all_shops: list, all_items: list):
    """Build the full (month, shop, item) grid as a DataFrame.

    Rows are the Cartesian product of the three inputs, in the same order as
    ``itertools.product(month_range, all_shops, all_items)``: item varies
    fastest, then shop, then month.

    The grid is built with ``pd.MultiIndex.from_product`` instead of
    materialising every combination as a Python tuple first, which keeps the
    construction vectorised.

    Args:
        month_range: Month indices (``date_block_num`` values).
        all_shops: Shop ids.
        all_items: Item ids.

    Returns:
        DataFrame with columns ``date_block_num``, ``shop_id``, ``item_id``
        and a default integer index.
    """
    grid = pd.MultiIndex.from_product(
        [month_range, all_shops, all_items],
        names=['date_block_num', 'shop_id', 'item_id'],
    )
    return grid.to_frame(index=False)


# Full grid over every month, shop and item seen in the cleaned sales data.
train = create_train(
    month_range=sales_train['date_block_num'].unique(),
    all_shops=sales_train['shop_id'].unique(),
    all_items=sales_train['item_id'].unique(),
)
train

Unnamed: 0,date_block_num,shop_id,item_id
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564
...,...,...,...
41511003,33,36,8428
41511004,33,36,7903
41511005,33,36,7610
41511006,33,36,7635


In [20]:
def aggregate_on_month_level(train_df: pd.DataFrame, sales_train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Aggregate daily sales to monthly rows and append the test rows as month 34.

    Steps:
      1. Group ``sales_train_df`` by (month, shop, item): sum ``item_cnt_day``
         into ``item_cnt_month`` and average ``item_price`` into
         ``item_price_mean``.
      2. Left-join ``test_df`` with the month-33 aggregates on (shop, item),
         fill missing values with 0 and stamp the rows with month 34. The test
         rows therefore carry month 33's count/price where available.
      3. Stack both parts, fill remaining NaNs with 0 and clip
         ``item_cnt_month`` to [0, 20].

    Args:
        train_df: Currently unused; kept so existing callers keep working.
        sales_train_df: Daily sales with ``date_block_num``, ``shop_id``,
            ``item_id``, ``item_cnt_day`` and ``item_price`` columns.
        test_df: Test pairs with ``ID``, ``shop_id`` and ``item_id`` columns.

    Returns:
        DataFrame of monthly rows followed by the test rows (without ``ID``),
        with a fresh integer index.
    """
    key_cols = ['date_block_num', 'shop_id', 'item_id']
    monthly = (
        sales_train_df
        .groupby(key_cols)
        .agg({'item_cnt_day': 'sum', 'item_price': 'mean'})
        .reset_index()
        .rename(columns={'item_cnt_day': 'item_cnt_month', 'item_price': 'item_price_mean'})
    )

    last_train_month = monthly[monthly['date_block_num'] == 33]
    test_rows = test_df.merge(last_train_month, on=['shop_id', 'item_id'], how='left')
    test_rows = test_rows.fillna(0)
    test_rows['date_block_num'] = 34

    # `keys=` was dropped: it had three entries for two frames and plays no
    # role in the result when ignore_index=True rebuilds the row index.
    all_data = pd.concat([monthly, test_rows.drop(columns='ID')], ignore_index=True)
    all_data = all_data.fillna(0)
    all_data['item_cnt_month'] = all_data['item_cnt_month'].clip(0, 20)
    return all_data


# Monthly aggregates for the training months plus the test pairs as month 34.
all_data = aggregate_on_month_level(train_df=train, sales_train_df=sales_train, test_df=test)

In [22]:
def add_lag_features(lag_months: list, month_range: range, data: pd.DataFrame = None):
    """Attach lagged count/price features to every row of the given months.

    For each month in ``month_range`` and each lag ``k`` in ``lag_months`` the
    rows of that month receive four extra columns taken from month ``month - k``:

      * ``item_cnt_last_k`` / ``item_price_last_k`` — the same (shop, item)
        pair's ``item_cnt_month`` / ``item_price_mean``;
      * ``item_cnt_last_k_item`` / ``item_price_last_k_item`` — the item's
        count summed and price averaged over all shops.

    Missing lags are filled with 0.

    Args:
        lag_months: Lags (in months) to compute.
        month_range: Months whose rows are kept and enriched.
        data: Monthly frame with ``date_block_num``, ``shop_id``, ``item_id``,
            ``item_cnt_month`` and ``item_price_mean``. Defaults to the
            module-level ``all_data`` so existing two-argument calls behave
            as before.

    Returns:
        DataFrame with the rows of the requested months (in ``month_range``
        order) plus the lag columns; empty if ``month_range`` is empty.
    """
    if data is None:
        data = all_data

    monthly_frames = []
    for month in month_range:
        cur_month = data[data.date_block_num == month]
        for lag in lag_months:
            # Select the lagged month once; both feature groups derive from it.
            lagged = data[data.date_block_num == month - lag]

            shop_item_lag = lagged.drop(columns='date_block_num').rename(
                {'item_cnt_month': 'item_cnt_last_{}'.format(lag),
                 'item_price_mean': 'item_price_last_{}'.format(lag)}, axis=1)

            item_lag = lagged.groupby(['item_id'], as_index=False).agg(
                {'item_cnt_month': 'sum', 'item_price_mean': 'mean'})
            item_lag = item_lag.rename(
                {'item_cnt_month': 'item_cnt_last_{}_item'.format(lag),
                 'item_price_mean': 'item_price_last_{}_item'.format(lag)}, axis=1)

            cur_month = cur_month.merge(shop_item_lag, how='left', on=['shop_id', 'item_id'])
            cur_month = cur_month.merge(item_lag, how='left', on=['item_id'])
        monthly_frames.append(cur_month)

    if not monthly_frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame every month.
    return pd.concat(monthly_frames, ignore_index=True).fillna(0)

lag_months = [1, 2, 3, 12]
# Start at the longest lag (12 here): for earlier months `month - lag` would be
# negative and that lag could never be filled.
# End at the largest month index actually present. nunique() only equals
# max() + 1 when the month indices are contiguous from 0.
month_range = range(max(lag_months), int(all_data['date_block_num'].max()) + 1)
all_data = add_lag_features(lag_months, month_range)

In [23]:
# Month index (date_block_num) of each item's first sale in the cleaned training data.
first_sale_month = sales_train.groupby('item_id')['date_block_num'].min()
# Look the value up through item_id explicitly. Assigning the grouped Series
# directly aligns it on `items`' row index, which is only correct if that
# index happens to equal item_id.
items['first_sale_date'] = items['item_id'].map(first_sale_month)
# Items without any training sale get 34, the month index given to the test rows.
items['first_sale_date'] = items['first_sale_date'].fillna(34)

all_data = all_data.merge(shops[['shop_id', 'city']], on='shop_id', how='left')
all_data = all_data.merge(items[['item_id', 'item_category_id', 'first_sale_date']], on='item_id', how='left')
# NOTE(review): after dropping item_category_name this merge presumably adds no
# new columns — confirm against item_categories.csv before removing it.
all_data = all_data.merge(item_categories.drop('item_category_name', axis=1), on='item_category_id', how='left')

# Flag rows whose month is the item's first month of sale.
all_data['new_item'] = all_data['first_sale_date'] == all_data['date_block_num']
all_data = all_data.drop('first_sale_date', axis=1)

all_data.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price_mean,item_cnt_last_1,item_price_last_1,item_cnt_last_1_item,item_price_last_1_item,item_cnt_last_2,...,item_price_last_3,item_cnt_last_3_item,item_price_last_3_item,item_cnt_last_12,item_price_last_12,item_cnt_last_12_item,item_price_last_12_item,city,item_category_id,new_item
0,12,2,32,1.0,119.0,0.0,0.0,90.0,146.372581,0.0,...,0.0,58.0,194.102564,0.0,0.0,242.0,335.042318,0,40,False
1,12,2,33,1.0,199.0,1.0,199.0,42.0,196.304348,2.0,...,0.0,33.0,194.758333,1.0,499.0,61.0,488.866667,0,37,False
2,12,2,99,1.0,149.0,0.0,0.0,24.0,123.132292,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,37,False
3,12,2,482,2.0,3300.0,1.0,3300.0,89.0,3207.011494,2.0,...,3300.0,131.0,3220.974026,1.0,3300.0,119.0,3051.536883,0,73,False
4,12,2,485,1.0,300.0,1.0,300.0,111.0,294.5,0.0,...,300.0,172.0,295.111111,0.0,0.0,0.0,0.0,0,73,False


In [24]:
# The lookup tables have been merged into all_data and the full grid is no
# longer referenced below, so drop the names and let the collector reclaim
# the memory. gc.collect() returns the number of unreachable objects found,
# which is what the cell displays.
del shops, items, item_categories, train
gc.collect()

46

In [25]:
def downcast(df, verbose=True):
    """Shrink numeric columns of ``df`` to the smallest suitable dtype, in place.

    Per column, decided on ``dtype.name``:
      * ``object`` columns are left untouched;
      * ``bool`` columns become ``int8``;
      * integer columns, and columns whose values all equal their rounded
        value, are downcast with ``pd.to_numeric(..., downcast='integer')``;
      * everything else is downcast with ``downcast='float'``.

    Args:
        df: Frame to compress; its columns are overwritten.
        verbose: When true, print the percentage of memory saved.

    Returns:
        The same ``df`` object, after modification.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb

    for name in df.columns:
        kind = df[name].dtype.name
        if kind == 'object':
            continue
        if kind == 'bool':
            df[name] = df[name].astype('int8')
            continue
        # The value comparison is only evaluated for non-integer dtypes.
        holds_whole_numbers = kind.startswith('int') or (df[name].round() == df[name]).all()
        target = 'integer' if holds_whole_numbers else 'float'
        df[name] = pd.to_numeric(df[name], downcast=target)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        saved_pct = 100 * (start_mem - end_mem) / start_mem
        print('{:.1f}% compressed'.format(saved_pct))

    return df

# Compress the feature table's dtypes before modelling (downcast prints the
# percentage saved), then collect whatever the conversion left unreachable.
all_data = downcast(all_data)
gc.collect()

66.8% compressed


0

In [27]:
class TimeSeriesCycle:
    """Tune, compare and refit regressors on a month-indexed feature table.

    The table must contain a ``date_block_num`` column (month index) and the
    target column ``item_cnt_month``; every other column is used as a feature.

    Workflow:
      1. ``evaluate_model`` tunes one model with Optuna using expanding-window
         validation, then refits it with the best parameters on all months
         before ``valid_border``.
      2. ``compare_models`` does that for every model and returns the one with
         the lowest metric on month ``valid_border``.
      3. ``get_predictions`` refits the winner on all months up to and
         including ``valid_border`` and predicts month ``test_border``.

    All predictions are clipped to [0, 20].

    Attributes are the constructor arguments stored under the same names, plus
    ``best_estimators``: the fitted models produced by the latest
    ``compare_models`` run, in ``models_with_params`` order.
    """

    TARGET = 'item_cnt_month'
    TIME_COLUMN = 'date_block_num'

    def __init__(self,
                 data: pd.DataFrame,
                 models_with_params: dict,
                 metric: object,
                 start_border: int,
                 train_border: int,
                 valid_border: int,
                 test_border: int,
                 cv_step: int,
                 optuna_trials: int
                ):
        """Store the configuration.

        Args:
            data: Feature table with ``date_block_num`` and ``item_cnt_month``.
            models_with_params: Mapping ``model -> callable(trial) -> dict``.
                The model must offer ``set_params``, ``fit`` and ``predict``;
                the callable returns the parameters to try for an Optuna trial.
            metric: Callable ``metric(y_true, y_pred) -> float``; lower is better.
            start_border: Month the validation schedule is counted from.
            train_border: Last month that may serve as a validation fold
                during tuning.
            valid_border: Month used to compare the tuned models.
            test_border: Month to predict.
            cv_step: Distance in months between consecutive validation folds.
            optuna_trials: Number of Optuna trials per model.
        """
        self.data = data
        self.models_with_params = models_with_params
        self.metric = metric
        self.start_border = start_border
        self.train_border = train_border
        self.valid_border = valid_border
        self.test_border = test_border
        self.cv_step = cv_step
        self.optuna_trials = optuna_trials
        self.best_estimators = []

    def _split(self, mask):
        """Return ``(features, target)`` for the rows selected by ``mask``."""
        subset = self.data[mask]
        return subset.drop(columns=[self.TARGET]), subset[self.TARGET]

    def evaluate_model(self, model, params):
        """Tune ``model`` with Optuna and append the refitted best model.

        Each trial sets ``params(trial)`` on the model and averages the metric
        over the validation months ``start_border + cv_step``,
        ``start_border + 2 * cv_step``, ... up to ``train_border``; for every
        validation month the model is trained on all earlier months.

        Afterwards the best parameters are applied, the model is fitted on all
        months before ``valid_border`` and appended to ``best_estimators``.

        Args:
            model: Estimator object; it is reconfigured and refitted in place.
            params: Callable mapping an Optuna trial to a parameter dict.
        """
        months = self.data[self.TIME_COLUMN]

        def objective(trial):
            model.set_params(**params(trial))
            scores = []
            for i in range(self.start_border + self.cv_step, self.train_border + 1, self.cv_step):
                X_train, y_train = self._split(months < i)
                X_valid, y_valid = self._split(months == i)

                model.fit(X_train, y_train)
                preds = model.predict(X_valid).clip(0, 20)
                scores.append(self.metric(y_valid, preds))
            return np.mean(scores)

        start = time.time()
        study_name = 'study'
        study = optuna.create_study(study_name=study_name)
        study.optimize(objective, n_trials=self.optuna_trials,
                       show_progress_bar=True, gc_after_trial=True)
        end = time.time()

        print('Best score: {}'.format(study.best_value))
        print('Best params: {}'.format(study.best_params))
        print('Taken time: {}'.format(int(end - start)))

        # A trial object cannot predict, so keep a fitted estimator instead.
        # Replaying the best (frozen) trial through `params` reproduces the full
        # parameter dict, including entries that are not tuned.
        model.set_params(**params(study.best_trial))
        X_fit, y_fit = self._split(months < self.valid_border)
        model.fit(X_fit, y_fit)
        self.best_estimators.append(model)

    def compare_models(self):
        """Tune every configured model and return the best one on ``valid_border``.

        Returns:
            The fitted model with the lowest metric on month ``valid_border``.

        Raises:
            ValueError: If ``models_with_params`` is empty.
        """
        # Start from a clean list so repeated calls do not accumulate entries.
        self.best_estimators = []
        for model, params in self.models_with_params.items():
            self.evaluate_model(model, params)

        if not self.best_estimators:
            raise ValueError('models_with_params is empty; nothing to compare')

        X_valid, y_valid = self._split(self.data[self.TIME_COLUMN] == self.valid_border)
        scores = []
        for estimator in self.best_estimators:
            preds = estimator.predict(X_valid).clip(0, 20)
            score = self.metric(y_valid, preds)
            print('Validation set score = ' + str(score))
            scores.append(score)

        return self.best_estimators[np.argmin(scores)]

    def get_predictions(self):
        """Select the best model, refit it and predict month ``test_border``.

        Returns:
            Array of predictions for the rows of month ``test_border``, in
            the order those rows appear in ``data``, clipped to [0, 20].
        """
        start = time.time()

        model = self.compare_models()
        months = self.data[self.TIME_COLUMN]
        X, y = self._split(months <= self.valid_border)
        model.fit(X, y)
        # Drop the target column so the test features match the training features.
        X_test, _ = self._split(months == self.test_border)
        y_test = model.predict(X_test).clip(0, 20)

        end = time.time()
        print('Took ' + str(int(end - start)) + ' seconds to get final predictions')
        return y_test

In [28]:
def rmse_metric(y_valid, y_pred):
    """Root mean squared error between true and predicted values.

    Computed as the square root of ``mean_squared_error`` rather than through
    the ``squared=False`` flag, which newer scikit-learn releases deprecate
    and later remove.

    Args:
        y_valid: Ground-truth target values.
        y_pred: Predicted values, same length as ``y_valid``.

    Returns:
        The RMSE as a float.
    """
    return np.sqrt(mean_squared_error(y_valid, y_pred))

# One XGBoost regressor created with default settings; TimeSeriesCycle
# reconfigures this same object through set_params() for every trial.
forecaster = xgb.XGBRegressor()

def xgb_params(trial):
    """Sample one XGBoost hyper-parameter configuration from an Optuna trial.

    Uses ``suggest_float`` (with ``log=True`` for the learning rate) in place
    of the deprecated ``suggest_uniform`` / ``suggest_loguniform`` helpers;
    the sampled ranges and distributions are the same.

    Args:
        trial: Optuna trial (or any object offering ``suggest_int`` and
            ``suggest_float``).

    Returns:
        Dict of keyword parameters for the regressor's ``set_params``.
    """
    params = {
              # At least one tree: the previous lower bound of 0 allowed a
              # model with no trees at all, which wastes a trial.
              'n_estimators': trial.suggest_int('n_estimators', 1, 500),
              'max_depth': trial.suggest_int('max_depth', 3, 5),
              'reg_alpha': trial.suggest_float('reg_alpha', 0, 6),
              'reg_lambda': trial.suggest_float('reg_lambda', 0, 2),
              'min_child_weight': trial.suggest_int('min_child_weight', 0, 5),
              'gamma': trial.suggest_float('gamma', 0, 4),
              # Sampled on a log scale.
              'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.5, log=True),
              'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.9),
              'subsample': trial.suggest_float('subsample', 0.4, 0.9),
              # Fixed, not tuned.
              'nthread': -1
              }
    return params

# Month layout for this run:
#   - tuning folds: range(start_border + cv_step, train_border + 1, cv_step),
#     i.e. months 17, 22, 27 and 32, each trained on all earlier months;
#   - month 33 (valid_border) is used to compare the tuned models;
#   - month 34 (test_border) holds the rows to predict.
# Each model gets 30 Optuna trials.
tsc = TimeSeriesCycle(data=all_data,
                      models_with_params={forecaster: xgb_params},
                      metric=rmse_metric,
                      start_border=12,
                      train_border=32,
                      valid_border=33,
                      test_border=34,
                      cv_step=5,
                      optuna_trials=30
                     )
#tsc.compare_models()

In [None]:
# Run tuning, model selection and the final refit, then predict the test month.
y_test = tsc.get_predictions()
# The original test ID column was dropped during aggregation, so IDs are
# regenerated positionally here.
# NOTE(review): this assumes the month-34 rows of all_data are still in
# test.csv order and that test.csv IDs run 0..n-1 — confirm.
submission = pd.DataFrame({
    "ID": np.arange(y_test.shape[0]),
    "item_cnt_month": y_test
})
submission.to_csv('s.csv', index=False)

[32m[I 2022-10-12 19:38:14,163][0m A new study created in memory with name: study[0m
  self._init_valid()


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2022-10-12 19:45:03,557][0m Trial 0 finished with value: 1.8396263122558594 and parameters: {'n_estimators': 346, 'max_depth': 5, 'reg_alpha': 1.4868335574438198, 'reg_lambda': 0.08051103487258726, 'min_child_weight': 3, 'gamma': 0.3865180354271529, 'learning_rate': 0.20833112368459897, 'colsample_bytree': 0.5884468405076111, 'subsample': 0.7311488478162994}. Best is trial 0 with value: 1.8396263122558594.[0m
[32m[I 2022-10-12 19:48:15,984][0m Trial 1 finished with value: 1.9012296199798584 and parameters: {'n_estimators': 248, 'max_depth': 3, 'reg_alpha': 3.7655213542636856, 'reg_lambda': 0.03157298882704418, 'min_child_weight': 2, 'gamma': 1.2560587174774254, 'learning_rate': 0.1522088421432004, 'colsample_bytree': 0.6019421528240181, 'subsample': 0.5044836036687674}. Best is trial 0 with value: 1.8396263122558594.[0m
[32m[I 2022-10-12 19:51:38,246][0m Trial 2 finished with value: 1.9197133779525757 and parameters: {'n_estimators': 237, 'max_depth': 3, 'reg_alpha': 4.0