In [103]:
#!pip install catboost

import pandas as pd
import numpy as np

import seaborn as sns

import lightgbm as lgb

import xgboost as xgb

#import optuna
#from optuna import Trial

from scipy.signal import periodogram
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

import catboost
from catboost import CatBoostRegressor, CatBoostClassifier
from catboost import Pool, CatBoost

import sklearn
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeClassifier, Lasso, ElasticNet, Lars
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, ShuffleSplit, train_test_split, KFold, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, OneHotEncoder, PowerTransformer, MaxAbsScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, roc_curve, mean_squared_error, mean_squared_log_error
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

import datetime
from datetime import datetime, timedelta, timezone
import matplotlib.pyplot as plt
from tqdm import tqdm

from statistics import mean
import math

import warnings
warnings.filterwarnings("ignore")

[0m

In [178]:
holidays = pd.read_csv('../input/dac22-invent-analytics-project/holidays.csv')
holidays['date'] = pd.to_datetime(holidays['date'])
product = pd.read_csv('../input/dac22-invent-analytics-project/product.csv').rename(columns={'id': 'product_id'})
sample = pd.read_csv('../input/dac22-invent-analytics-project/sample_submission.csv')
train = pd.read_csv('../input/dac22-invent-analytics-project/train.csv')
test = pd.read_csv('../input/dac22-invent-analytics-project/test.csv')
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
all_data = pd.concat((train, test), axis=0)
all_data = all_data.merge(product, on='product_id', how='left')
all_data = all_data[all_data['store_count'] != 0]

In [180]:
def get_time_till_next_season(x):
    year = x.year
    till_summer = (datetime(year, 6, 1) - x).days
    till_fall = (datetime(year, 9, 1) - x).days
    till_winter = (datetime(year, 12, 1) - x).days
    till_spring = (datetime(year, 3, 1) - x).days
    till_next_spring = (datetime(year+1, 3, 1) - x).days
    till_all = [till_summer, till_fall, till_winter, till_spring, till_next_spring]
    all_seasons = [x for x in till_all if x >= 0]
    return min(all_seasons)
    
all_data['time_till_next_season'] = all_data['date'].apply(get_time_till_next_season)

In [185]:
all_data['is_main_promo'] = all_data['promotion_type'].fillna('').str.contains('Main Promo').astype(int)
all_data['is_season_middle_promo'] = all_data['promotion_type'].fillna('').str.contains('Season Middle').astype(int)

all_data['is_daily'] = (all_data['life_style'].fillna('').str.contains('Daily')).astype(int)
all_data['is_dark'] = (all_data['life_style'].fillna('').str.contains('Dark')).astype(int)

all_data['is_narrow'] = (all_data['form_type'].fillna('').str.contains('Narrow')).astype(int)
all_data['is_normal'] = (all_data['form_type'].fillna('').str.contains('Normal')).astype(int)

In [186]:
pro_cat_cols = [col for col in product.columns if col != 'id']
train_cat_cols = ['product_id', 'season_type', 'promotion_type']
cat_cols = [ 'category_1', 'category_2', 'category_3', 'color_type', 'life_style'
           , 'fabric', 'weight_of_fabric', 'neck_style', 'form_type', 'sleeve_type', 'washing_style'
           , 'fabric_type', 'season_type', 'promotion_type', 'season', 'all_category', 'holiday_name']
to_drop_cols = ['id', 'date', 'sales_amount', 'product_id']
target = 'sales_amount'

In [187]:
all_data['discounted_price'] = all_data['price'] - all_data['price']*all_data['discount']
all_data['all_category'] = all_data['category_1'] + all_data['category_2'] + all_data['category_3']

def get_holiday(df):
    dates = pd.DataFrame(df['date'].unique()).rename(columns={0: 'date'})
    dates['is_holiday'] = 0
    dates['holiday_name'] = ''
    for i in tqdm(range(len(dates))):
        for j in range(len(holidays)):
            if (dates.at[i, 'date'] <= holidays.at[j, 'date']) and ((dates.at[i, 'date'] + timedelta(days=7)) > holidays.at[j, 'date']):
                dates.at[i, 'is_holiday'] = 1
                dates.at[i, 'holiday_name'] = holidays.at[j, 'holiday']
    df = df.merge(dates, on='date', how='left')
    return df
                
all_data = get_holiday(all_data)

100%|██████████| 242/242 [00:03<00:00, 71.19it/s]


In [188]:
grouped = all_data.groupby(['all_category', 'date']).mean()['price'].reset_index().rename(columns={'price': 'mean_cat_price'})
all_data = all_data.merge(grouped, on=['all_category', 'date'], how='left')
all_data['price_mean_diff'] = all_data['price'] - all_data['mean_cat_price']

In [190]:
def get_season(x):
    if (x >= 3) and (x < 6):
        return 'spring'
    if (x >= 6) and (x < 9):
        return 'summer'
    if (x >= 9) and (x < 12):
        return 'autumn'
    return 'winter'
 
def is_in_season(x):
    if x.season_type == 'Autumn-Winter':
        if (x.season == 'autumn') or (x.season == 'winter'):
            return 1
    if x.season_type == 'Summer-Spring':
        if (x.season == 'summer') or (x.season == 'autumn'):
            return 1
    return 0

all_data['month'] = all_data['date'].dt.month
all_data['season'] = all_data['month'].apply(get_season)
all_data['in_season'] = all_data.apply(is_in_season, axis=1)

all_data['season_type+category_3'] = all_data['season_type'] + all_data['category_3']
all_data['season+category_3'] = all_data['season'] + all_data['category_3']

all_data['season_type+color_type'] = all_data['season_type'] + all_data['color_type']
all_data['season+color_type'] = all_data['season'] + all_data['color_type']

In [191]:
def how_many_weeks_past(df):
    df['last_entry'] = df.groupby('product_id').shift(1)['date']
    df['next_entry'] = df.groupby('product_id').shift(-1)['date']
    df['weeks_since_last_entry'] = (df['date'] - df['last_entry']).dt.days / 7
    df['weeks_to_next_entry'] = (df['next_entry'] - df['date']).dt.days / 7
    df.drop(columns=['last_entry', 'next_entry'], inplace=True)
    
how_many_weeks_past(all_data)

In [192]:
all_data['last_is_here'] = (all_data['weeks_since_last_entry'] == 1)
all_data['next_is_here'] = (all_data['weeks_to_next_entry'] == 1)

#all_data['all_sold'] = (all_data['next_week_store_count'] > all_data['store_count']).astype(int)
#all_data['none_sold'] = (all_data['store_count'] > all_data['next_week_store_count']).astype(int)

In [193]:
def get_seri_no(df):
    df['seri_no'] = np.nan
    pro_u = df['product_id'].unique()
    for u in tqdm(pro_u):
        grouped = df[df['product_id'] == u].reset_index()
        s = 0
        for i in range(len(grouped)):
            idx = grouped.iat[i, 0]
            if(grouped.iat[i, -3] == 1):
                df.at[idx, 'seri_no'] = s
            else:
                s += 1
                df.at[idx, 'seri_no'] = s
                
get_seri_no(all_data)
all_data = all_data.merge(all_data.groupby('product_id').count()['id'].reset_index().rename(columns={'id': 'product_count'}), on='product_id', how='left')

100%|██████████| 7876/7876 [00:57<00:00, 136.92it/s]


In [194]:
def get_min_max_diff(df, cols):
    for col in cols:
        
        df = df.merge(df.groupby('product_id').max()[col].reset_index().rename(columns={col: 'max_'+col}), on='product_id', how='left')
        df = df.merge(df.groupby('product_id').min()[col].reset_index().rename(columns={col: 'min_'+col}), on='product_id', how='left')
        df = df.merge(df.groupby('product_id').mean()[col].reset_index().rename(columns={col: 'mean_'+col}), on='product_id', how='left')
        #print(df)
        df['min_'+col+'_diff'] = df[col] - df['min_'+col]
        df['max_'+col+'_diff'] = df['max_'+col] - df[col]
        df['mean_'+col+'_diff'] = df[col] - df['mean_'+col]
        
    return df

def get_seri_based(df, cols):
    for col in cols:
        
        df = df.merge(df.groupby(['product_id', 'seri_no']).max()[col].reset_index().rename(columns={col: 'seri_max_'+col}), on=['product_id', 'seri_no'], how='left')
        df = df.merge(df.groupby(['product_id', 'seri_no']).min()[col].reset_index().rename(columns={col: 'seri_min_'+col}), on=['product_id', 'seri_no'], how='left')
        df = df.merge(df.groupby(['product_id', 'seri_no']).mean()[col].reset_index().rename(columns={col: 'seri_mean_'+col}), on=['product_id', 'seri_no'], how='left')
        
        df['seri_min_'+col+'_diff'] = df[col] - df['seri_min_'+col]
        df['seri_max_'+col+'_diff'] = df['seri_max_'+col] - df[col]
        df['seri_mean_'+col+'_diff'] = df[col] - df['seri_mean_'+col]
        
    return df

all_data = get_min_max_diff(all_data, ['store_count', 'price'])
all_data = get_seri_based(all_data, ['store_count', 'price'])

In [195]:
def shift_n_weeks(df, cols, shifts):
    for col in cols:
        for n in shifts:
            df[col+'_shifted_'+str(n)] = df.groupby(['product_id', 'seri_no']).shift(n)[col]

shift_n_weeks(all_data, ['store_count'], [-5, -4, -3, -2, -1, 1, 2, 3, 4, 5])
shift_n_weeks(all_data, ['price'], [-1, 1])

all_data['diff_last_week_store_count'] = all_data['store_count'] - all_data['store_count_shifted_1']
all_data['diff_next_week_store_count'] = all_data['store_count_shifted_-1'] - all_data['store_count']

all_data['diff_last_week_price'] = all_data['price'] - all_data['price_shifted_1']
all_data['diff_next_week_price'] = all_data['price_shifted_-1'] - all_data['price']

all_data['sudden_inc'] = (all_data['diff_next_week_store_count'] >= 50).astype(int)

In [115]:
def rmsle_lgbm(labels, preds):
    score = np.sqrt(np.mean(np.power(np.log1p(labels) - np.log1p(np.clip(preds, 0, np.inf)), 2)))
    return 'rmsle', score, False

class CatRMSLE(object):
    # RMSLE for catboost
    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        weight_sum = 1

        score = np.sqrt(np.mean(np.power(np.log1p(target) - np.log1p(np.clip(approx, 0, np.inf)), 2)))

        return score, weight_sum

In [None]:
all_data['store_sales_diff'] = all_data['store_count'] - all_data['sales_amount']
all_data['sales_store_div'] = all_data['sales_amount'] / all_data['store_count']

In [None]:
def get_cummean(df, to_groups, cols):
    for col in cols:
        for to_group in to_groups:
            df[to_group+'_'+col+'_cummean'] = df.groupby(to_group)[col].apply(lambda x: x.shift().expanding().mean())
    return df
        
all_data = get_cummean(all_data, to_groups=cat_cols, cols=['sales_amount', 'store_sales_diff', 'sales_store_div'])

In [None]:
for col in cat_cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col])

In [None]:
# The lgbm kfold
train = all_data[all_data[target].notna()]
test = all_data[all_data[target].isna()]
scores = []
fold = 1
preds = pd.Series(np.zeros(len(test)))
kf = KFold(5, shuffle=True, random_state=27)
preds = pd.Series(np.zeros(len(test)))
for train_ind, val_ind in kf.split(train):
    tr = train.iloc[train_ind]
    val = train.iloc[val_ind]
    lgb_reg = lgb.LGBMRegressor(num_leaves=63, learning_rate=0.2, metric=[rmsle_lgbm, 'mae'],
                             n_estimators=2000, objective='mae', first_metric_only='true', random_state=42)
    lgb_reg.fit(tr.drop(columns=to_drop_cols), tr[target]
               ,eval_set=(val.drop(columns=to_drop_cols), val[target])
               ,eval_metric=rmsle_lgbm
               ,early_stopping_rounds=200
               ,verbose=100
              ,categorical_feature=cat_cols
                   )
    print(lgb.plot_importance(lgb_reg, height=0.2, figsize=(12,8), importance_type='gain'))
    preds += lgb_reg.predict(test.drop(columns=to_drop_cols))

preds /= 5

In [None]:
preds = np.clip(preds, 0, np.inf)
#preds = np.expm1(preds)
sample['sales_amount'] = preds
sample.to_csv('sub.csv', index=False)