In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error as mape
from dateutil.relativedelta import relativedelta

### Const

In [3]:
# Const
NB = '002'
PATH_TRAIN_DATA = '../datas/train.csv'
PATH_TEST_DATA = '../datas/test.csv'
PATH_SUBMISSION = f'../submissions/submission_{NB}.csv'
SUBMIT_COLS = ['id', 'num_sold']
TARGET = 'num_sold'

# Process Section


In [4]:
def feature_engineering(df, **kwargs):
    """
    Feature Engineering
    """

    # Validation
    if 'date' not in df.columns:
        df['date'] = df.index
        # raise ValueError(f'date column is not in the dataframe. It has {df.columns}')
    def _fill_params(_prodf):
        _prodf['date'] = pd.to_datetime(_prodf['date'])
        _prodf['month'] = _prodf['date'].dt.month
        _prodf['day'] = _prodf['date'].dt.day
        _prodf['year'] = _prodf['date'].dt.year
        _prodf['weekday'] = _prodf['date'].dt.weekday
        _prodf['week'] = _prodf['date'].dt.isocalendar().week
        _prodf['quarter'] = _prodf['date'].dt.quarter
        _prodf['is_month_start'] = _prodf['date'].dt.is_month_start
        _prodf['is_month_end'] = _prodf['date'].dt.is_month_end
        _prodf['is_quarter_start'] = _prodf['date'].dt.is_quarter_start
        _prodf['is_quarter_end'] = _prodf['date'].dt.is_quarter_end
        _prodf['is_year_start'] = _prodf['date'].dt.is_year_start
        _prodf['is_year_end'] = _prodf['date'].dt.is_year_end
        #True:1, False:0 TODO: Apply one hot encoding
        _prodf['is_month_start'] = _prodf['is_month_start'].astype(int)
        _prodf['is_month_end'] = _prodf['is_month_end'].astype(int)
        _prodf['is_quarter_start'] = _prodf['is_quarter_start'].astype(int)
        _prodf['is_quarter_end'] = _prodf['is_quarter_end'].astype(int)
        _prodf['is_year_start'] = _prodf['is_year_start'].astype(int)
        _prodf['is_year_end'] = _prodf['is_year_end'].astype(int)
        _prodf['is_weekend'] = _prodf['weekday'].apply(lambda x: 1 if x >= 5 else 0)
        _prodf['is_weekday'] = _prodf['weekday'].apply(lambda x: 1 if x < 5 else 0)
        return _prodf
    
    def _fill_nan_with_10day_mean(_df_with_nan, _df_without_nan):
        if 'num_sold' not in _df_with_nan.columns:
            return _df_with_nan
        _10day_mean_df = pd.DataFrame(_df_without_nan['num_sold'].resample('10D').mean())
        for idx in _10day_mean_df.index:
            mask = (_df_with_nan.index >= idx) & (_df_with_nan.index <= idx + relativedelta(days=10))
            _df_with_nan.loc[mask, 'num_sold'] = _10day_mean_df.loc[idx, 'num_sold']
        return _df_with_nan
    has_nan = df.isna().any(axis=1)
    _df_with_nan = df[has_nan]
    _df_with_nan = _df_with_nan.copy()
    _df_without_nan = df[~has_nan].copy()
    _df_without_nan = _fill_params(_df_without_nan)
    _df_filled = _fill_nan_with_10day_mean(_df_with_nan, _df_without_nan)
    _df_filled = _fill_params(_df_filled)
    _df = pd.concat([_df_without_nan, _df_filled], axis=0)
    _df = _df.sort_index()

    # Product sold ratio in each store

    # label encoding
    for _col in ['store', 'product', 'country']:
        _enc = OneHotEncoder(sparse_output=False)
        _enc.fit(_df[_col].values.reshape(-1, 1))
        _df_enc_arr = _enc.transform(_df[_col].values.reshape(-1, 1))
        _df_enc = pd.DataFrame(_df_enc_arr, columns=_enc.categories_)
        _df_enc.index = _df.index
        _df = pd.concat([_df, _df_enc], axis=1)
        _df.drop(_col, axis=1, inplace=True)
    
    # All columns are numeric
    for _col in _df.columns:
        if _df[_col].dtype == 'object':
            _df[_col] = _df[_col].astype(float)
        elif _df[_col].dtype == 'bool':
            _df[_col] = _df[_col].astype(int)
        elif _df[_col].dtype == 'datetime64[ns]':
            _df[_col] = _df[_col].astype(int)
        if type(_col) == tuple:
            _df.rename(columns={_col: str(_col[0])}, inplace=True)
    return _df


train_df = pd.read_csv(PATH_TRAIN_DATA, parse_dates=True, index_col='date')
train_df = feature_engineering(train_df)
display(train_df.head())

Unnamed: 0_level_0,id,num_sold,date,month,day,year,weekday,week,quarter,is_month_start,...,Kaggle,Kaggle Tiers,Kerneler,Kerneler Dark Mode,Canada,Finland,Italy,Kenya,Norway,Singapore
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01,1,973.0,1262304000000000000,1,1,2010,4,53,1,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2010-01-01,67,3195.0,1262304000000000000,1,1,2010,4,53,1,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2010-01-01,66,3369.0,1262304000000000000,1,1,2010,4,53,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2010-01-01,65,579.0,1262304000000000000,1,1,2010,4,53,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2010-01-01,64,911.0,1262304000000000000,1,1,2010,4,53,1,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
# Test train split
from sklearn.model_selection import train_test_split

X = train_df.drop(TARGET, axis=1)
y = train_df[TARGET]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
mape_score = mape(y_val, y_pred)
print(f'MAPE: {mape_score}')


MAPE: 0.25795944265635856


In [10]:
clf = RandomForestRegressor(random_state=1234)
clf.fit(train_df.drop(TARGET, axis=1), train_df[TARGET])

test_df = pd.read_csv(PATH_TEST_DATA, parse_dates=True, index_col='date')
test_df = feature_engineering(test_df)

pred = clf.predict(test_df)
test_df['num_sold'] = pred

submit_df = test_df[SUBMIT_COLS].copy()
submit_df.to_csv(PATH_SUBMISSION, index=False)

# Test section

In [14]:
print(train_df.columns)
print(train_df.isna().sum())

Index(['id', 'num_sold', 'date', 'month', 'day', 'year', 'weekday', 'week',
       'quarter', 'is_month_start', 'is_month_end', 'is_quarter_start',
       'is_quarter_end', 'is_year_start', 'is_year_end', 'is_weekend',
       'is_weekday', 'Discount Stickers', 'Premium Sticker Mart',
       'Stickers for Less', 'Holographic Goose', 'Kaggle', 'Kaggle Tiers',
       'Kerneler', 'Kerneler Dark Mode', 'Canada', 'Finland', 'Italy', 'Kenya',
       'Norway', 'Singapore'],
      dtype='object')
id                      0
num_sold                0
date                    0
month                   0
day                     0
year                    0
weekday                 0
week                    0
quarter                 0
is_month_start          0
is_month_end            0
is_quarter_start        0
is_quarter_end          0
is_year_start           0
is_year_end             0
is_weekend              0
is_weekday              0
Discount Stickers       0
Premium Sticker Mart    0
Stickers for 

In [None]:
train_df1 = pd.read_csv(PATH_TRAIN_DATA, parse_dates=True, index_col='date')
product_df = train_df1.groupby('product').agg({'num_sold': 'sum'}).reset_index()
train_df1 = feature_engineering(train_df1) #TODO procuctで分ける前にgroupbyを行ってから、ratioを求める。その後、productでエンコーディングする
display(train_df1.head())
display(product_df)

Unnamed: 0_level_0,id,num_sold,date,month,day,year,weekday,week,quarter,is_month_start,...,Kaggle,Kaggle Tiers,Kerneler,Kerneler Dark Mode,Canada,Finland,Italy,Kenya,Norway,Singapore
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01,1,973.0,1262304000000000000,1,1,2010,4,53,1,1,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2010-01-01,67,3195.0,1262304000000000000,1,1,2010,4,53,1,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2010-01-01,66,3369.0,1262304000000000000,1,1,2010,4,53,1,1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2010-01-01,65,579.0,1262304000000000000,1,1,2010,4,53,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2010-01-01,64,911.0,1262304000000000000,1,1,2010,4,53,1,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Unnamed: 0,product,num_sold
0,Holographic Goose,7350368.0
1,Kaggle,56721733.0
2,Kaggle Tiers,46870306.0
3,Kerneler,25596840.0
4,Kerneler Dark Mode,29964209.0
