# Setup

## Config

In [2]:
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error as mape

class Config:
    def __init__(self):
        self.NB = '003'
        self.path_train_data = '../datas/train.csv'
        self.path_test_data = '../datas/test.csv'
        self.path_submission = f'../submissions/submission{self.NB}.csv'
        self.result_pathname = f'../results/{self.NB}'
        self.data_pathname = f'../datas/{self.NB}'
        self.modified_data_pathname = f'../modified_datas/{self.NB}'
        self.submit_col = ['id', 'num_sold']
        self.target_col = 'num_sold'
        self.n_estimators = 100
        self.early_stopping_rounds = 150
        self.cv_folds = 5
        self.ramdom_state = 1234
        self.settings = {
            'ratio_cols': ['store_sold_ratio', 'product_sold_ratio', 'country_sold_ratio'],
            'pred_cols': ['store', 'product', 'country'],
            'train_col': ['store', 'product', 'country', 'month','day','year','weekday','week','quarter']
        }

config = Config()

In [7]:
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

def onehot_encoding(df, cols):
    if isinstance(cols, str):
        cols = [cols]
    for col in cols:
        _enc = OneHotEncoder(sparse_output=False)
        _enc.fit(df[col].values.reshape(-1, 1))
        df_enc_arr = _enc.transform(df[col].values.reshape(-1, 1))
        df_enc = pd.DataFrame(df_enc_arr, columns=_enc.categories_)
        df_enc.index = df.index
        df = pd.concat([df, df_enc], axis=1)
        df.drop(col, axis=1, inplace=True)
        for c in df.columns:
            if isinstance(c, tuple):
                df.rename(columns={c: str(c[0])}, inplace=True)
        del df_enc_arr, df_enc
    return df

# def feature_engineering(train_df, test_df, pred_model, **kwargs):
#     """
#     Feature Engineering
#     """
train_df = pd.read_csv(config.path_train_data, parse_dates=True, index_col='date')
test_df = pd.read_csv(config.path_test_data, parse_dates=True, index_col='date')
pred_model = RandomForestRegressor(random_state=config.ramdom_state)
# Validation
if 'date' in train_df.columns:
    train_df['date'] = train_df.index
if 'date' in test_df.columns:
    test_df['date'] = test_df.index

def _fill_nan_with_10day_mean(_df_with_nan, _df_without_nan):
    if 'num_sold' not in _df_with_nan.columns:
        return _df_with_nan
    _10day_mean_df = pd.DataFrame(_df_without_nan['num_sold'].resample('10D').mean())
    for idx in _10day_mean_df.index:
        mask = (_df_with_nan.index >= idx) & (_df_with_nan.index <= idx + relativedelta(days=10))
        _df_with_nan.loc[mask, 'num_sold'] = _10day_mean_df.loc[idx, 'num_sold']
    return _df_with_nan

def _fill_sold_ratio(_df):
    _day_sum_df = _df.groupby([_df.index])['num_sold'].sum().rename('day_sum')
    _df['product_sold_ratio'] = _df.groupby([_df.index, 'product'])['num_sold'].transform('sum') / _day_sum_df
    _df['store_sold_ratio'] = _df.groupby([_df.index, 'store'])['num_sold'].transform('sum') / _day_sum_df
    _df['country_sold_ratio'] = _df.groupby([_df.index, 'country'])['num_sold'].transform('sum') / _day_sum_df
    del _day_sum_df
    return _df

def _prepare_dataframe(_df):
    _df = _df.copy()
    _df['month'] = _df.index.month
    _df['day'] = _df.index.day
    _df['year'] = _df.index.year
    _df['weekday'] = _df.index.weekday
    _df['week'] = _df.index.isocalendar().week
    _df['quarter'] = _df.index.quarter
    _df['is_weekend'] = _df['weekday'].apply(lambda x: 1 if x >= 5 else 0).astype(int)
    _df['is_weekday'] = _df['weekday'].apply(lambda x: 1 if x < 5 else 0).astype(int)
    _cols = _df.columns
    _cols = [col for col in _cols if isinstance(col, object)]

# All columns are numeric
    for _col in _df.columns:
        if _df[_col].dtype == 'bool':
            _df[_col] = _df[_col].astype(int)
        elif _df[_col].dtype == 'datetime64[ns]':
            _df[_col] = _df[_col].astype(int)
        if type(_col) == tuple:
            _df.rename(columns={_col: str(_col[0])}, inplace=True)
    if 'date' in _df.columns:
        _df.drop('date', axis=1, inplace=True)
    _df = _df.sort_index()
    return _df

has_nan = train_df.isna().any(axis=1)
_train_df_with_nan = train_df[has_nan]
_train_df_with_nan = _train_df_with_nan.copy()
_train_df_without_nan = train_df[~has_nan].copy()
_train_df_without_nan = _prepare_dataframe(_train_df_without_nan)
_train_df_filled = _fill_nan_with_10day_mean(_train_df_with_nan, _train_df_without_nan)
_train_df_filled = _prepare_dataframe(_train_df_filled)
train_df = pd.concat([_train_df_without_nan, _train_df_filled], axis=0)
train_df = train_df.sort_index()

del _train_df_with_nan, _train_df_without_nan, _train_df_filled

test_df = _prepare_dataframe(test_df)
# train_df = train_df.drop('date', axis=1)
# test_df = test_df.drop('date', axis=1)
_train_df1 = train_df.copy()
_test_df1 = test_df.copy()

for ratio, pred in zip(config.settings['ratio_cols'], config.settings['pred_cols']):
    display(test_df.head())
    _train_df2 = _train_df1.copy()
    _del_col = config.settings['pred_cols']
    _del_col.remove(pred)
    _train_df2 = _train_df2.drop(_del_col, axis=1)
    _day_sum_df = _train_df2.groupby([_train_df2.index])['num_sold'].sum().rename('day_sum')
    _train_df2[ratio] = _train_df2.groupby([_train_df2.index, pred])['num_sold'].transform('sum') / _day_sum_df
    # _train_df2 = onehot_encoding(_train_df2, pred)
    _label_enc = LabelEncoder()
    _train_df2[pred] = _label_enc.fit_transform(_train_df2[pred].unique())
    train_df[pred] = _label_enc.transform(train_df[pred].unique())
    _test_df2 = _test_df1.copy()
    _test_df2[pred] = _label_enc.fit_transform(_test_df2[pred].unique())
    test_df[pred] = _label_enc.transform(test_df[pred].unique())
    X_train = _train_df2.drop(ratio, axis=1).drop(['id', 'num_sold'], axis=1)
    y_train = _train_df2[ratio]
    _del_col = _del_col + ['id']
    X_test = _test_df2.drop(_del_col, axis=1)
    model = pred_model.fit(X_train, y_train)
    test_df[ratio] = model.predict(X_test)
    del _train_df2, _test_df1, X_train, y_train, X_test, _del_col, _day_sum_df

train_df = _fill_sold_ratio(train_df)
# train_df = onehot_encoding(train_df, config.settings['pred_cols'])
# test_df = onehot_encoding(test_df, config.settings['pred_cols'])
del _train_df1, _test_df1


In [3]:
train_df = pd.read_csv(config.path_train_data, parse_dates=True, index_col='date')
test_df = pd.read_csv(config.path_test_data, parse_dates=True, index_col='date')
model = RandomForestRegressor(random_state=config.ramdom_state)
train_df, test_df = feature_engineering(train_df, test_df, pred_model=model)
train_df.to_csv(f'{config.modified_data_pathname}_edited1_train.csv')
test_df.to_csv(f'{config.modified_data_pathname}_edited1_test.csv')

UnboundLocalError: cannot access local variable '_test_df1' where it is not associated with a value

# Test section

In [5]:
train_df1 = pd.read_csv(config.path_train_data, parse_dates=True, index_col='date')

train_df2 = train_df1.groupby(['store', 'product', 'country'])

display(train_df2.size())

test_df1 = pd.read_csv(config.path_test_data, parse_dates=True, index_col='date')

store              product             country  
Discount Stickers  Holographic Goose   Canada       2557
                                       Finland      2557
                                       Italy        2557
                                       Kenya        2557
                                       Norway       2557
                                                    ... 
Stickers for Less  Kerneler Dark Mode  Finland      2557
                                       Italy        2557
                                       Kenya        2557
                                       Norway       2557
                                       Singapore    2557
Length: 90, dtype: int64

In [31]:
import pandas as pd
import matplotlib.pyplot as plt

# NaNを削除
train_df3 = train_df1.dropna()

# 'Ker' のみを抽出（元のコードのバグ修正）
pros = train_df3['product'].unique()
pros = [pro for pro in pros if pro == 'Ker']

# 日ごとの合計
train_df3['date'] = pd.to_datetime(train_df3.index)
train_df3.reset_index(drop=True, inplace=True)
day_sum_df = train_df3.resample('D', on='date')['num_sold'].sum().rename('total_num_sold')

# 商品ごとの売上割合を計算
product_sum_df = train_df3.groupby(['date', 'product'])['num_sold'].sum().reset_index()
product_sum_df = product_sum_df.merge(day_sum_df, on='date')
product_sum_df['product_sold_ratio'] = product_sum_df['num_sold'] / product_sum_df['total_num_sold']

# store ごとの売上割合を計算
store_sum_df = train_df3.groupby(['date', 'store'])['num_sold'].sum().reset_index()
store_sum_df = store_sum_df.merge(day_sum_df, on='date')
store_sum_df['store_sold_ratio'] = store_sum_df['num_sold'] / store_sum_df['total_num_sold']

# country ごとの売上割合を計算
country_sum_df = train_df3.groupby(['date', 'country'])['num_sold'].sum().reset_index()
country_sum_df = country_sum_df.merge(day_sum_df, on='date')
country_sum_df['country_sold_ratio'] = country_sum_df['num_sold'] / country_sum_df['total_num_sold']

# train_df3 に product_ratio と store_ratio を追加
train_df3 = train_df3.merge(product_sum_df[['date', 'product', 'product_sold_ratio']], on=['date', 'product'], how='left')
train_df3 = train_df3.merge(store_sum_df[['date', 'store', 'store_sold_ratio']], on=['date', 'store'], how='left')
train_df3 = train_df3.merge(country_sum_df[['date', 'country', 'country_sold_ratio']], on=['date', 'country'], how='left')
train_df3.index = train_df1.dropna().index

# プロット用のデータフレーム作成
plot_data = product_sum_df.pivot(index='date', columns='product', values='product_sold_ratio')
plot_data.plot()
plt.savefig(f'{config.result_pathname}_product_ratio.png')

plot_data = store_sum_df.pivot(index='date', columns='store', values='store_sold_ratio')
plot_data.plot()
plt.savefig(f'{config.result_pathname}_store_ratio.png')

plt.close('all')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df3['date'] = pd.to_datetime(train_df3.index)


In [52]:
# Predict store_ratio, product_ratio and country_ratio

from sklearn.ensemble import RandomForestRegressor

test_df = pd.read_csv(config.path_test_data, parse_dates=True, index_col='date')

model = RandomForestRegressor(random_state=config.ramdom_state)

def preprocess_for_test(train_df, test_df, model):
    _train_df = train_df.copy()
    _test_df = test_df.copy()
    def _prepare_dataframe(_df):
        _df = _df.copy()
        _df['month'] = _df.index.month
        _df['day'] = _df.index.day
        _df['year'] = _df.index.year
        _df['weekday'] = _df.index.weekday
        _df['week'] = _df.index.isocalendar().week
        _df['quarter'] = _df.index.quarter
        _cols = _df.columns
        _cols = [col for col in _cols if isinstance(col, object)]
    
    # All columns are numeric
        for _col in _df.columns:
            if _df[_col].dtype == 'bool':
                _df[_col] = _df[_col].astype(int)
            elif _df[_col].dtype == 'datetime64[ns]':
                _df[_col] = _df[_col].astype(int)
            if type(_col) == tuple:
                _df.rename(columns={_col: str(_col[0])}, inplace=True)
        if 'date' in _df.columns:
            _df.drop('date', axis=1, inplace=True)
        _df = _df.sort_index()
        return _df

    def onehotencoding(_df, cols):
        if not isinstance(cols, list):
            cols = [cols]
        for col in cols:
            enc = OneHotEncoder(sparse_output=False)
            enc.fit(_df[col].values.reshape(-1, 1))
            df_enc_arr = enc.transform(_df[col].values.reshape(-1, 1))
            df_enc = pd.DataFrame(df_enc_arr, columns=enc.categories_)
            df_enc.index = _df.index
            _df = pd.concat([_df, df_enc], axis=1)
            _df.drop(col, axis=1, inplace=True)
            for c in _df.columns:
                if isinstance(c, tuple):
                    _df.rename(columns={c: str(c[0])}, inplace=True)
        return _df

    _train_df = _prepare_dataframe(_train_df)
    _test_df = _prepare_dataframe(_test_df)
    _test_df_ori = _test_df.copy()

    for ratio, pred in zip(config.settings['ratio_cols'], config.settings['pred_cols']):
        _train_df = _train_df.dropna()
        _train_df1 = _train_df[[ratio, pred]]
        _train_df = _train_df.drop(config.settings['ratio_cols'], axis=1)
        _train_df = _train_df.drop(config.settings['pred_cols'], axis=1)
        _train_df = pd.concat([_train_df, _train_df1], axis=1)
        _train_df = onehotencoding(_train_df, pred)
        _test_df = _test_df_ori.dropna()
        _test_df1 = _test_df[pred]
        _test_df = _test_df.drop(config.settings['pred_cols'], axis=1)
        _test_df = pd.concat([_test_df, _test_df1], axis=1)
        _test_df = onehotencoding(_test_df, pred)
        X_train = _train_df.drop(ratio, axis=1).drop(['id', 'num_sold'], axis=1)
        y_train = _train_df[ratio]
        X_test = _test_df.drop(['id'], axis=1)
        display(X_train.head())
        display(X_test.head())
        model.fit(X_train, y_train)
        _test_df[ratio] = model.predict(X_test)
        del _train_df, _train_df1, _test_df, _test_df1, X_train, y_train, X_test
    return _train_df, _test_df

train_df, test_df = preprocess_for_test(train_df, test_df, model)

display(test_df.head())

Unnamed: 0_level_0,month,day,year,weekday,week,quarter,Discount Stickers,Premium Sticker Mart,Stickers for Less
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-01,1,1,2010,4,53,1,1.0,0.0,0.0
2010-01-01,1,1,2010,4,53,1,1.0,0.0,0.0
2010-01-01,1,1,2010,4,53,1,1.0,0.0,0.0
2010-01-01,1,1,2010,4,53,1,1.0,0.0,0.0
2010-01-01,1,1,2010,4,53,1,0.0,0.0,1.0


Unnamed: 0_level_0,month,day,year,weekday,week,quarter,Discount Stickers,Premium Sticker Mart,Stickers for Less
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0


Unnamed: 0_level_0,month,day,year,weekday,week,quarter,Holographic Goose,Kaggle,Kaggle Tiers,Kerneler,Kerneler Dark Mode
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-01-01,1,1,2010,4,53,1,0.0,1.0,0.0,0.0,0.0
2010-01-01,1,1,2010,4,53,1,0.0,0.0,1.0,0.0,0.0
2010-01-01,1,1,2010,4,53,1,0.0,0.0,0.0,1.0,0.0
2010-01-01,1,1,2010,4,53,1,0.0,0.0,0.0,0.0,1.0
2010-01-01,1,1,2010,4,53,1,1.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,month,day,year,weekday,week,quarter,Holographic Goose,Kaggle,Kaggle Tiers,Kerneler,Kerneler Dark Mode
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,0.0,1.0,0.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,0.0,0.0,1.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,0.0,0.0,0.0,1.0,0.0
2017-01-01,1,1,2017,6,52,1,0.0,0.0,0.0,0.0,1.0


Unnamed: 0_level_0,month,day,year,weekday,week,quarter,Canada,Finland,Italy,Kenya,Norway,Singapore
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-01,1,1,2010,4,53,1,1.0,0.0,0.0,0.0,0.0,0.0
2010-01-01,1,1,2010,4,53,1,1.0,0.0,0.0,0.0,0.0,0.0
2010-01-01,1,1,2010,4,53,1,1.0,0.0,0.0,0.0,0.0,0.0
2010-01-01,1,1,2010,4,53,1,1.0,0.0,0.0,0.0,0.0,0.0
2010-01-01,1,1,2010,4,53,1,1.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,month,day,year,weekday,week,quarter,Canada,Finland,Italy,Kenya,Norway,Singapore
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0,0.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0,0.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0,0.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0,0.0,0.0,0.0
2017-01-01,1,1,2017,6,52,1,1.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,id,country,store,product,month,day,year,weekday,week,quarter,store_sold_ratio,product_sold_ratio,country_sold_ratio
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2017-01-01,230130,Canada,Discount Stickers,Holographic Goose,1,1,2017,6,52,1,0.183155,0.05123,0.167262
2017-01-01,230131,Canada,Discount Stickers,Kaggle,1,1,2017,6,52,1,0.183155,0.332222,0.167262
2017-01-01,230132,Canada,Discount Stickers,Kaggle Tiers,1,1,2017,6,52,1,0.183155,0.294314,0.167262
2017-01-01,230133,Canada,Discount Stickers,Kerneler,1,1,2017,6,52,1,0.183155,0.146557,0.167262
2017-01-01,230134,Canada,Discount Stickers,Kerneler Dark Mode,1,1,2017,6,52,1,0.183155,0.178788,0.167262


In [53]:
test_df.to_csv(f'{config.modified_data_pathname}_edited1_test.csv')