In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as MSE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
from sklearn.model_selection import train_test_split

In [34]:
import seaborn as sns; sns.set()
import lightgbm as lgb

In [35]:
###各データのパス###
# 訓練用データ
train_path = './competition/' + 'train' + '.csv'
# テストデータ
test_path = './competition/' + 'test' + '.csv'
# 提出用データ
submit_path = './competition/' + 'submit_sample' + '.csv'

In [36]:
le = LabelEncoder()

In [37]:
# 訓練用データの読み込み
train_df = pd.read_csv(train_path)

In [38]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337 entries, 0 to 336
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   dateid         337 non-null    object 
 1   y              337 non-null    float64
 2   week           337 non-null    object 
 3   soldout        337 non-null    int64  
 4   name           337 non-null    object 
 5   kcal           304 non-null    float64
 6   remarks        49 non-null     object 
 7   event          28 non-null     object 
 8   payday         17 non-null     float64
 9   weather        337 non-null    object 
 10  precipitation  337 non-null    float64
 11  temperature    337 non-null    float64
dtypes: float64(5), int64(1), object(6)
memory usage: 31.7+ KB


In [39]:
### 特徴量重要度を棒グラフでプロットする関数
def plot_feature_importance(df):
    n_features = len(df)
    df_plot = df.sort_values('importance')
    f_importance_plot = df_plot['importance'].values
    plt.barth(range(n_features), f_importance_plot, align='center')
    cols_plot = df_plot['feature'].values
    plt.yticks(np.arange(n_features), cols_plot)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

In [40]:
train_df = train_df.fillna({'remarks':'特になし', 'event' : '特になし'})

In [41]:
## 天気をカテゴリ変数とする
le = le.fit(train_df['weather'])
train_df['weather'] = le.transform(train_df['weather'])
train_df['weather'] = train_df['weather'].astype('category')

In [42]:
type(train_df['weather'])

pandas.core.series.Series

In [43]:
## メインメニュー名をカテゴリ化
le_2 = le.fit(train_df['name'])
train_df['name'] = le_2.transform(train_df['name'])
train_df['name'] = train_df['name'].astype('category')

In [44]:
train_df.loc[train_df.remarks != '特になし','remarks']= 1
train_df.loc[train_df.remarks == '特になし','remarks']= 0

In [45]:
train_df['remarks'] = train_df['remarks'].astype('category')

In [46]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337 entries, 0 to 336
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   dateid         337 non-null    object  
 1   y              337 non-null    float64 
 2   week           337 non-null    object  
 3   soldout        337 non-null    int64   
 4   name           337 non-null    category
 5   kcal           304 non-null    float64 
 6   remarks        337 non-null    category
 7   event          337 non-null    object  
 8   payday         17 non-null     float64 
 9   weather        337 non-null    category
 10  precipitation  337 non-null    float64 
 11  temperature    337 non-null    float64 
dtypes: category(3), float64(5), int64(1), object(3)
memory usage: 37.6+ KB


In [47]:
X = train_df.drop(['dateid','y'],axis = 1)
## 週データのダミー変数化(ワンホット)
X = pd.get_dummies(X,columns=['week'])
##13時開始お弁当持ち込み可の社内イベント'のダミー変数化(ワンホット)
X = pd.get_dummies(X,columns=['event'])
#X_feature = X

In [48]:
X_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337 entries, 0 to 336
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   soldout              337 non-null    int64   
 1   name                 337 non-null    category
 2   kcal                 304 non-null    float64 
 3   remarks              337 non-null    category
 4   payday               17 non-null     float64 
 5   weather              337 non-null    category
 6   precipitation        337 non-null    float64 
 7   temperature          337 non-null    float64 
 8   week_Fri             337 non-null    uint8   
 9   week_Mon             337 non-null    uint8   
 10  week_Thu             337 non-null    uint8   
 11  week_Tue             337 non-null    uint8   
 12  week_Wed             337 non-null    uint8   
 13  event_キャリアアップ支援セミナー  337 non-null    uint8   
 14  event_ママの会           337 non-null    uint8   
 15  event_特になし           33

In [49]:
#X = X_feature.values
#y = train_df['y'].values

In [50]:
## 時系列順になるようにtrain_test_split →　学習データとテストデータに分割
X_train,X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, shuffle=False,random_state = 1030)

### 学習データを用いて時系列分割検証

In [52]:
## 学習用データでの予測値を保存するデータフレームの初期化
df_train_preds = pd.DataFrame({'y_train': y_train})

In [53]:
## 検証用データでの予測値を保存するデータフレームの初期化
df_eval_preds = pd.DataFrame({'y_eval' : [],
                              'y_eval_pred' : [] })

In [54]:
df_test_preds = pd.DataFrame({'y_test': y_test})

In [55]:
df_test_preds.reset_index(inplace = True, drop=True)

In [56]:
## RMSEを保存するデータフレームの初期化
df_RMSE = pd.DataFrame({'train':[], 'eval': [], 'test':[]})

In [57]:
### ラウンド数の初期化
round_no = 0

In [58]:
## 学習データの数だけの数列
row_no_list = list(range(len(y_train)))

In [59]:
from sklearn.model_selection import TimeSeriesSplit

In [60]:
tscv = TimeSeriesSplit(n_splits = 5)

In [69]:
for train_index, eval_index in tscv.split(X):
    print("TRAIN:",train_index, "TEST:", eval_index)

TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56] TEST: [ 57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74
  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92
  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110
 111 112]
TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112] TEST: [113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
 131 132 133 134 135 136 137 138 139 

In [None]:
### モデルのアンサンブル(保存した予測値を使用)
train_preds_ave = df_train_preds.iloc[:, 1:].mean(axis=1).to_numpy()
test_preds_ave = df_test_preds.iloc[:, 1:].mean(axis=1).to_numpy()

### RMSE を計算
train_preds_ave_RMSE_score = np.sqrt(MSE(y_train, train_preds_ave))
test_preds_ave_RMSE_score = np.sqrtMSE((y_test, test_preds_ave))
print('RMSE train: %.5f, test: %.5f'%(train_preds_ave_RMSE_score, test_preds_ave_RMSE_score))