In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as MSE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
# 訓練用データ
train_path = './competition/' + 'train' + '.csv'
# テストデータ
test_path = './competition/' + 'test' + '.csv'
# 提出用データ
submit_path = './competition/' + 'submit_sample' + '.csv'

In [3]:
# 訓練用データの読み込み
train_df = pd.read_csv(train_path)

In [4]:
le = LabelEncoder()

In [5]:
# 欠損値の補完
train_df['kcal'] = train_df['kcal'].fillna(method = 'ffill')
train_df = train_df.fillna({'payday': 0.0, 'remarks':'特になし', 'event' : '特になし'})

In [6]:
## 天気のラベル化
le = le.fit(train_df['weather'])
train_df['weather_label'] = le.transform(train_df['weather'])
train_df['weather_label'] = train_df['weather_label'].astype('category')

In [7]:
## メインメニューのラベル化
le_2 = le.fit(train_df['name'])
train_df['name_label'] = le_2.transform(train_df['name'])
train_df['name_label'] = train_df['name_label'].astype('category')

In [8]:
## 特記事項に関する特徴量エンジニアリング
train_df['remarks_b']  = train_df['remarks']
train_df.loc[train_df.remarks != '特になし','remarks_b']= 1
train_df.loc[train_df.remarks == '特になし','remarks_b']= 0
train_df['remarks_b'] = train_df['remarks_b'].astype('category')

In [9]:
## 説明変数
X = train_df.drop(['dateid','y','name','remarks','weather'],axis = 1)
## 目的変数
y = train_df['y'].values

In [10]:
## 週データのダミー変数化(ワンホット)
X = pd.get_dummies(X,columns=['week'])
##13時開始お弁当持ち込み可の社内イベント'のダミー変数化(ワンホット)
X = pd.get_dummies(X,columns=['event'])

In [11]:
X = X.values

In [12]:
X

array([[  1., 377.,   0., ...,   0.,   0.,   1.],
       [  0., 380.,   0., ...,   0.,   0.,   1.],
       [  1., 390.,   0., ...,   0.,   0.,   1.],
       ...,
       [  0., 463.,   0., ...,   0.,   0.,   1.],
       [  1., 420.,   0., ...,   0.,   0.,   1.],
       [  1., 435.,   0., ...,   0.,   0.,   1.]])

In [13]:
y

array([125.,  99., 100.,  90., 107.,  96.,  88.,  87., 128.,  92.,  82.,
        57.,  57.,  56., 109.,  78.,  63.,  56.,  80.,  54.,  97.,  64.,
        47.,  88.,  59.,  58.,  73.,  70.,  64.,  58., 126.,  70.,  71.,
       104.,  54.,  65.,  77.,  79.,  74.,  64., 119.,  74.,  59.,  47.,
        64.,  93.,  74.,  56.,  50.,  72., 121.,  76.,  63.,  66.,  56.,
        57.,  72.,  61.,  64.,  49.,  74.,  65.,  54.,  50.,  53.,  63.,
        82.,  56.,  54.,  48., 124.,  65.,  51.,  53.,  53.,  55.,  53.,
        56.,  62.,  83.,  65.,  52.,  58.,  60.,  38.,  75.,  54.,  58.,
        63., 129.,  45.,  40.,  56.,  53.,  56.,  54., 100.,  55.,  68.,
        62.,  56.,  39.,  65.,  68.,  53.,  54.,  43.,  68.,  63.,  54.,
        53., 115.,  56.,  49.,  46.,  45.,  29.,  59.,  50.,  45.,  56.,
        40.,  53.,  41.,  39.,  56.,  61.,  44.,  40., 114.,  61.,  64.,
        39.,  43.,  56.,  63.,  66.,  46.,  71.,  53.,  56.,  50.,  42.,
        32.,  56.,  54.,  42., 124.,  57.,  53.,  5

### 時系列分割交差検証

In [55]:
from sklearn.model_selection import TimeSeriesSplit

In [113]:
## 何分割するか
tscv = TimeSeriesSplit(n_splits = 10)
## モデルを保存するリストの作成
model_list = []


for fold,(train, test) in enumerate(tscv.split(train_df)):
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    model = RandomForestRegressor(random_state = 30)
    ## 学習
    model.fit(X_train,y_train)
    # 予測
    pred = model.predict(X_test)
    ## 検証データに対する誤差を求める
    score = np.sqrt(MSE(y_test,pred))
    print(f'fold{fold} RMSE: {score}')
    #モデルを保存
    model_list.append(model)

fold0 RMSE: 14.813114909880815
fold1 RMSE: 20.754034708139688
fold2 RMSE: 15.571967013407992
fold3 RMSE: 36.246398119905194
fold4 RMSE: 11.348950612281296
fold5 RMSE: 21.955851080444745
fold6 RMSE: 24.75516902251595
fold7 RMSE: 9.329173596841256
fold8 RMSE: 8.901802251978717
fold9 RMSE: 11.996288870591052


### テストデータの読み込み

In [73]:
test_df = pd.read_csv(test_path)

In [74]:
test_df['kcal'] = test_df['kcal'].fillna(method = 'ffill')
test_df = test_df.fillna({'payday': 0.0, 'remarks':'特になし', 'event' : '特になし'})

In [75]:
le_2 = le.fit(test_df['weather'])
test_df['weather_label'] = le_2.transform(test_df['weather'])
test_df['weather_label'] = test_df['weather_label'].astype('category')

In [76]:
le_1 = le.fit(test_df['name'])
test_df['name_label'] = le_1.transform(test_df['name'])
test_df['name_label'] = test_df['name_label'].astype('category')

In [77]:
test_df['remarks_b']  = test_df['remarks']
test_df.loc[test_df.remarks != '特になし','remarks_b']='1'
test_df.loc[test_df.remarks == '特になし','remarks_b']='0'
test_df['remarks_b'] = test_df['remarks_b'].astype('category')

In [78]:
### いらないものを消去
test_df = test_df.drop(['dateid','name','remarks','weather'],axis = 1)

In [79]:
## 週データのダミー変数化(ワンホット)
test_df = pd.get_dummies(test_df,columns=['week'])
##13時開始お弁当持ち込み可の社内イベント'のダミー変数化(ワンホット)
test_df = pd.get_dummies(test_df,columns=['event'])

In [80]:
test_df = test_df.values

In [81]:
test_df

array([[0, 428.0, 0.0, ..., 0, 0, 1],
       [0, 420.0, 0.0, ..., 0, 0, 1],
       [0, 456.0, 0.0, ..., 0, 0, 1],
       ...,
       [0, 505.0, 0.0, ..., 0, 0, 1],
       [1, 485.0, 0.0, ..., 0, 0, 1],
       [0, 508.0, 0.0, ..., 0, 0, 1]], dtype=object)

In [116]:
## 結果を辞書に保存
solution = {}
## 各モデルで予測
for i, model in enumerate(model_list):
    test_pred = model.predict(test_df)
    solution[str(i)+"_model"] = test_pred

solution = pd.DataFrame(solution)
solution = solution.drop(['0_model','1_model','2_model','3_model', '5_model','6_model','9_model'],axis = 1)
solution

Unnamed: 0,4_model,7_model,8_model
0,71.96,70.37,71.64
1,86.41,87.66,83.57
2,84.28,72.15,73.44
3,65.46,69.85,62.82
4,112.75,117.01,119.64
...,...,...,...
139,56.48,59.54,62.92
140,48.80,50.72,49.33
141,66.10,64.59,64.65
142,58.47,59.67,60.78


In [117]:
solution_mean = solution.mean(axis=1).values

In [118]:
solution_mean.shape

(144,)

In [121]:
submit = pd.read_csv(submit_path,header=None)
submit[1] = solution_mean
submit.to_csv("submit22.csv", index=False, header=False)

In [122]:
solution_mean

array([ 71.32333333,  85.88      ,  76.62333333,  66.04333333,
       116.46666667,  63.32666667,  53.88      ,  56.25333333,
        87.53      ,  56.38666667,  49.52333333, 117.65333333,
        65.34      ,  56.26666667,  57.91      ,  63.89333333,
        59.65666667,  58.14      , 117.34333333,  59.94333333,
        61.33      ,  58.23333333,  57.69      ,  44.71      ,
        59.36      ,  56.43666667,  61.73      ,  56.94333333,
       114.87666667,  58.48333333,  58.80333333,  63.01      ,
        54.46      ,  59.73      ,  57.06      ,  58.64      ,
        56.17      , 114.95666667,  60.64333333,  58.74666667,
        58.46333333,  55.30666667,  56.51333333,  58.44333333,
        57.22333333,  56.28666667,  57.41666667, 113.64333333,
        60.41666667,  63.06      ,  52.92666667,  48.85666667,
        59.19      ,  59.75666667, 117.51666667,  61.65      ,
        58.88666667,  57.85333333,  43.83333333,  65.19      ,
        58.72333333,  63.83666667,  53.13333333, 115.16

### LightGBMで検証 →いまいち

In [101]:
import lightgbm as lgb

In [109]:
## 何分割するか
tscv = TimeSeriesSplit(n_splits = 5)
## モデルを保存するリストの作成
model_list = []


for fold,(train, test) in enumerate(tscv.split(train_df)):
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    
    # 学習用
    lgb_train = lgb.Dataset(X_train, y_train,
                            free_raw_data=False)
    # 検証用
    lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train,
                           free_raw_data=False)
    
    # パラメータを設定
    params = {'boosting_type': 'gbdt',        # 勾配ブースティング
              'objective': 'regression',      # 目的関数：回帰
              'metric': 'rmse',               # 分類モデルの性能を測る指標
              'learning_rate': 0.1 }          # 学習率（初期値0.1）
    
    # 学習
    model = lgb.train(params,                            # 上記で設定したパラメータ
                      lgb_train,                         # 使用するデータセット
                      num_boost_round=10000,              # 学習の回数
                      valid_sets= lgb_test,  # モデル検証のデータセット
                      early_stopping_rounds=20,          # アーリーストッピング
                      verbose_eval=0)                    # 学習の経過の非表示
    
    # 予測
    pred = model.predict(X_test)
    ## 検証データに対する誤差を求める
    score = np.sqrt(MSE(y_test,pred))
    print(f'fold{fold} RMSE: {score}')
    
    #モデルを保存
    model_list.append(model)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 57, number of used features: 6
[LightGBM] [Info] Start training from score 76.929825
fold0 RMSE: 20.68730690840719
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 154
[LightGBM] [Info] Number of data points in the train set: 113, number of used features: 11
[LightGBM] [Info] Start training from score 69.389381
fold1 RMSE: 24.532409840645073
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 200
[LightGBM] [Info] Number of data points in the train set: 169, number of used features: 12
[LightGBM] [Info] Start training from score 63.857988
fold2 RMSE: 17.377306088980486
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info]



In [108]:
## 結果を辞書に保存
solution = {}
## 各モデルで予測
for i, model in enumerate(model_list):
    test_pred = model.predict(test_df)
    solution[str(i)+"_model"] = test_pred

solution = pd.DataFrame(solution)
solution

Unnamed: 0,0_model,1_model,2_model,3_model,4_model
0,69.929004,69.642750,64.939862,67.219024,76.643309
1,75.514397,69.642750,64.621189,63.463855,67.039925
2,75.514397,69.195442,64.816921,62.330235,62.725882
3,69.929004,69.195442,65.909572,61.386107,67.452638
4,75.112006,69.195442,78.593622,92.209503,115.607502
...,...,...,...,...,...
139,75.112006,69.642750,57.677624,59.618003,64.041242
140,87.727146,71.300442,57.677624,62.017127,54.443403
141,69.526614,69.642750,61.020730,61.422597,59.975752
142,69.526614,69.195442,59.054820,62.077914,58.592059
