In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
###各データのパス###
# 訓練用データ
train_path = './competition/' + 'train' + '.csv'
# テストデータ
test_path = './competition/' + 'test' + '.csv'
# 提出用データ
submit_path = './competition/' + 'submit_sample' + '.csv'

In [3]:
# 訓練用データの読み込み
train_df = pd.read_csv(train_path)

In [4]:
print(train_df)

         dateid      y week  soldout           name   kcal   remarks event  \
0    2014-04-01  125.0  Tue        1          メンチカツ  377.0       NaN   NaN   
1    2014-04-02   99.0  Wed        0           肉じゃが  380.0       NaN   NaN   
2    2014-04-03  100.0  Thu        1             酢豚  390.0       NaN   NaN   
3    2014-04-04   90.0  Fri        1     和風ソースハンバーグ  350.0       NaN   NaN   
4    2014-04-07  107.0  Mon        0        青梗菜牛肉炒め  387.0       NaN   NaN   
..          ...    ...  ...      ...            ...    ...       ...   ...   
332  2015-08-21  137.0  Fri        0         チキンカレー    NaN  お楽しみメニュー   NaN   
333  2015-09-01   80.0  Tue        1     自家製手作りトンカツ  423.0    弁当種類増↓   NaN   
334  2015-09-02   74.0  Wed        0       チーズメンチカツ  463.0       NaN   NaN   
335  2015-09-03   66.0  Thu        1  鶏肉黒胡椒焼きおろしソース  420.0       NaN   NaN   
336  2015-09-04   60.0  Fri        1         汁なし担担麺  435.0       NaN   NaN   

     payday       weather  precipitation  temperature  
0      

### 欠損値処理

In [5]:
### 欠損値の確認(kcal:33，remarks:288, event:309, payday:320)→4つのcolumnの欠損値が確認できた
print(train_df.isnull().sum())

dateid             0
y                  0
week               0
soldout            0
name               0
kcal              33
remarks          288
event            309
payday           320
weather            0
precipitation      0
temperature        0
dtype: int64


In [6]:
## kcal(float64)の欠損値→　①平均値で補完する、②前後の値を比較して補完する
print(train_df['kcal'].value_counts())
print('--------------------------------')
### remarks(objects)の欠損値　→　'特になし'で補完する(一応)
print(train_df['remarks'].value_counts())
print('--------------------------------')
## event(objects)の欠損値　→　'特になし'で補完する
print(train_df['event'].value_counts())
print('--------------------------------')
## payday(float64)の欠損値　→　給料日の以外は0.0とする
print(train_df['payday'].value_counts())

420.0    27
430.0    21
435.0    13
450.0    13
400.0    13
         ..
388.0     1
324.0     1
401.0     1
355.0     1
463.0     1
Name: kcal, Length: 81, dtype: int64
--------------------------------
お楽しみメニュー          28
料理長のこだわりメニュー       7
コンビニ改装             4
料理長イチオシ！           2
今月のイチオシ!           2
今月のこだわりメニュー        2
スペシャルメニュー          1
近隣に飲食店複合ビルオープン     1
料理長イチオシ!           1
弁当種類増↓             1
Name: remarks, dtype: int64
--------------------------------
ママの会             14
キャリアアップ支援セミナー    14
Name: event, dtype: int64
--------------------------------
1.0    17
Name: payday, dtype: int64


In [7]:
### 訓練用データの欠損値の補完(バージョン1)
train_df['kcal'] = train_df['kcal'].fillna(method = 'ffill')
train_df = train_df.fillna({'payday': 0.0, 'remarks':'特になし', 'event' : '特になし'})

In [11]:
train_df

Unnamed: 0,dateid,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature
0,2014-04-01,125.0,Tue,1,メンチカツ,377.0,特になし,特になし,0.0,晴時々薄曇,0.0,13.9
1,2014-04-02,99.0,Wed,0,肉じゃが,380.0,特になし,特になし,0.0,薄曇一時晴,0.0,15.2
2,2014-04-03,100.0,Thu,1,酢豚,390.0,特になし,特になし,0.0,大雨,65.0,13.8
3,2014-04-04,90.0,Fri,1,和風ソースハンバーグ,350.0,特になし,特になし,0.0,晴時々曇一時雨、雷を伴う,40.0,15.3
4,2014-04-07,107.0,Mon,0,青梗菜牛肉炒め,387.0,特になし,特になし,0.0,晴後一時薄曇,0.0,11.4
...,...,...,...,...,...,...,...,...,...,...,...,...
332,2015-08-21,137.0,Fri,0,チキンカレー,418.0,お楽しみメニュー,特になし,0.0,曇後一時雨,2.5,26.0
333,2015-09-01,80.0,Tue,1,自家製手作りトンカツ,423.0,弁当種類増↓,特になし,0.0,雨後曇、雷を伴う,12.0,23.7
334,2015-09-02,74.0,Wed,0,チーズメンチカツ,463.0,特になし,特になし,0.0,曇時々雨後晴,5.5,26.9
335,2015-09-03,66.0,Thu,1,鶏肉黒胡椒焼きおろしソース,420.0,特になし,特になし,0.0,曇後一時雨,1.5,25.7


In [9]:
print(train_df.describe())

                y     soldout        kcal      payday  precipitation  \
count  337.000000  337.000000  337.000000  337.000000     337.000000   
mean    65.192878    0.462908  419.189911    0.050445       5.376855   
std     20.645712    0.499364   24.745120    0.219187      14.342239   
min     25.000000    0.000000  324.000000    0.000000       0.000000   
25%     53.000000    0.000000  410.000000    0.000000       0.000000   
50%     62.000000    0.000000  423.000000    0.000000       0.000000   
75%     72.000000    1.000000  435.000000    0.000000       4.500000   
max    137.000000    1.000000  463.000000    1.000000     123.500000   

       temperature  
count   337.000000  
mean     18.520772  
std       7.708158  
min       2.000000  
25%      12.300000  
50%      20.300000  
75%      24.000000  
max      31.500000  


### 各カラムとyの関係を簡単に調査

In [12]:
train_df.groupby('remarks').max()

Unnamed: 0_level_0,dateid,y,week,soldout,name,kcal,event,payday,weather,precipitation,temperature
remarks,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
お楽しみメニュー,2015-08-21,137.0,Wed,1,牛丼,455.0,特になし,1.0,雨後曇,13.0,31.5
コンビニ改装,2015-08-17,63.0,Wed,1,親子煮,423.0,特になし,1.0,雨時々曇,33.5,29.4
スペシャルメニュー,2014-09-26,45.0,Fri,0,ランチビュッフェ,394.0,キャリアアップ支援セミナー,0.0,晴,0.0,22.9
今月のこだわりメニュー,2015-07-22,70.0,Wed,1,自家製手作りトンカツ,422.0,特になし,0.0,曇一時雨,14.5,28.9
今月のイチオシ!,2015-06-25,53.0,Thu,0,自家製手作りトンカツ,455.0,特になし,0.0,薄曇,0.0,24.3
弁当種類増↓,2015-09-01,80.0,Tue,1,自家製手作りトンカツ,423.0,特になし,0.0,雨後曇、雷を伴う,12.0,23.7
料理長のこだわりメニュー,2014-12-17,66.0,Wed,1,豚ロースのピザ風チーズ焼き,450.0,特になし,0.0,雨一時曇,19.5,31.1
料理長イチオシ!,2015-04-21,76.0,Tue,1,ミックスグリル,450.0,特になし,0.0,曇,0.5,17.5
料理長イチオシ！,2015-03-10,90.0,Wed,1,牛肉とキノコの赤ワイン煮,440.0,特になし,1.0,雨、みぞれを伴う,10.5,7.7
特になし,2015-09-04,125.0,Wed,1,麻婆豆腐,463.0,特になし,1.0,雨時々曇一時雪,123.5,31.1


In [13]:
train_df.groupby('weather').sum()

Unnamed: 0_level_0,y,soldout,kcal,payday,precipitation,temperature
weather,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
大雨,297.0,2,1250.0,0.0,240.5,51.2
大雨一時曇,47.0,1,421.0,0.0,35.5,5.4
大雨後一時曇,65.0,1,420.0,0.0,67.5,21.9
大雨後晴,56.0,1,400.0,0.0,123.5,21.3
快晴,1870.0,16,13526.0,2.0,0.0,412.8
...,...,...,...,...,...,...
雨後曇、雷を伴う,80.0,1,423.0,0.0,12.0,23.7
雨後曇一時晴,70.0,1,428.0,0.0,5.5,19.3
雨時々みぞれ,89.0,0,433.0,0.0,9.0,2.6
雨時々曇,741.0,7,4843.0,0.0,102.5,248.1


In [1]:
train_df.groupby('name').sum().sort_values('y',ascending=False).head(5)

NameError: name 'train_df' is not defined

## データの前処理

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
le = LabelEncoder()

### 前処理1(天気とメニューをラベルエンコーディング)

In [18]:
le_1 = le.fit(train_df['weather'])
train_df['weather_label'] = le_1.transform(train_df['weather'])

In [19]:
le_2 = le.fit(train_df['name'])
train_df['name_label'] = le_1.transform(train_df['name'])

In [20]:
train_df

Unnamed: 0,dateid,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature,weather_label,name_label
0,2014-04-01,125.0,Tue,1,メンチカツ,377.0,特になし,特になし,0.0,晴時々薄曇,0.0,13.9,22,98
1,2014-04-02,99.0,Wed,0,肉じゃが,380.0,特になし,特になし,0.0,薄曇一時晴,0.0,15.2,47,158
2,2014-04-03,100.0,Thu,1,酢豚,390.0,特になし,特になし,0.0,大雨,65.0,13.8,0,183
3,2014-04-04,90.0,Fri,1,和風ソースハンバーグ,350.0,特になし,特になし,0.0,晴時々曇一時雨、雷を伴う,40.0,15.3,19,109
4,2014-04-07,107.0,Mon,0,青梗菜牛肉炒め,387.0,特になし,特になし,0.0,晴後一時薄曇,0.0,11.4,9,185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,2015-08-21,137.0,Fri,0,チキンカレー,418.0,お楽しみメニュー,特になし,0.0,曇後一時雨,2.5,26.0,32,52
333,2015-09-01,80.0,Tue,1,自家製手作りトンカツ,423.0,弁当種類増↓,特になし,0.0,雨後曇、雷を伴う,12.0,23.7,57,163
334,2015-09-02,74.0,Wed,0,チーズメンチカツ,463.0,特になし,特になし,0.0,曇時々雨後晴,5.5,26.9,44,66
335,2015-09-03,66.0,Thu,1,鶏肉黒胡椒焼きおろしソース,420.0,特になし,特になし,0.0,曇後一時雨,1.5,25.7,32,209


### 前処理2 ('weather'をカウントエンコーディング，'name'をラベルエンコーディング)

In [60]:
train_df['weather_count'] = train_df.groupby('weather')['weather'].transform('count')
le_2 = le.fit(train_df['name'])
train_df['name_label'] = le_2.transform(train_df['name'])

#### 特記事項があれば'1'を、なければ'0'を出力

In [21]:
train_df['remarks_b']  = train_df['remarks']
train_df.loc[train_df.remarks != '特になし','remarks_b']='1'
train_df.loc[train_df.remarks == '特になし','remarks_b']='0'

In [22]:
##### 訓練用データの現状を確認
train_df

Unnamed: 0,dateid,y,week,soldout,name,kcal,remarks,event,payday,weather,precipitation,temperature,weather_label,name_label,remarks_b
0,2014-04-01,125.0,Tue,1,メンチカツ,377.0,特になし,特になし,0.0,晴時々薄曇,0.0,13.9,22,98,0
1,2014-04-02,99.0,Wed,0,肉じゃが,380.0,特になし,特になし,0.0,薄曇一時晴,0.0,15.2,47,158,0
2,2014-04-03,100.0,Thu,1,酢豚,390.0,特になし,特になし,0.0,大雨,65.0,13.8,0,183,0
3,2014-04-04,90.0,Fri,1,和風ソースハンバーグ,350.0,特になし,特になし,0.0,晴時々曇一時雨、雷を伴う,40.0,15.3,19,109,0
4,2014-04-07,107.0,Mon,0,青梗菜牛肉炒め,387.0,特になし,特になし,0.0,晴後一時薄曇,0.0,11.4,9,185,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,2015-08-21,137.0,Fri,0,チキンカレー,418.0,お楽しみメニュー,特になし,0.0,曇後一時雨,2.5,26.0,32,52,1
333,2015-09-01,80.0,Tue,1,自家製手作りトンカツ,423.0,弁当種類増↓,特になし,0.0,雨後曇、雷を伴う,12.0,23.7,57,163,1
334,2015-09-02,74.0,Wed,0,チーズメンチカツ,463.0,特になし,特になし,0.0,曇時々雨後晴,5.5,26.9,44,66,0
335,2015-09-03,66.0,Thu,1,鶏肉黒胡椒焼きおろしソース,420.0,特になし,特になし,0.0,曇後一時雨,1.5,25.7,32,209,0


In [23]:
### 説明変数から不要なものを削除(前処理2の場合)
## 説明変数
X = train_df.drop(['dateid','y','name','remarks','weather'],axis = 1)
## 目的変数
y = train_df['y']

In [24]:
print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337 entries, 0 to 336
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   week           337 non-null    object 
 1   soldout        337 non-null    int64  
 2   kcal           337 non-null    float64
 3   event          337 non-null    object 
 4   payday         337 non-null    float64
 5   precipitation  337 non-null    float64
 6   temperature    337 non-null    float64
 7   weather_label  337 non-null    int32  
 8   name_label     337 non-null    int32  
 9   remarks_b      337 non-null    object 
dtypes: float64(4), int32(2), int64(1), object(3)
memory usage: 23.8+ KB
None


In [25]:
X['remarks_b'] = X['remarks_b'].astype('int64')
print(X.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337 entries, 0 to 336
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   week           337 non-null    object 
 1   soldout        337 non-null    int64  
 2   kcal           337 non-null    float64
 3   event          337 non-null    object 
 4   payday         337 non-null    float64
 5   precipitation  337 non-null    float64
 6   temperature    337 non-null    float64
 7   weather_label  337 non-null    int32  
 8   name_label     337 non-null    int32  
 9   remarks_b      337 non-null    int64  
dtypes: float64(4), int32(2), int64(2), object(2)
memory usage: 23.8+ KB
None


In [26]:
## 週データのダミー変数化
X = pd.get_dummies(X,columns=['week'])

In [27]:
##13時開始お弁当持ち込み可の社内イベント'のダミー変数化
X = pd.get_dummies(X,columns=['event'])
print(X)

     soldout   kcal  payday  precipitation  temperature  weather_label  \
0          1  377.0     0.0            0.0         13.9             22   
1          0  380.0     0.0            0.0         15.2             47   
2          1  390.0     0.0           65.0         13.8              0   
3          1  350.0     0.0           40.0         15.3             19   
4          0  387.0     0.0            0.0         11.4              9   
..       ...    ...     ...            ...          ...            ...   
332        0  418.0     0.0            2.5         26.0             32   
333        1  423.0     0.0           12.0         23.7             57   
334        0  463.0     0.0            5.5         26.9             44   
335        1  420.0     0.0            1.5         25.7             32   
336        1  435.0     0.0           14.0         24.7             33   

     name_label  remarks_b  week_Fri  week_Mon  week_Thu  week_Tue  week_Wed  \
0            98          0     

In [28]:
###特徴量重要度を測る指標の作成
X_feature = np.array(list(X.columns))

In [29]:
## 値の要素化
X = X.values
y = y.values

In [30]:
print(y.shape)

(337,)


In [31]:
from sklearn.linear_model import LinearRegression as LR

In [32]:
X_train,X_test, y_train,y_test = X[:270],X[270:],y[:270],y[270:]

### 線形回帰　→　質的変数が多くあまり向いていないかも

In [23]:
model = LR()
model.fit(X_train,y_train)

In [41]:
from sklearn.metrics import mean_squared_error as MSE

In [25]:
#予測値の算出
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
#MSEの算出
mse_train = MSE(y_train, y_pred_train)
mse_test = MSE(y_test, y_pred_test)
#RMSEの算出
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)
#RMSEの表示
print(rmse_train)
print(rmse_test)

17.323901440486114
18.293028063831915


### ランダムフォレスト

In [33]:
from sklearn.ensemble import RandomForestRegressor

In [96]:
params = {"n_estimators": [10, 50,100, 300, 500], 
          "max_depth": [3,5,10,15, 20,None], 
          "max_features": ["sqrt", "log2",None]}

In [37]:
## ハイパラメータ探索
from sklearn.model_selection import GridSearchCV

In [514]:
model2 = RandomForestRegressor(random_state = 30)

In [515]:
model3 = GridSearchCV(model2, params,scoring="neg_mean_squared_error")

In [516]:
model3.fit(X_train,y_train)
print(model3.best_params_)

{'max_depth': 3, 'max_features': None, 'n_estimators': 50}


In [517]:
forest = model3.best_estimator_

In [518]:
#予測値の算出
y_pred_train = forest.predict(X_train)
y_pred_test = forest.predict(X_test)
#MSEの算出
mse_train = MSE(y_train, y_pred_train)
mse_test = MSE(y_test, y_pred_test)
#RMSEの算出
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)
#RMSEの表示
print('訓練データに対しては'+ ' ' + str(rmse_train))
print('評価データに対しては'+ ' ' + str(rmse_test))

訓練データに対しては 13.3521383435843
評価データに対しては 16.971582416121947


### 前処理において'name'をラベルエンコーディング，'weather'をカウントエンコーディングとした場合

In [92]:
model2 = RandomForestRegressor(random_state = 45)

In [97]:
model3 = GridSearchCV(model2, params,scoring="neg_mean_squared_error")

In [98]:
model3.fit(X_train,y_train)
print(model3.best_params_)

{'max_depth': 3, 'max_features': None, 'n_estimators': 10}


In [99]:
forest = model3.best_estimator_

In [100]:
#予測値の算出
y_pred_train = forest.predict(X_train)
y_pred_test = forest.predict(X_test)
#MSEの算出
mse_train = MSE(y_train, y_pred_train)
mse_test = MSE(y_test, y_pred_test)
#RMSEの算出
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)
#RMSEの表示
print('訓練データに対しては'+ ' ' + str(rmse_train))
print('評価データに対しては'+ ' ' + str(rmse_test))

訓練データに対しては 13.549414174285275
評価データに対しては 11.874935206148713


#### パラメータ探索なし(デフォルト)

In [75]:
model_d = RandomForestRegressor(random_state = 23)
model_d.fit(X_train,y_train)

In [76]:
#予測値の算出
y_pred_train = model_d.predict(X_train)
y_pred_test = model_d.predict(X_test)
#MSEの算出
mse_train = MSE(y_train, y_pred_train)
mse_test = MSE(y_test, y_pred_test)
#RMSEの算出
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)
#RMSEの表示
print('訓練データに対しては'+ ' ' + str(rmse_train))
print('評価データに対しては'+ ' ' + str(rmse_test))

訓練データに対しては 5.861043390867332
評価データに対しては 10.815385116762167


### 特徴量重要度の確認

In [None]:
feature_importance = pd.DataFrame({'feature':,'importances':model_d.feature_importances_}).sort_values(by="importances", ascending=False)

### 前処理において'name'をラベルエンコーディング，'weather'をラベルエンコーディングとした場合

In [82]:
model2 = RandomForestRegressor(random_state = 65)

In [83]:
params = {"n_estimators": [10, 50,100, 300, 500], 
          "max_depth": [3,5,10,15, 20,None], 
          "max_features": ["sqrt", "log2",None]}

In [84]:
model4 = GridSearchCV(model2, params,scoring="neg_mean_squared_error")

In [85]:
model4.fit(X_train,y_train)
print(model4.best_params_)

{'max_depth': 3, 'max_features': None, 'n_estimators': 10}


In [86]:
forest2 = model4.best_estimator_
#予測値の算出
y_pred_train = forest2.predict(X_train)
y_pred_test = forest2.predict(X_test)
#MSEの算出
mse_train = MSE(y_train, y_pred_train)
mse_test = MSE(y_test, y_pred_test)
#RMSEの算出
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)
#RMSEの表示
print('訓練データに対しては'+ ' ' + str(rmse_train))
print('評価データに対しては'+ ' ' + str(rmse_test))

訓練データに対しては 13.5163796499478
評価データに対しては 11.119677015666902


#### パラメータ探索なし(探索なし)

In [43]:
model_d2 = RandomForestRegressor(random_state = 70)

In [44]:
model_d2.fit(X_train,y_train)

In [92]:
#予測値の算出
y_pred_train = model_d2.predict(X_train)
y_pred_test = model_d2.predict(X_test)
#MSEの算出
mse_train = MSE(y_train, y_pred_train)
mse_test = MSE(y_test, y_pred_test)
#RMSEの算出
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)
#RMSEの表示
print('訓練データに対しては'+ ' ' + str(rmse_train))
print('評価データに対しては'+ ' ' + str(rmse_test))

訓練データに対しては 5.546769763999117
評価データに対しては 10.55591777156302


In [51]:
feature_importance = pd.DataFrame({'feature':X_feature,'importances': model_d2.feature_importances_}).sort_values(by="importances", ascending=False)

In [52]:
### 特徴量重要度の図示
feature_importance

Unnamed: 0,feature,importances
7,remarks_b,0.206514
8,week_Fri,0.168902
6,name_label,0.151057
1,kcal,0.141163
4,temperature,0.130728
5,weather_label,0.08335
3,precipitation,0.049197
0,soldout,0.014732
9,week_Mon,0.013315
10,week_Thu,0.008936


### 評価用データの読み込み

In [60]:
test_df = pd.read_csv(test_path)
print(test_df)

         dateid week  soldout            name   kcal   remarks event  payday  \
0    2015-09-07  Mon        0       豚肉の柚子胡椒炒め  428.0       NaN   NaN     NaN   
1    2015-09-08  Tue        0         鶏肉の塩麹焼き  420.0       NaN   NaN     NaN   
2    2015-09-09  Wed        0       サバの味醂干し焼き  456.0       NaN   NaN     NaN   
3    2015-09-10  Thu        1            塩唐揚げ  439.0       NaN   NaN     1.0   
4    2015-09-11  Fri        0  カレーライス(ポークカレー)    NaN  お楽しみメニュー   NaN     NaN   
..          ...  ...      ...             ...    ...       ...   ...     ...   
139  2016-05-19  Thu        1        豚肉ザーサイ豆腐  490.0       NaN   NaN     NaN   
140  2016-05-20  Fri        1          キーマカレー  485.0       NaN  ママの会     NaN   
141  2016-05-23  Mon        0         鶏肉の照り焼き  505.0       NaN   NaN     NaN   
142  2016-05-24  Tue        1         海鮮チリソース  485.0       NaN   NaN     NaN   
143  2016-05-25  Wed        0     ぶり照り焼き・根菜添え  508.0       NaN   NaN     NaN   

     weather  precipitation  temperatur

In [78]:
test_df['kcal'] = test_df['kcal'].fillna(method = 'ffill')
test_df = test_df.fillna({'payday': 0.0, 'remarks':'特になし', 'event' : '特になし'})
# 名前をラベルエンコーディング
le = le.fit(test_df['name'])
test_df['name_label'] = le.transform(test_df['name'])
# 天気をカウントエンコーディング
test_df['weather_count'] = test_df.groupby('weather')['weather'].transform('count')

In [61]:
test_df['kcal'] = test_df['kcal'].fillna(method = 'ffill')
test_df = test_df.fillna({'payday': 0.0, 'remarks':'特になし', 'event' : '特になし'})

In [62]:
le_1 = le_1.fit(test_df['name'])
test_df['name_label'] = le_1.transform(test_df['name'])

In [63]:
le_2 = le_2.fit(test_df['weather'])
test_df['weather_label'] = le_2.transform(test_df['weather'])

In [64]:
print(test_df)

         dateid week  soldout            name   kcal   remarks event  payday  \
0    2015-09-07  Mon        0       豚肉の柚子胡椒炒め  428.0      特になし  特になし     0.0   
1    2015-09-08  Tue        0         鶏肉の塩麹焼き  420.0      特になし  特になし     0.0   
2    2015-09-09  Wed        0       サバの味醂干し焼き  456.0      特になし  特になし     0.0   
3    2015-09-10  Thu        1            塩唐揚げ  439.0      特になし  特になし     1.0   
4    2015-09-11  Fri        0  カレーライス(ポークカレー)  439.0  お楽しみメニュー  特になし     0.0   
..          ...  ...      ...             ...    ...       ...   ...     ...   
139  2016-05-19  Thu        1        豚肉ザーサイ豆腐  490.0      特になし  特になし     0.0   
140  2016-05-20  Fri        1          キーマカレー  485.0      特になし  ママの会     0.0   
141  2016-05-23  Mon        0         鶏肉の照り焼き  505.0      特になし  特になし     0.0   
142  2016-05-24  Tue        1         海鮮チリソース  485.0      特になし  特になし     0.0   
143  2016-05-25  Wed        0     ぶり照り焼き・根菜添え  508.0      特になし  特になし     0.0   

     weather  precipitation  temperatur

In [65]:
test_df = test_df.drop(['dateid'],axis =1)

In [66]:
print(test_df)

    week  soldout            name   kcal   remarks event  payday  weather  \
0    Mon        0       豚肉の柚子胡椒炒め  428.0      特になし  特になし     0.0     雨時々曇   
1    Tue        0         鶏肉の塩麹焼き  420.0      特になし  特になし     0.0       大雨   
2    Wed        0       サバの味醂干し焼き  456.0      特になし  特になし     0.0  大雨、雷を伴う   
3    Thu        1            塩唐揚げ  439.0      特になし  特になし     1.0     雨一時曇   
4    Fri        0  カレーライス(ポークカレー)  439.0  お楽しみメニュー  特になし     0.0     晴一時曇   
..   ...      ...             ...    ...       ...   ...     ...      ...   
139  Thu        1        豚肉ザーサイ豆腐  490.0      特になし  特になし     0.0     晴後薄曇   
140  Fri        1          キーマカレー  485.0      特になし  ママの会     0.0        曇   
141  Mon        0         鶏肉の照り焼き  505.0      特になし  特になし     0.0   薄曇後時々晴   
142  Tue        1         海鮮チリソース  485.0      特になし  特になし     0.0    薄曇一時晴   
143  Wed        0     ぶり照り焼き・根菜添え  508.0      特になし  特になし     0.0        曇   

     precipitation  temperature  name_label  weather_label  
0             

In [67]:
test_df['remarks_b'] = test_df['remarks']

In [68]:
test_df['remarks_b']  = test_df['remarks']
test_df.loc[test_df.remarks != '特になし','remarks_b']='1'
test_df.loc[test_df.remarks == '特になし','remarks_b']='0'
print(test_df)

    week  soldout            name   kcal   remarks event  payday  weather  \
0    Mon        0       豚肉の柚子胡椒炒め  428.0      特になし  特になし     0.0     雨時々曇   
1    Tue        0         鶏肉の塩麹焼き  420.0      特になし  特になし     0.0       大雨   
2    Wed        0       サバの味醂干し焼き  456.0      特になし  特になし     0.0  大雨、雷を伴う   
3    Thu        1            塩唐揚げ  439.0      特になし  特になし     1.0     雨一時曇   
4    Fri        0  カレーライス(ポークカレー)  439.0  お楽しみメニュー  特になし     0.0     晴一時曇   
..   ...      ...             ...    ...       ...   ...     ...      ...   
139  Thu        1        豚肉ザーサイ豆腐  490.0      特になし  特になし     0.0     晴後薄曇   
140  Fri        1          キーマカレー  485.0      特になし  ママの会     0.0        曇   
141  Mon        0         鶏肉の照り焼き  505.0      特になし  特になし     0.0   薄曇後時々晴   
142  Tue        1         海鮮チリソース  485.0      特になし  特になし     0.0    薄曇一時晴   
143  Wed        0     ぶり照り焼き・根菜添え  508.0      特になし  特になし     0.0        曇   

     precipitation  temperature  name_label  weather_label remarks_b  
0   

In [69]:
### いらないものを消去
test_df = test_df.drop(['name','remarks','weather'],axis = 1)

In [87]:
print(test_df)

     soldout   kcal  payday  precipitation  temperature  name_label  \
0          0  428.0     0.0           19.0         23.0          85   
1          0  420.0     0.0           65.5         20.0         100   
2          0  456.0     0.0          156.5         22.4          28   
3          1  439.0     1.0           50.0         21.8          59   
4          0  439.0     0.0            0.0         23.9          24   
..       ...    ...     ...            ...          ...         ...   
139        1  490.0     0.0            0.0         19.3          89   
140        1  485.0     0.0            0.0         17.2          25   
141        0  505.0     0.0            0.0         23.5         102   
142        1  485.0     0.0            0.0         23.4          66   
143        0  508.0     0.0            0.0         22.2          13   

     weather_label  remarks_b  week_Fri  week_Mon  week_Thu  week_Tue  \
0               44          0         0         1         0         0   
1

In [88]:
test_df['remarks_b'] = test_df['remarks_b'].astype('int64')

In [72]:
test_df = pd.get_dummies(test_df)

In [73]:
print(test_df)

     soldout   kcal  payday  precipitation  temperature  name_label  \
0          0  428.0     0.0           19.0         23.0          85   
1          0  420.0     0.0           65.5         20.0         100   
2          0  456.0     0.0          156.5         22.4          28   
3          1  439.0     1.0           50.0         21.8          59   
4          0  439.0     0.0            0.0         23.9          24   
..       ...    ...     ...            ...          ...         ...   
139        1  490.0     0.0            0.0         19.3          89   
140        1  485.0     0.0            0.0         17.2          25   
141        0  505.0     0.0            0.0         23.5         102   
142        1  485.0     0.0            0.0         23.4          66   
143        0  508.0     0.0            0.0         22.2          13   

     weather_label  remarks_b  week_Fri  week_Mon  week_Thu  week_Tue  \
0               44          0         0         1         0         0   
1

In [74]:
test = test_df.values

In [75]:
pred = model_d2.predict(test)

In [89]:
pred2 = forest2.predict(test)

In [80]:
print(pred)

[ 70.29  67.1   62.18  68.37 110.4   63.42  60.71  55.86  66.2   55.15
  57.91 120.04  69.15  54.74  62.61  75.32  64.2   61.49 110.42  66.93
  63.79  60.56  62.87  46.44  66.37  57.01  65.94  65.99 118.57  56.73
  61.11  68.24  53.94  59.94  67.57  63.52  58.97 110.39  67.64  64.85
  64.66  62.94  67.36  65.34  61.57  62.91  61.05 110.65  65.29  63.84
  53.43  50.84  66.91  77.28 110.36  66.72  65.45  63.17  44.93  73.21
  66.47  68.27  58.37 111.9   70.84  69.38  65.62  58.77  46.83  67.32
  60.31  64.02  61.37 111.77  67.8   67.    65.64  61.89  60.9   61.04
  62.79 109.94  61.53  65.04  64.42  66.35  49.14  65.48  68.37  66.44
  63.03  72.13  59.9   67.42  60.53  49.49  72.58  63.9   64.61  61.43
 111.08  65.08  63.6   55.59  50.33  63.41  63.7   62.51  58.78 111.14
  63.64  61.3   56.98  73.41  53.93  63.73  58.06  58.    61.4  110.04
  64.47  60.33  61.3   61.9  110.97  64.05  62.07  57.74  69.19  66.78
  52.46  65.57  62.04  58.95  59.1  110.71  63.83  69.61  55.89  61.41
  49.6

In [90]:
print(pred2)

[ 61.74561673  64.62245823  60.78803026  62.40791277 118.57751094
  59.02875749  59.02875749  58.73346705  63.66487175  57.06778559
  59.69105352 114.58045211  59.02875749  58.07117101  59.69105352
  64.59715422  61.40804861  61.40804861 118.57751094  59.69105352
  62.39093909  61.40804861  61.40804861  57.04248159  59.69105352
  58.73346705  62.39093909  62.36563508 118.57751094  61.40804861
  62.36563508  64.59715422  56.08489511  61.40804861  62.36563508
  61.40804861  61.40804861 118.57751094  61.40804861  62.36563508
  62.36563508  62.36563508  59.27400072  62.36563508  61.40804861
  61.40804861  62.36563508 118.57751094  62.36563508  61.40804861
  61.40804861  57.04248159  62.36563508  68.73677457 118.57751094
  62.36563508  61.40804861  62.36563508  57.04248159  64.59715422
  62.36563508  62.36563508  62.36563508 118.57751094  62.36563508
  62.36563508  61.40804861  62.36563508  56.08489511  62.36563508
  61.40804861  61.40804861  62.36563508 118.57751094  62.36563508
  62.36563

### 提出ファイルの作成

In [91]:
submit = pd.read_csv(submit_path,header=None)
submit[1] = pred
submit.to_csv("submit7.csv", index=False, header=False)