In [1]:
import re
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingRegressor

In [2]:
# Load the base and supplementary training CSVs as a single frame, plus the test set.
train_df = pd.concat([pd.read_csv('../input/train.csv'),
                      pd.read_csv('../input/train_add.csv')])
test_df = pd.read_csv('../input/test.csv')

n_train, n_test = len(train_df), len(test_df)

# Condition tables (base + supplementary, joined on `id` below) and the stadium
# table (joined on the stadium name below).
condition_df = pd.concat([pd.read_csv('../input/condition.csv'),
                          pd.read_csv('../input/condition_add.csv')])
stadium_df = pd.read_csv('../input/stadium.csv')

# Combine the training data, the test data and the additional tables.
all_df = pd.concat([train_df, test_df])
all_df = pd.merge(all_df, condition_df, on='id')
all_df = pd.merge(all_df, stadium_df, how='left', left_on='stadium', right_on='name')

# Later cells split train/test by position (df[:n_train] / df[n_train:]) and pair
# the training rows with train_df['y'], so the merges must neither drop,
# duplicate nor reorder rows. Fail loudly here instead of silently misaligning.
assert len(all_df) == n_train + n_test, (
    'merging changed the row count: expected {}, got {}'.format(n_train + n_test, len(all_df)))
assert np.array_equal(all_df['id'].values[:n_train], train_df['id'].values), (
    'training rows are no longer the first n_train rows in their original order')

# Index by match id; the `id` column itself is kept.
all_df.index = all_df['id']

all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2266 entries, 13994 to 16436
Data columns (total 44 columns):
away           2266 non-null object
gameday        2266 non-null object
home           2266 non-null object
id             2266 non-null int64
match          2266 non-null object
stadium        2266 non-null object
stage          2266 non-null object
time           2266 non-null object
tv             2266 non-null object
y              1953 non-null float64
year           2266 non-null int64
home_score     2266 non-null int64
away_score     2266 non-null int64
weather        2266 non-null object
temperature    2266 non-null float64
humidity       2266 non-null object
referee        2266 non-null object
home_team      2266 non-null object
home_01        2266 non-null object
home_02        2266 non-null object
home_03        2266 non-null object
home_04        2266 non-null object
home_05        2266 non-null object
home_06        2266 non-null object
home_07        2266 non-nu

## 前処理

In [3]:
# --- Match round ---
# Round number: the integer between the leading character of `match` and the first '節'.
all_df['match_sec'] = all_df['match'].apply(lambda x: int(x[1:].split('節')[0]))
# Match day: the integer after the last '第', ignoring the final character of the label.
all_df['match_day'] = all_df['match'].apply(lambda x: int(x[:-1].rsplit('第', 1)[-1]))

# --- Date and kick-off time ---
# Month: the integer before the first '/' of `gameday`.
all_df['gamemonth'] = all_df['gameday'].apply(lambda x: int(x.split('/', 1)[0]))
# Day label: the text after the last '(' with the final character dropped.
all_df['weekday'] = all_df['gameday'].apply(lambda x: x[:-1].rsplit('(', 1)[-1])
# 1 when the day label is exactly '土' or '日', or mentions '祝' or '休'.
all_df['holiday'] = all_df['weekday'].apply(
    lambda x: int(x in ('土', '日') or '祝' in x or '休' in x))
# Hour: the integer before the first ':' of `time`.
all_df['time_hour'] = all_df['time'].apply(lambda x: int(x.split(':')[0]))

# --- TV ---
# A broadcaster entry containing any of these markers is not counted as terrestrial.
_NON_TERRESTRIAL_MARKERS = ('ＢＳ－', 'スカパー', 'ＢＳ１', 'ｅ２')
# 1 when at least one '／'-separated broadcaster contains none of the markers.
all_df['terrestrial'] = all_df['tv'].apply(
    lambda x: int(any(all(marker not in tv for marker in _NON_TERRESTRIAL_MARKERS)
                      for tv in x.split('／'))))
# 1 when the listing contains the '（録）' tag.
all_df['recorded'] = all_df['tv'].apply(lambda x: int('（録）' in x))

# --- Weather ---
# `rain`: the weather label is exactly '雨'; `precipitation`: the label mentions '雨' anywhere.
all_df['rain'] = all_df['weather'].apply(lambda x: int(x == '雨'))
all_df['precipitation'] = all_df['weather'].apply(lambda x: int('雨' in x))

# --- Indoor ---
all_df['indoor'] = all_df['weather'].apply(lambda x: int(x == '屋内'))

# --- Prefecture ---
# Splitting on the first of 都/道/府/県 truncates names that contain one of those
# characters themselves ('京都府' -> '京', '北海道' -> '北海'). Match the full
# prefecture name at the start of the address instead.
_PREFECTURE_RE = re.compile(r'東京都|北海道|(?:京都|大阪)府|.{2,3}?県')


def _extract_prefecture(address):
    """Return the prefecture name (with its 都/道/府/県 suffix) that starts `address`.

    If the address does not start with a recognisable prefecture, fall back to
    the text before the first 都/道/府/県 (or the whole string when none occurs).
    """
    found = _PREFECTURE_RE.match(address)
    if found:
        return found.group(0)
    return re.split(r'都|道|府|県', address)[0]


all_df['prefecture'] = all_df['address'].apply(_extract_prefecture)

- カテゴリデータをダミー変数に変換

In [4]:
# Columns fed to the models unchanged.
val_cols = ['match_sec', 'gamemonth', 'holiday', 'time_hour', 'terrestrial', 'recorded',
            'rain', 'precipitation', 'indoor', 'home_score', 'away_score', 'capa',]
# Categorical columns, one-hot encoded below.
cat_cols = ['weekday', 'stage', 'home', 'away', 'stadium', 'weather', 'prefecture']

df = all_df[val_cols].join(pd.get_dummies(all_df[cat_cols]))

# Relies on all_df having been built as training rows followed by test rows:
# the first n_train rows of df are paired positionally with train_df['y'].
X = df[:n_train].values
y = train_df['y'].values

# Fixed seed so the 70/30 hold-out split (and every score computed on it) is
# identical on each Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

## モデルの選択と学習

- いくつかモデルを試してみる

In [5]:
# Baseline: gradient boosting with default hyper-parameters. The seed is fixed
# because the estimator is otherwise randomised and the scores would change from
# run to run.
model = GradientBoostingRegressor(random_state=0)
# The scorer returns negated MSE per fold; flip the sign and take the root for RMSE.
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(-scores)
print('cv score  : {:.5f} +/- {:.5f}'.format(scores.mean(), scores.std()))

# Refit on the whole training split and report RMSE on the held-out 30 %.
model.fit(X_train, y_train)
print('test score: {:.5f}'.format(np.sqrt(mean_squared_error(y_val, model.predict(X_val)))))

cv score  : 3324.69088 +/- 378.87829
test score: 3808.77026


- 線形回帰はデータのスケーリングを行ったほうがよさげ

In [6]:
# Linear baseline: robust scaling followed by an L1-regularised regression.
model = make_pipeline(
    RobustScaler(),
    Lasso(alpha=10.0, max_iter=10000),
)

# Per-fold MSE (the scorer yields the negated value), converted to RMSE.
fold_mse = -cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(fold_mse)
print('cv score  : {:.5f} +/- {:.5f}'.format(np.mean(scores), np.std(scores)))

# Refit on the full training split, then score the held-out part.
model.fit(X_train, y_train)
holdout_rmse = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
print('test score: {:.5f}'.format(holdout_rmse))

cv score  : 3291.98954 +/- 428.18300
test score: 3729.10982


### ステージ(J1, J2)でモデルを分割
- J1とJ2で観客動員数の分布が異なっていたので，モデルを分けて学習する

In [7]:
# Training portion of the feature frame and the matching raw stage labels.
train_features = df[:n_train]
train_stage = train_df['stage'].values
train_target = train_df['y'].values

# J1 subset: features selected through the one-hot stage column, targets through
# the raw stage label of train_df.
X_J1 = train_features[train_features['stage_Ｊ１'] == 1].values
y_J1 = train_target[train_stage == 'Ｊ１']
n_J1 = len(X_J1)

# J2 subset, built the same way.
X_J2 = train_features[train_features['stage_Ｊ２'] == 1].values
y_J2 = train_target[train_stage == 'Ｊ２']
n_J2 = len(X_J2)

In [8]:
# One gradient-boosting model per stage. Seeds are fixed because the estimator
# is otherwise randomised and the CV scores would differ between runs.
model_J1 = GradientBoostingRegressor(random_state=0)
# scores_J1 / scores_J2 hold the per-fold MSE (sign of the scorer output flipped).
scores_J1 = -cross_val_score(model_J1, X_J1, y_J1, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(scores_J1)
print('J1 cv score  : {:.5f} +/- {:.5f}'.format(scores.mean(), scores.std()))

model_J2 = GradientBoostingRegressor(random_state=0)
scores_J2 = -cross_val_score(model_J2, X_J2, y_J2, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(scores_J2)
print('J2 cv score  : {:.5f} +/- {:.5f}'.format(scores.mean(), scores.std()))

# Overall RMSE per fold: fold i of J1 is pooled with fold i of J2, weighting each
# stage's MSE by its number of training samples before taking the root.
total = np.sqrt((scores_J1 * n_J1 + scores_J2 * n_J2) / (n_J1 + n_J2))
print('total cv score  : {:.5f} +/- {:.5f}'.format(total.mean(), total.std()))

J1 cv score  : 4756.84703 +/- 851.81560
J2 cv score  : 2010.82720 +/- 225.97153
total cv score  : 3366.44137 +/- 552.53040


In [9]:
# Stage-specific Lasso pipelines; each stage uses its own regularisation strength.
model_J1 = make_pipeline(
    RobustScaler(),
    Lasso(alpha=20.0, max_iter=10000),
)
# Per-fold MSE for J1 (sign of the scorer output flipped), reported as RMSE.
scores_J1 = -cross_val_score(model_J1, X_J1, y_J1, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(scores_J1)
print('J1 cv score  : {:.5f} +/- {:.5f}'.format(np.mean(scores), np.std(scores)))

model_J2 = make_pipeline(
    RobustScaler(),
    Lasso(alpha=3.5, max_iter=10000),
)
# Per-fold MSE for J2, reported as RMSE.
scores_J2 = -cross_val_score(model_J2, X_J2, y_J2, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(scores_J2)
print('J2 cv score  : {:.5f} +/- {:.5f}'.format(np.mean(scores), np.std(scores)))

# Pool the two stages fold by fold, weighting each MSE by its stage's sample count.
n_total = n_J1 + n_J2
pooled_mse = (scores_J1 * n_J1 + scores_J2 * n_J2) / n_total
total = np.sqrt(pooled_mse)
print('total cv score  : {:.5f} +/- {:.5f}'.format(np.mean(total), np.std(total)))

J1 cv score  : 4676.28616 +/- 876.80993
J2 cv score  : 2000.66681 +/- 209.32794
total cv score  : 3319.93063 +/- 553.46979


## Averaging
- 各モデルの予測を平均する

In [10]:
from sklearn.base import BaseEstimator, RegressorMixin

class AverageRegressor(BaseEstimator, RegressorMixin):
    """Ensemble that averages the predictions of several regressors.

    Parameters
    ----------
    regressors : list
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. They are fitted
        in place by :meth:`fit`.
    """

    def __init__(self, regressors):
        self.regressors = regressors

    def fit(self, X, y):
        """Fit every wrapped regressor on the same ``(X, y)``.

        Returns
        -------
        self : AverageRegressor
            The fitted ensemble, so calls can be chained as with scikit-learn
            estimators.
        """
        for reg in self.regressors:
            reg.fit(X, y)
        return self

    def predict(self, X, weights=None):
        """Return the (optionally weighted) average of the regressors' predictions.

        Parameters
        ----------
        X : array-like
            Samples, passed unchanged to each regressor's ``predict``.
        weights : sequence of float, optional
            One weight per regressor. ``None`` (default) weights all regressors
            equally, i.e. a plain mean.

        Returns
        -------
        numpy.ndarray
            ``sum(w_i * pred_i) / sum(w_i)``, computed element-wise.

        Raises
        ------
        AssertionError
            If ``weights`` does not have one entry per regressor.
        ZeroDivisionError
            If the weights sum to zero (raised by ``numpy.average``).
        """
        if weights is None:
            weights = np.ones(len(self.regressors))
        assert len(weights) == len(self.regressors)
        predictions = [reg.predict(X) for reg in self.regressors]
        # Normalise by the sum of the weights, not by the number of regressors:
        # the plain mean of w_i * pred_i is only correct when every weight is 1.
        return np.average(predictions, axis=0, weights=weights)

In [11]:
# Per-stage ensembles: gradient boosting averaged with the stage's Lasso pipeline.
# The boosting seed is fixed because that estimator is otherwise randomised and
# the CV scores would differ between runs.
model_J1 = AverageRegressor([GradientBoostingRegressor(random_state=0),
                             make_pipeline(RobustScaler(), Lasso(alpha=20.0, max_iter=10000))])
# scores_J1 / scores_J2 hold the per-fold MSE (sign of the scorer output flipped).
scores_J1 = -cross_val_score(model_J1, X_J1, y_J1, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(scores_J1)
print('J1 cv score  : {:.5f} +/- {:.5f}'.format(scores.mean(), scores.std()))

model_J2 = AverageRegressor([GradientBoostingRegressor(random_state=0),
                             make_pipeline(RobustScaler(), Lasso(alpha=3.5, max_iter=10000))])
scores_J2 = -cross_val_score(model_J2, X_J2, y_J2, cv=5, scoring='neg_mean_squared_error')
scores = np.sqrt(scores_J2)
print('J2 cv score  : {:.5f} +/- {:.5f}'.format(scores.mean(), scores.std()))

# Overall RMSE per fold: pool fold i of both stages, weighting each stage's MSE
# by its number of training samples before taking the root.
total = np.sqrt((scores_J1 * n_J1 + scores_J2 * n_J2) / (n_J1 + n_J2))
print('total cv score  : {:.5f} +/- {:.5f}'.format(total.mean(), total.std()))

J1 cv score  : 4559.94366 +/- 899.49349
J2 cv score  : 1941.64781 +/- 225.32535
total cv score  : 3233.55488 +/- 574.41956


- テストデータを予測

In [12]:
# Final per-stage ensembles, trained on all training rows of each stage. The
# boosting seed is fixed so the submission can be regenerated exactly.
model_J1 = AverageRegressor([GradientBoostingRegressor(random_state=0),
                             make_pipeline(RobustScaler(), Lasso(alpha=20.0, max_iter=10000))])
model_J2 = AverageRegressor([GradientBoostingRegressor(random_state=0),
                             make_pipeline(RobustScaler(), Lasso(alpha=3.5, max_iter=10000))])

model_J1.fit(X_J1, y_J1)
model_J2.fit(X_J2, y_J2)

# Rows after the first n_train are the test rows; route each one to the model of
# its stage. .copy() makes the subsets independent frames, so adding the 'y'
# column below does not write into a slice of df (SettingWithCopyWarning).
test_features = df[n_train:]
df_J1_test = test_features[test_features['stage_Ｊ１'] == 1].copy()
df_J2_test = test_features[test_features['stage_Ｊ２'] == 1].copy()

# .values is evaluated before 'y' is attached, so the models only see feature columns.
df_J1_test['y'] = model_J1.predict(df_J1_test.values)
df_J2_test['y'] = model_J2.predict(df_J2_test.values)

In [13]:
# Stack the predictions of both stage models and sort by the frame index
# (set to the match id when all_df was built).
df_test = pd.concat([df_J1_test[['y']], df_J2_test[['y']]]).sort_index()
# Write index,y rows without a header line. header=False is the documented way to
# omit it; header=None only worked because None happens to be falsy.
df_test.to_csv('../submit/split_stage.csv', header=False)