In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [2]:
# Every input CSV lives under one directory; keep the prefix in a single place.
DATA_DIR = "data"

# Core competition tables.
TRAIN_CSV_PATH = DATA_DIR + "/train.csv"
TEST_CSV_PATH = DATA_DIR + "/test.csv"

# Supplementary tables.
TRAIN_ADD_CSV_PATH = DATA_DIR + "/train_add.csv"
ADD_2014_CSV_PATH = DATA_DIR + "/2014_add.csv"
STADIUM_CSV_PATH = DATA_DIR + "/stadium.csv"
CONDITION_CSV_PATH = DATA_DIR + "/condition.csv"
CONDITION_ADD_CSV_PATH = DATA_DIR + "/condition_add.csv"

In [3]:
# Load the training CSV into `df`; later cells derive every training frame from it.
df = pd.read_csv(TRAIN_CSV_PATH)

In [4]:
# Preview the first rows to see the raw columns and their formats.
df.head()

Unnamed: 0,id,y,year,stage,match,gameday,time,home,away,stadium,tv
0,13994,18250,2012,Ｊ１,第１節第１日,03/10(土),14:04,ベガルタ仙台,鹿島アントラーズ,ユアテックスタジアム仙台,スカパー／ｅ２／スカパー光／ＮＨＫ総合
1,13995,24316,2012,Ｊ１,第１節第１日,03/10(土),14:04,名古屋グランパス,清水エスパルス,豊田スタジアム,スカパー／ｅ２／スカパー光（Ｊ　ＳＰＯＲＴＳ　４）／ＮＨＫ名古屋
2,13996,17066,2012,Ｊ１,第１節第１日,03/10(土),14:04,ガンバ大阪,ヴィッセル神戸,万博記念競技場,スカパー／ｅ２／スカパー光（Ｊ　ＳＰＯＲＴＳ　１）／ＮＨＫ大阪
3,13997,29603,2012,Ｊ１,第１節第１日,03/10(土),14:06,サンフレッチェ広島,浦和レッズ,エディオンスタジアム広島,スカパー／ｅ２／スカパー光／ＮＨＫ広島
4,13998,25353,2012,Ｊ１,第１節第１日,03/10(土),14:04,コンサドーレ札幌,ジュビロ磐田,札幌ドーム,スカパー／ｅ２／スカパー光（スカイ・Ａ　ｓｐｏｒｔｓ＋）／ＮＨＫ札幌


In [5]:
# Show dtypes and non-null counts for every raw column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 11 columns):
id         1721 non-null int64
y          1721 non-null int64
year       1721 non-null int64
stage      1721 non-null object
match      1721 non-null object
gameday    1721 non-null object
time       1721 non-null object
home       1721 non-null object
away       1721 non-null object
stadium    1721 non-null object
tv         1721 non-null object
dtypes: int64(3), object(8)
memory usage: 148.0+ KB


In [6]:
# Season year: number of rows per year.
df["year"].value_counts()

2012    682
2013    675
2014    364
Name: year, dtype: int64

In [7]:
# Stage: number of rows per value of the `stage` column.
df["stage"].value_counts()

Ｊ２    1046
Ｊ１     675
Name: stage, dtype: int64

In [8]:
# Match date: split strings such as "03/10(土)" into month, day, a one-character
# weekday and an optional one-character holiday marker that follows "・".
gameday_regex = r'(?P<gameday_month>.+)/(?P<gameday_day>.+)\((?P<gameday_weekday>.)・?(?P<gameday_holiday>.?)\)'
numeric_parts = ["gameday_month", "gameday_day"]

train_gameday = df["gameday"].str.extract(gameday_regex, expand=True)
# The regex captures text; month and day are converted to numbers.
train_gameday[numeric_parts] = train_gameday[numeric_parts].apply(pd.to_numeric)

# Append the parsed parts to the raw columns (same row index, so a column-wise concat).
train = pd.concat([df, train_gameday], axis=1, sort=False)
train.head()

Unnamed: 0,id,y,year,stage,match,gameday,time,home,away,stadium,tv,gameday_month,gameday_day,gameday_weekday,gameday_holiday
0,13994,18250,2012,Ｊ１,第１節第１日,03/10(土),14:04,ベガルタ仙台,鹿島アントラーズ,ユアテックスタジアム仙台,スカパー／ｅ２／スカパー光／ＮＨＫ総合,3,10,土,
1,13995,24316,2012,Ｊ１,第１節第１日,03/10(土),14:04,名古屋グランパス,清水エスパルス,豊田スタジアム,スカパー／ｅ２／スカパー光（Ｊ　ＳＰＯＲＴＳ　４）／ＮＨＫ名古屋,3,10,土,
2,13996,17066,2012,Ｊ１,第１節第１日,03/10(土),14:04,ガンバ大阪,ヴィッセル神戸,万博記念競技場,スカパー／ｅ２／スカパー光（Ｊ　ＳＰＯＲＴＳ　１）／ＮＨＫ大阪,3,10,土,
3,13997,29603,2012,Ｊ１,第１節第１日,03/10(土),14:06,サンフレッチェ広島,浦和レッズ,エディオンスタジアム広島,スカパー／ｅ２／スカパー光／ＮＨＫ広島,3,10,土,
4,13998,25353,2012,Ｊ１,第１節第１日,03/10(土),14:04,コンサドーレ札幌,ジュビロ磐田,札幌ドーム,スカパー／ｅ２／スカパー光（スカイ・Ａ　ｓｐｏｒｔｓ＋）／ＮＨＫ札幌,3,10,土,


In [9]:
# Number of rows per weekday character extracted from `gameday`.
train_gameday["gameday_weekday"].value_counts()

土    733
日    711
水    124
金     46
月     43
火     42
木     22
Name: gameday_weekday, dtype: int64

In [10]:
# Number of rows per holiday marker; the empty string means no marker was present.
train_gameday["gameday_holiday"].value_counts()

     1551
祝     130
休      40
Name: gameday_holiday, dtype: int64

In [11]:
# Show dtypes and non-null counts of the parsed date parts.
train_gameday.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 4 columns):
gameday_month      1721 non-null int64
gameday_day        1721 non-null int64
gameday_weekday    1721 non-null object
gameday_holiday    1721 non-null object
dtypes: int64(2), object(2)
memory usage: 53.9+ KB


In [12]:
# Pearson correlation of each numeric column with the target column `y`.
# Numeric/bool columns are selected explicitly: pandas >= 2.0 no longer drops
# text columns inside DataFrame.corr() and raises on them instead.
train.select_dtypes(include=["number", "bool"]).corr()["y"]

id              -0.175626
y                1.000000
year             0.003928
gameday_month    0.107451
gameday_day     -0.007215
Name: y, dtype: float64

In [13]:
# One-hot encode the categorical columns; each becomes a set of
# "<column>_<value>" indicator columns and the source column is removed.
categorical_cols = ['stage', 'home', 'away', 'gameday_weekday', 'gameday_holiday']
dummy_df = pd.get_dummies(train, columns=categorical_cols)
dummy_df.head()

Unnamed: 0,id,y,year,match,gameday,time,stadium,tv,gameday_month,gameday_day,...,gameday_weekday_土,gameday_weekday_日,gameday_weekday_月,gameday_weekday_木,gameday_weekday_水,gameday_weekday_火,gameday_weekday_金,gameday_holiday_,gameday_holiday_休,gameday_holiday_祝
0,13994,18250,2012,第１節第１日,03/10(土),14:04,ユアテックスタジアム仙台,スカパー／ｅ２／スカパー光／ＮＨＫ総合,3,10,...,1,0,0,0,0,0,0,1,0,0
1,13995,24316,2012,第１節第１日,03/10(土),14:04,豊田スタジアム,スカパー／ｅ２／スカパー光（Ｊ　ＳＰＯＲＴＳ　４）／ＮＨＫ名古屋,3,10,...,1,0,0,0,0,0,0,1,0,0
2,13996,17066,2012,第１節第１日,03/10(土),14:04,万博記念競技場,スカパー／ｅ２／スカパー光（Ｊ　ＳＰＯＲＴＳ　１）／ＮＨＫ大阪,3,10,...,1,0,0,0,0,0,0,1,0,0
3,13997,29603,2012,第１節第１日,03/10(土),14:06,エディオンスタジアム広島,スカパー／ｅ２／スカパー光／ＮＨＫ広島,3,10,...,1,0,0,0,0,0,0,1,0,0
4,13998,25353,2012,第１節第１日,03/10(土),14:04,札幌ドーム,スカパー／ｅ２／スカパー光（スカイ・Ａ　ｓｐｏｒｔｓ＋）／ＮＨＫ札幌,3,10,...,1,0,0,0,0,0,0,1,0,0


In [14]:
# Correlation of every numeric or indicator column with the target column `y`.
# Columns are selected explicitly because pandas >= 2.0 raises on the remaining
# text columns instead of dropping them; "bool" is included because newer
# pandas emits boolean dummy columns rather than integer ones.
dummy_df.select_dtypes(include=["number", "bool"]).corr()["y"]

id                  -0.175626
y                    1.000000
year                 0.003928
gameday_month        0.107451
gameday_day         -0.007215
stage_Ｊ１             0.668721
stage_Ｊ２            -0.668721
home_アビスパ福岡         -0.107548
home_アルビレックス新潟       0.266775
home_カターレ富山         -0.135049
home_カマタマーレ讃岐       -0.064322
home_ガイナーレ鳥取        -0.122904
home_ガンバ大阪           0.052224
home_ギラヴァンツ北九州      -0.144699
home_コンサドーレ札幌        0.002829
home_サガン鳥栖           0.035922
home_サンフレッチェ広島       0.104590
home_ザスパクサツ群馬       -0.111884
home_ザスパ草津          -0.090329
home_ジェフユナイテッド千葉    -0.024496
home_ジュビロ磐田          0.010005
home_セレッソ大阪          0.157785
home_ファジアーノ岡山       -0.049701
home_ベガルタ仙台          0.088980
home_モンテディオ山形       -0.076348
home_ロアッソ熊本         -0.094660
home_ヴァンフォーレ甲府       0.025006
home_ヴィッセル神戸         0.052722
home_京都サンガF.C.      -0.064781
home_名古屋グランパス        0.102889
                       ...   
away_大分トリニータ        -0.040891
away_大宮アルディージャ       0.102431
away_川崎フロン

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
# Target vector: an independent copy of the `y` column.
y = dummy_df["y"].copy()

# Feature frame: everything except the target and the free-text columns.
# NOTE: this rebinds `train`, which an earlier cell used for the un-encoded frame.
non_feature_cols = ["y", "match", "gameday", "time", "stadium", "tv"]
train = dummy_df.drop(columns=non_feature_cols)

In [17]:
def convertData(dataframe, isTrain=True, columns=None):
    """Convert a raw match table into a model-ready feature frame.

    Steps, in order:
      1. Parse ``gameday`` (e.g. ``"03/10(土)"``) into ``gameday_month``,
         ``gameday_day``, ``gameday_weekday`` and ``gameday_holiday``.
      2. One-hot encode ``stage``, ``home``, ``away``, ``gameday_weekday``
         and ``gameday_holiday``.
      3. Drop the text columns ``match``, ``gameday``, ``time``, ``stadium``
         and ``tv``.
      4. Drop the target column ``y`` when ``isTrain`` is true.
      5. Optionally align the result to a reference column layout.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw table containing at least the columns named above (and ``y``
        when ``isTrain`` is true). The input frame is not modified.
    isTrain : bool, default True
        When true, the ``y`` column is removed from the result.
    columns : iterable of str, optional
        Reference column layout. The one-hot columns produced here depend on
        which category values occur in ``dataframe``, so two inputs (for
        example a train and a test table) can yield different columns. When
        given, the result is reindexed to exactly these columns, in this
        order; columns missing from the result are added and filled with 0,
        and columns not listed are discarded. ``None`` (the default) keeps
        the columns as produced.

    Returns
    -------
    pandas.DataFrame
        The feature frame, with the same row index as ``dataframe``.

    Raises
    ------
    KeyError
        If one of the columns to drop (including ``y`` when ``isTrain`` is
        true) is not present.
    """
    gameday_pattern = r'(?P<gameday_month>.+)/(?P<gameday_day>.+)\((?P<gameday_weekday>.)・?(?P<gameday_holiday>.?)\)'
    numeric_parts = ["gameday_month", "gameday_day"]

    gameday_parts = dataframe["gameday"].str.extract(gameday_pattern, expand=True)
    # The regex captures text; month and day are converted to numbers.
    gameday_parts[numeric_parts] = gameday_parts[numeric_parts].apply(pd.to_numeric)

    features = pd.concat([dataframe, gameday_parts], axis=1, sort=False)
    features = pd.get_dummies(features, columns=['stage', 'home', 'away', 'gameday_weekday', 'gameday_holiday'])
    features = features.drop(["match", "gameday", "time", "stadium", "tv"], axis=1)
    if isTrain:
        features = features.drop("y", axis=1)
    if columns is not None:
        # Align to the reference layout so separately converted frames line up.
        features = features.reindex(columns=list(columns), fill_value=0)
    return features

In [18]:
# Build the training feature frame from the raw table (isTrain defaults to True, so `y` is dropped).
x_train = convertData(df)

In [19]:
# Preview the encoded training features.
x_train.head()

Unnamed: 0,id,year,gameday_month,gameday_day,stage_Ｊ１,stage_Ｊ２,home_アビスパ福岡,home_アルビレックス新潟,home_カターレ富山,home_カマタマーレ讃岐,...,gameday_weekday_土,gameday_weekday_日,gameday_weekday_月,gameday_weekday_木,gameday_weekday_水,gameday_weekday_火,gameday_weekday_金,gameday_holiday_,gameday_holiday_休,gameday_holiday_祝
0,13994,2012,3,10,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,13995,2012,3,10,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2,13996,2012,3,10,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,13997,2012,3,10,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,13998,2012,3,10,1,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [20]:
# Load the test CSV into `test_df`.
test_df = pd.read_csv(TEST_CSV_PATH)

In [21]:
# Preview the raw test rows.
test_df.head()

Unnamed: 0,id,year,stage,match,gameday,time,home,away,stadium,tv
0,15822,2014,Ｊ１,第１８節第１日,08/02(土),19:04,ベガルタ仙台,大宮アルディージャ,ユアテックスタジアム仙台,スカパー！／スカパー！プレミアムサービス
1,15823,2014,Ｊ１,第１８節第１日,08/02(土),18:34,鹿島アントラーズ,サンフレッチェ広島,県立カシマサッカースタジアム,スカパー！／スカパー！プレミアムサービス
2,15824,2014,Ｊ１,第１８節第１日,08/02(土),19:04,浦和レッズ,ヴィッセル神戸,埼玉スタジアム２００２,スカパー！／スカパー！プレミアムサービス／ＮＨＫ　ＢＳ１／テレ玉
3,15825,2014,Ｊ１,第１８節第１日,08/02(土),19:03,柏レイソル,川崎フロンターレ,日立柏サッカー場,スカパー！／スカパー！プレミアムサービス
4,15827,2014,Ｊ１,第１８節第１日,08/02(土),19:03,アルビレックス新潟,セレッソ大阪,デンカビッグスワンスタジアム,スカパー！／スカパー！プレミアムサービス


In [22]:
# Build the test feature frame; isTrain=False skips dropping `y`, which the test table does not have.
x_test = convertData(test_df, isTrain=False)

In [23]:
# Preview the encoded test features.
x_test.head()

Unnamed: 0,id,year,gameday_month,gameday_day,stage_Ｊ１,stage_Ｊ２,home_アビスパ福岡,home_アルビレックス新潟,home_カターレ富山,home_カマタマーレ讃岐,...,away_ＦＣ東京,away_Ｖ・ファーレン長崎,gameday_weekday_土,gameday_weekday_日,gameday_weekday_月,gameday_weekday_水,gameday_weekday_火,gameday_weekday_金,gameday_holiday_,gameday_holiday_祝
0,15822,2014,8,2,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
1,15823,2014,8,2,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,15824,2014,8,2,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,15825,2014,8,2,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,15827,2014,8,2,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0


In [24]:
# Column names of the test features, in order; used to align the training frame.
col_list = x_test.columns.tolist()

In [25]:
# Align the training features to the test layout (same columns, same order).
# reindex is used instead of plain selection so that a dummy column present only
# in the test set is added as all zeros instead of raising KeyError.
x_train = x_train.reindex(columns=col_list, fill_value=0)

In [27]:
# Fit a linear regression of the target `y` on the aligned training features.
lin_reg = LinearRegression()
lin_reg.fit(x_train, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [36]:
# Predict the target for every test row.
y_result = lin_reg.predict(x_test)
# Index the predictions like x_test: concat joins on index labels, so a fresh
# 0..n-1 index would misalign ids and predictions if x_test's index were not
# the default range.
predictions = pd.Series(y_result, index=x_test.index)
result = pd.concat([x_test["id"], predictions], axis=1, sort=False)
result.head()

Unnamed: 0,id,0
0,15822,14913.981093
1,15823,18577.109936
2,15824,35162.633091
3,15825,14848.095298
4,15827,28953.056352


In [41]:
# Write the submission file: comma-separated, UTF-8, with no header row and no index column.
result.to_csv('submit1.csv', sep=',', encoding='utf-8', index=False, header=False)

RMSE = 3,824.18052 でした！