In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# データ読込

In [2]:
# CSVファイルを読み込む
train = pd.read_csv('./train.csv', encoding='utf-8')
test = pd.read_csv('./test.csv', encoding='utf-8')
condition = pd.read_csv('./condition.csv', encoding='utf-8')
stadium = pd.read_csv('./stadium.csv', encoding='utf-8')
sample = pd.read_csv('./sample_submit.csv', encoding='utf-8')

# データ結合(不足分)

In [3]:
# trainとtrain_addを結合
train_add = pd.read_csv('./train_add.csv', encoding='utf-8')
train = train.append(train_add, ignore_index=True)

# conditionとcondition_addを結合
condition_add = pd.read_csv('./condition_add.csv', encoding='utf-8')
condition = condition.append(condition_add, ignore_index=True)

# 質的変数を数値に置換

In [4]:
# weatherを置換(降雨：１，その他：０)
def rain(n):
    if ('雨' in n):
         return 1
    else:
        return 0
condition['weather'] = condition['weather'].map(rain)

# スタジアムの住所を置換
#def address(n):
#    return n[:3]
#stadium['address_no'] = pd.factorize(stadium['address'].map(address))[0]

# チーム番号を振る
# チームの表を取得
team = []
team = pd.DataFrame(data=(list(set(condition['home_team']))))
# チーム番号を振る
team['team_no'] = pd.factorize(team[0])[0]
# チーム番号表を結合
tmp = pd.merge(condition, team, left_on='home_team', right_on=0, how='left')
condition = pd.merge(tmp, team, left_on='away_team', right_on=0, how='left')

# humidityを置換
condition['humidity'] = pd.factorize(condition['humidity'])[0]
# refereeを置換
condition['referee'] = pd.factorize(condition['referee'])[0]

# 曜日を置換(休日：1，平日：0)
import re
def day(n):
    if ('土' in n or '日' in n or '祝' in n or '休' in n):
         return 1
    else:
        return 0
train['gameday'] = train['gameday'].map(day)
test['gameday'] = test['gameday'].map(day)

# 試合開始時間ごとに数値に置き換える
import datetime
def date(n):
    date = datetime.datetime.strptime(n, '%H:%M')
    #if (date.hour >= 19):
    #    return 0
    #elif(date.hour >= 15):
    #    return 1
   # else:
    #    return 2
    return date.hour
train['time'] = list(map(date, train['time']))
test['time'] = list(map(date, test['time']))

# 試合節
def match(n):
    num = int(re.sub('\\D', '', n)[:-1])
    #if(num < 100):
    #    return 0
    #elif(num <200):
    #    return 1
    #elif(num <300):
    #    return 2
    #elif(num <400):
    #    return 3
    #else:
    #    return 4
    return num

train['match'] = train['match'].map(match)
test['match'] = test['match'].map(match)
def match_id(match, year, stage):
    tmp = str(year) + str(stage) + str(match)
    return int(tmp)

# stageを数値に置き換える
train['stage'] = train['stage'].replace(['Ｊ１','Ｊ２'],[1,2])
test['stage'] = test['stage'].replace(['Ｊ１','Ｊ２'],[1,2])
# スタジアムの場所(都道府県)
def address(n):
    return n[:3]
stadium['address_no'] = pd.factorize(stadium['address'].map(address))[0]

# データ調整

In [5]:
# 無観客試合を除く
train = train[train['y'] != 0]
# 前半戦の試合を除外する
train[train['match'] != 0]
# チームごとに入場者数を調整
def home_updown(team, y):
    if ('浦和レッズ' in team):
        y -= 5000
    if (y <= 0):
        y = 0
    return y
def away_updown(team, y):
    if ('浦和レッズ' in team):
        y -= 2000
    if (y <= 0):
        y = 0
    return y
train['y'] = list(map(home_updown, train['home'], train['y']))
#train['y'] = list(map(away_updown, train['away'], train['y']))

# データ結合

In [6]:
# trainとconditionを対戦カードidを基準に結合
tmp1 = pd.merge(train, condition, on='id', how='left')

# testとconditionを対戦カードidを基準に結合
tmp2 = pd.merge(test, condition, on='id', how='left')

In [7]:
# 結合
train_new = pd.merge(tmp1, stadium, left_on='stadium', right_on='name', how='left')
test_new = pd.merge(tmp2, stadium, left_on='stadium', right_on='name', how='left')

# 線形回帰モデル

In [8]:
from sklearn.model_selection import train_test_split

# 変数選択(目的変数, ステージ, 気温, 収容人数, 天気, ホームチーム)
lm_train = train_new.loc[:, ['y', 'stage', 'capa', 'team_no_x', 'team_no_y', 'referee', 'gameday', 'weather', 'match', 'time']]
lm_test = test_new.loc[:, ['stage', 'capa', 'team_no_x', 'team_no_y', 'referee', 'gameday', 'weather', 'match', 'time']]
# 目的変数と説明変数に分ける
y = np.array(lm_train['y'])
X = np.array(lm_train.drop('y', axis=1))

In [9]:
from sklearn.model_selection import KFold
import statsmodels.api as sm


# クロスバリデーション
cv = KFold(n_splits=5)
sub_pred = np.zeros(len(lm_test))
sub_R2 = 0
for train_index, valid_index in cv.split(X, y):
    train_X = X[train_index]
    valid_X = X[valid_index]
    train_y = y[train_index]
    valid_y = y[valid_index]
    # 回帰モデル作成
    model = sm.OLS(train_y, sm.add_constant(train_X)).fit()
    # trainで予測
    y_pred = model.predict(sm.add_constant(valid_X))
    # 決定係数を算出
    se = np.sum(np.square(valid_y-y_pred))
    st = np.sum(np.square(valid_y-np.mean(valid_y)))
    R2 = 1-se/st
    print('決定係数：', R2)
    sub_R2 += R2
    print(model.summary())
    
    # 予測値を算出
    test_pred = model.predict(sm.add_constant(lm_test))
    # 変数に格納する
    sub_pred += test_pred

# 予測値の平均を出す
sub_pred /= 5
sub_R2 /= 5
print('決定係数(平均)：',sub_R2)

決定係数： 0.7004561099195401
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.640
Model:                            OLS   Adj. R-squared:                  0.638
Method:                 Least Squares   F-statistic:                     307.0
Date:                Thu, 20 Aug 2020   Prob (F-statistic):               0.00
Time:                        15:36:59   Log-Likelihood:                -15324.
No. Observations:                1561   AIC:                         3.067e+04
Df Residuals:                    1551   BIC:                         3.072e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.479e+04   127

  return ptp(axis=axis, out=out, **kwargs)


In [10]:
# 相関を計算
corr = lm_test.corr()

# 0でない変数が相関している可能性あり
(np.sign(model.params[1:]) - np.sign(corr['gameday'])).dropna()

stage       -2.0
capa         2.0
team_no_x    0.0
team_no_y    0.0
referee     -2.0
gameday      0.0
weather      0.0
match        0.0
time         0.0
Name: gameday, dtype: float64

In [11]:
# 残差確認
pred = model.predict(sm.add_constant(lm_train.drop('y', axis=1)))
pred = np.abs(pred)
ei = []
ei = lm_train['y'] - pred
tmp2 = lm_train
lm_train['zan'] = pred - lm_train['y']
lm_train['pred'] = pred

print(lm_train.query('zan >= 15000').sort_values('zan',ascending=False))
print(lm_train.query('zan <= -15000').sort_values('zan',ascending=False))

Empty DataFrame
Columns: [y, stage, capa, team_no_x, team_no_y, referee, gameday, weather, match, time, zan, pred]
Index: []
          y  stage   capa  team_no_x  team_no_y  referee  gameday  weather  \
133   36723      1  47816         15          2       17        1        1   
1448  35533      1  42300         20         32        2        1        0   
130   35506      1  42300         20         29       10        1        0   
895   42790      1  63700          2         24       17        1        1   
1822  41649      1  63700          2         29       14        1        1   
917   40371      1  49970         19         15       12        1        0   
1359  37079      1  47816         15         31       17        1        1   
1417  40761      1  49970         19         15       17        1        0   
270   46879      1  63700          2         30        8        1        0   
1442  38966      1  40000         30         15        6        1        0   
1408  42723      

In [12]:
# CSVファイル出力
# 予測スコアを追加する
out = pd.DataFrame({'id':test['id'], 'y':sub_pred})

import csv
# ファイル出力
out.to_csv('submit_0628_1_lm.csv', index=False, header=False)

In [13]:
import csv
from pandas.util.testing import assert_frame_equal
from unittest import TestCase

# 作成したCSVファイルを読み込む
submit = pd.read_csv('./submit_0628_1_lm.csv')

# データの長さが一致しているか確認する
assert len(sample) == len(submit), 'The amount of data do not match'

# idが一致しているか確認する
assert sample.columns[0] == submit.columns[0], '"id" mismatch'

#2列目の列名「y」が入っていないか検証
assert 'y' != submit.columns[0],  'index mismatch'

### 以下は実装途中(チーム順位を求めようとしています）

In [14]:
condition = pd.read_csv('./condition.csv', encoding='utf-8')
# チーム番号を振る
# チームの表を取得
team = []
team = pd.DataFrame(data=(list(set(condition['home_team']))))
# チーム番号を振る
team['team_no'] = pd.factorize(team[0])[0]
# 結合
tmp = pd.merge(condition, team, left_on='home_team', right_on=0, how='left')
condition = pd.merge(tmp, team, left_on='away_team', right_on=0, how='left')

# 点数を勝ち点に変換
def team_point(home_point, away_point):
    home_score = []
    away_score = []
    for home, away in zip(home_point, away_point):
        if home > away:
            home_score.append(3)
            away_score.append(0)
        elif home == away:
            home_score.append(1)
            away_score.append(1)
        elif home < away:
            home_score.append(0)
            away_score.append(3)
    return home_score, away_score

condition['home_score'], condition['away_score'] = team_point(condition['home_score'], condition['away_score'])

In [15]:
# trainとconditionを対戦カードidを基準に結合
train = pd.merge(train, condition, on='id', how='left')

In [16]:
# 試合節、試合開催年、ステージを合わせたmatch_IDを作成
def match_id(match, year, stage):
    tmp = str(year) + str(stage) + str(match)
    return int(tmp)

train['match_ID'] = list(map(match_id, train['match'], train['year'], train['stage']))

# 試合ごとの点数を集計
ranks = pd.DataFrame(index=train['id'], columns=team[0])
ranks['match_ID'] = train['match_ID']
ranks.head(3)

Unnamed: 0_level_0,湘南ベルマーレ,ガンバ大阪,浦和レッズ,ガイナーレ鳥取,アビスパ福岡,ロアッソ熊本,ＦＣ岐阜,川崎フロンターレ,ＦＣ町田ゼルビア,サガン鳥栖,...,柏レイソル,ファジアーノ岡山,ジェフユナイテッド千葉,Ｖ・ファーレン長崎,カターレ富山,ヴァンフォーレ甲府,横浜Ｆ・マリノス,横浜ＦＣ,京都サンガF.C.,match_ID
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13994,,,,,,,,,,,...,,,,,,,,,,
13995,,,,,,,,,,,...,,,,,,,,,,
13996,,,,,,,,,,,...,,,,,,,,,,


### ranksに足し上げられた勝ち点を入れていこうとしています

In [17]:
# match→試合節、ranks→チーム一覧、
# 試合節ごとに渡す？
# scoreには事前に前のチームのスコアを足しておく
# ID_be=前の試合節のID　ID＝今の試合節のID
# home_team=team,score
# away_team=team,score

# ranks表に勝ち点を入れていく
def input_score(home_score, away_score, home_team, away_team, ID_be, ID):
    
    # 前の試合節の点数を取得
    for IDs in ID_be:
        # シーズン初めての試合の場合はスコアを0にする
        if IDs == NaN:
            score_h = 0
            score_a = 0
            break
        # 前の試合節の得点を変数に格納
        if ranks.loc[IDs, home_team] in r'\d':
            home_team['score'] = ranks.loc[IDs, home_team['team']]
        elif ranks.loc[IDs, away_team] in r'\d':
            away_team['score'] = ranks.loc[IDs, away_team['team']]
            
    # 得点を足す
    for h, a in zip(home_team, away_team):
        # ranks表に勝ち点を追加していく
        ranks.loc[i, h] = home_score + sore_h
        ranks.loc[i, a] = away_score +score_a
            
    return

# 試合節ごとにinput_scoreに渡す
def sum_score(home_score, away_score, home_team, away_team, ID, match):
    
    return

In [18]:
# ランク付け

def ranks(match, home_team, away_team, home_score, away_score, ranking):
    # チーム表
    # 試合節ごとにループを回す
    for match in match:
        for index in range(len(match)):
            # チームに得点を追加する
            if match[index] != match[index+1]:
                break
        if match[index] > match[index+1]:
            break
            
            
# 試合節ごとのチームランクを返す
    return ranking
from scipy import stats

# ↓参考用のコピペ
test= np.array([2,4,3,5,8,8,16])
df=pd.DataFrame(test, columns=(['data']))
df = df.assign(ranking=len(df.data)-stats.mstats.rankdata(df.data)+1)#順位を作成してdfに追加　rankだとデフォルトのメソッドがあるのでrankingにしといた。
df[['ranking', 'data']].sort_values('data',ascending=False)#順位は左端につけたいのでSELECTしてdataで並び替え
df.loc[0, 'data'] = 3
df

Unnamed: 0,data,ranking
0,3,7.0
1,4,5.0
2,3,6.0
3,5,4.0
4,8,2.5
5,8,2.5
6,16,1.0


### 今回一番スコアがよかったのは、
- クロスバリデーション使用
- ステージ、収容人数、ホームチーム、アウェイチーム(チーム番号統一)、審判、試合日、天気、試合節、開始時間(一時間単位)
- 浦和レッズ(ホーム)のｙを -5000
#### でした。