<a href="https://colab.research.google.com/github/gdollp/mlb/blob/main/lgbm_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook uses lightGBM to make predictions.

We use the following features
* playerId
* position
* teamId(rosters)
* status(rosters)
* playerBoxScores

and the date 20200401~20200431 as the validation data.

But I think there is room for improvement.  
If you have better ways, I would appreciate it if you could comment on it.

このnotebookではlightGBMを使って予測します。

特徴量は以下のものを使用しています。
* playerId
* position
* teamId(rosters)
* status(rosters)
* playerBoxScores

20200401~20200431を日時をvalidation dataとしていますが、一考の余地がありそうです。  
もし良さそうな方法があればコメントしていただけると幸いです。



In [37]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
#gamesを特徴量に追加


#基本指針 #新しい特徴量を考える　or 利用できていないデータセットの利用

## About Dataset

Train.csv is stored as a csv file with each column as follows.  

train.csvを以下のようにして各カラムをcsvファイルとして保管しています。

In [39]:
%%capture
"""
!pip install pandarallel 

import gc

import numpy as np
import pandas as pd
from pathlib import Path

from pandarallel import pandarallel
pandarallel.initialize()

BASE_DIR = Path('../input/mlb-player-digital-engagement-forecasting')
train = pd.read_csv(BASE_DIR / 'train.csv')

null = np.nan
true = True
false = False

for col in train.columns:

    if col == 'date': continue

    _index = train[col].notnull()
    train.loc[_index, col] = train.loc[_index, col].parallel_apply(lambda x: eval(x))

    outputs = []
    for index, date, record in train.loc[_index, ['date', col]].itertuples():
        _df = pd.DataFrame(record)
        _df['index'] = index
        _df['date'] = date
        outputs.append(_df)

    outputs = pd.concat(outputs).reset_index(drop=True)

    outputs.to_csv(f'{col}_train.csv', index=False)
    outputs.to_pickle(f'{col}_train.pkl')

    del outputs
    del train[col]
    gc.collect()
"""

## Training

In [40]:
#!pip install optuna

In [41]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import mean_absolute_error
from datetime import timedelta
from functools import reduce
from tqdm import tqdm
#import optuna.integration.lightgbm as lgbm
import lightgbm as lgbm
import pickle
#import mlb
import sklearn #機械学習のライブラリ
from sklearn.decomposition import PCA #主成分分析器
from sklearn.preprocessing import LabelEncoder
import gc

In [42]:
BASE_DIR = Path('/content/drive/MyDrive/mlb/input')
TRAIN_DIR = Path('/content/drive/MyDrive/mlb/input/archive')

In [43]:
players = pd.read_csv(BASE_DIR / 'players.csv')

rosters = pd.read_pickle(TRAIN_DIR / 'rosters_train.pkl')
targets = pd.read_pickle(TRAIN_DIR / 'nextDayPlayerEngagement_train.pkl')
scores1 = pd.read_pickle(TRAIN_DIR / 'playerBoxScores_train.pkl')
scores = scores1.groupby(['playerId', 'date']).sum().reset_index()
playertwitter = pd.read_pickle("/content/drive/MyDrive/mlb/input/archive/playerTwitterFollowers_train.pkl")
games = pd.read_pickle(TRAIN_DIR / 'games_train.pkl')
events = pd.read_pickle(TRAIN_DIR / 'events_train.pkl')
standings = pd.read_pickle(TRAIN_DIR / 'standings_train.pkl')
teamtwitter = pd.read_pickle(TRAIN_DIR / 'teamTwitterFollowers_train.pkl')
transaction = pd.read_pickle(TRAIN_DIR / 'teamTwitterFollowers_train.pkl')
awards = pd.read_csv(BASE_DIR / 'awards.csv')
seasons = pd.read_csv(BASE_DIR / 'seasons.csv')
teams = pd.read_csv(BASE_DIR / 'teams.csv')
player_target_stats = pd.read_csv(BASE_DIR /"player_target_stats.csv")


In [44]:
events_p = events
events_p["playerId"] = events_p["pitcherId"]
events_h = events
events_h["playerId"] = events_p["hitterId"]

In [45]:
awards2 = awards.groupby("playerId").count()
awards2 = awards2.reset_index()
awards2

Unnamed: 0,playerId,awardDate,awardSeason,awardId,awardName,playerName,awardPlayerTeamId
0,112526,12,12,12,12,12,12
1,134181,38,38,38,38,38,38
2,282332,24,24,24,24,24,24
3,400085,43,43,43,43,43,43
4,400121,20,20,20,20,20,20
...,...,...,...,...,...,...,...
1691,671250,1,1,1,1,1,1
1692,675912,2,2,2,2,2,2
1693,676701,1,1,1,1,1,1
1694,676709,1,1,1,1,1,1


In [46]:
targets_cols = ['playerId', 'target1', 'target2', 'target3', 'target4', 'date']
players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status', 'date']
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances', 'date',"gamePk"]
games_cols = ["gamePk","homeId","dayNight","seriesDescription","gamesInSeries","homeWinner","awayWinner","homeScore","awayScore","gameType"]
playertwitter_cols = ["date","playerId","numberOfFollowers"]
awards_cols = ["playerId","awardName"]

feature_cols1 = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',"ishome","label_seriesDescription","gamesInSeries","label_daynight","gameType"]#winorlose"]


feature_cols2 = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',"ishome","label_seriesDescription","gamesInSeries","label_daynight","gameType","awardName",'target2_mean',
 'target2_median',
 'target2_std',
 'target2_min',
 'target2_max','target2_prob']#winorlose"]

feature_cols3 = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',"ishome","label_seriesDescription","gamesInSeries","label_daynight","gameType","awardName",
 'target3_mean',
 'target3_median',
 'target3_std',
 'target3_min',
 'target3_max',
 'target3_prob',
]

       #winorlose"]

feature_cols4 = ['label_playerId', 'label_primaryPositionName', 'label_teamId',
       'label_status', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances',"ishome","label_seriesDescription","gamesInSeries","label_daynight","awardName",'target4_mean',
 'target4_median',
 'target4_std',
 'target4_min',
 'target4_max',
 'target4_prob']

In [47]:
# creat dataset
train = targets[targets_cols].merge(players[players_cols], on=['playerId'], how='left')
train = train.merge(rosters[rosters_cols], on=['playerId', 'date'], how='left')
train = train.merge(scores[scores_cols], on=['playerId', 'date'], how='left')
train = train.merge(games[games_cols], on=["gamePk"], how="left")
train = train.merge(playertwitter[playertwitter_cols], on=['playerId', 'date'], how='left')
#train = train.merge(events[events_cols], on=['gamePk', 'date'], how='left')
train = train.merge(awards2[awards_cols], on=['playerId'], how='left')
train = train.merge(player_target_stats, how='inner', left_on=["playerId"],right_on=["playerId"])

In [48]:
train["ongame"] = np.where(train["gamePk"].isnull() == 1,0,1)
train["ongame"].unique()
train["ishome"] = ""
train["ishome"] = np.where(train["teamId"]==train["homeId"],2,train["ongame"])
train["winorlose"] = np.where(train["teamId"]==train["homeId"],train["homeWinner"],train["awayWinner"])
train["winorlose"] = train["winorlose"].fillna(2.0).astype(int)
train["winorlose"].unique()
train["score"] = np.where(train["teamId"]==train["homeId"],train["homeScore"],train["awayScore"])

In [49]:


# label encoding
player2num = {c: i for i, c in enumerate(train['playerId'].unique())}
position2num = {c: i for i, c in enumerate(train['primaryPositionName'].unique())}
teamid2num = {c: i for i, c in enumerate(train['teamId'].unique())}
status2num = {c: i for i, c in enumerate(train['status'].unique())}
daynight2num = {c: i for i, c in enumerate(train['dayNight'].unique())}
seriesDescription2num = {c: i for i, c in enumerate(train['seriesDescription'].unique())}
gameType2num = {c: i for i, c in enumerate(train['gameType'].unique())}
train['label_playerId'] = train['playerId'].map(player2num)
train['label_primaryPositionName'] = train['primaryPositionName'].map(position2num)
train['label_teamId'] = train['teamId'].map(teamid2num)
train['label_status'] = train['status'].map(status2num)
train["label_daynight"] = train['dayNight'].map(daynight2num)
train["label_seriesDescription"] = train["seriesDescription"].map(seriesDescription2num)
train["gameType"] = train["gameType"].map(gameType2num)


In [50]:
#make feature


In [51]:
train_X1 = train[feature_cols1]
train_X2 = train[feature_cols2]
train_X3 = train[feature_cols3]
train_X4 = train[feature_cols4]

"""train_data = [train_X1,train_X2,train_X3,train_X4]

remove1 = []
remove2 = []
remove3 = []
remove4 = []
remove_col = [remove1,remove2,remove3,remove4]"""

train_y = train[['target1', 'target2', 'target3', 'target4']]
"""
for i in range(len(train_data)):
  for col in train_data[i].columns:
    if train_data[i][col].std() == 0:
        remove_col[i].append(col)
        train_data[i].drop(remove_col[i], axis=1, inplace=True) 
"""

'\nfor i in range(len(train_data)):\n  for col in train_data[i].columns:\n    if train_data[i][col].std() == 0:\n        remove_col[i].append(col)\n        train_data[i].drop(remove_col[i], axis=1, inplace=True) \n'

In [52]:
#train.to_csv("/content/drive/MyDrive/mlb/script/output/train.csv")


_index = (train['date'] < 20210401)

del train
gc.collect()


x_train1 = train_X1.loc[_index].reset_index(drop=True)
y_train = train_y.loc[_index].reset_index(drop=True)
x_valid1 = train_X1.loc[~_index].reset_index(drop=True)
y_valid = train_y.loc[~_index].reset_index(drop=True)

x_train2 = train_X2.loc[_index].reset_index(drop=True)
x_valid2 = train_X2.loc[~_index].reset_index(drop=True)
x_train3 = train_X3.loc[_index].reset_index(drop=True)
x_valid3 = train_X3.loc[~_index].reset_index(drop=True)
x_train4 = train_X4.loc[_index].reset_index(drop=True)
x_valid4 = train_X4.loc[~_index].reset_index(drop=True)

del train_X1,train_X2,train_X3,train_X4
gc.collect()

0

In [53]:
def fit_lgbm(x_train, y_train, x_valid, y_valid, params: dict=None, verbose=100):
    oof_pred = np.zeros(len(y_valid), dtype=np.float32)
    #model = lgbm.LGBMRegressor(**params)
    best_params, tuning_history = {}, []
    
    lgb_train = lgbm.Dataset(x_train, y_train)
    lgb_eval = lgbm.Dataset(x_valid, y_valid, reference=lgb_train)
    model = lgbm.train(params, 
        lgb_train, valid_sets=lgb_eval,  
        early_stopping_rounds=100,
        num_boost_round=1000,
        verbose_eval=50,
        #verbose=verbose,               
        )
    oof_pred = model.predict(x_valid)
    score = mean_absolute_error(oof_pred, y_valid)
    print('mae:', score)
    return oof_pred, model, score


# training lightgbm
params = {
    'boosting_type': 'gbdt',
    'objective': 'mae',
    'metric': "mae",
    'num_leaves': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

oof1, model1, score1 = fit_lgbm(
    x_train1, y_train['target1'],
    x_valid1, y_valid['target1'],
    params
)
file = '/content/drive/MyDrive/mlb/script/output/model1.pkl'
pickle.dump(model1, open(file, 'wb'))

oof2, model2, score2 = fit_lgbm(
    x_train2, y_train['target2'],
    x_valid2, y_valid['target2'],
    params
)
file = '/content/drive/MyDrive/mlb/script/output/model2.pkl'
pickle.dump(model2, open(file, 'wb'))
oof3, model3, score3 = fit_lgbm(
    x_train3, y_train['target3'],
    x_valid3, y_valid['target3'],
    params
)
file = '/content/drive/MyDrive/mlb/script/output/model3.pkl'
pickle.dump(model3, open(file, 'wb'))
oof4, model4, score4 = fit_lgbm(
    x_train4, y_train['target4'],
    x_valid4, y_valid['target4'],
    params
)
file = '/content/drive/MyDrive/mlb/script/output/model4.pkl'
pickle.dump(model4, open(file, 'wb'))
score = (score1+score2+score3+score4) / 4
print(f'score: {score}')

Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l1: 0.733751
[100]	valid_0's l1: 0.726875
[150]	valid_0's l1: 0.722664
[200]	valid_0's l1: 0.721619
[250]	valid_0's l1: 0.719805
[300]	valid_0's l1: 0.717297
[350]	valid_0's l1: 0.714977
[400]	valid_0's l1: 0.71299
[450]	valid_0's l1: 0.712241
[500]	valid_0's l1: 0.712146
[550]	valid_0's l1: 0.711227
[600]	valid_0's l1: 0.710108
[650]	valid_0's l1: 0.709487
[700]	valid_0's l1: 0.707625
[750]	valid_0's l1: 0.707
[800]	valid_0's l1: 0.705765
[850]	valid_0's l1: 0.705284
[900]	valid_0's l1: 0.704198
[950]	valid_0's l1: 0.703702
[1000]	valid_0's l1: 0.703575
Did not meet early stopping. Best iteration is:
[954]	valid_0's l1: 0.703565
mae: 0.7035653932664931
Training until validation scores don't improve for 100 rounds.
[50]	valid_0's l1: 1.44841
[100]	valid_0's l1: 1.44157
[150]	valid_0's l1: 1.4469
Early stopping, best iteration is:
[77]	valid_0's l1: 1.43814
mae: 1.4381386885144432
Training until validation sco

In [54]:
importance = pd.DataFrame(model1.feature_importance(), index=x_train1.columns, columns=['importance'])
pd.set_option('display.max_rows', 500)
importance

Unnamed: 0,importance
label_playerId,29829
label_primaryPositionName,9062
label_teamId,5147
label_status,1152
battingOrder,172
gamesPlayedBatting,1
flyOuts,0
groundOuts,1
runsScored,5
doubles,2


## Inference

In [55]:
"""players_cols = ['playerId', 'primaryPositionName']
rosters_cols = ['playerId', 'teamId', 'status']
scores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',
       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',
       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',
       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',
       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',
       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',
       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',
       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',
       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',
       'groundOutsPitching', 'runsPitching', 'doublesPitching',
       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',
       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',
       'hitByPitchPitching', 'atBatsPitching', 'caughtStealingPitching',
       'stolenBasesPitching', 'inningsPitched', 'saveOpportunities',
       'earnedRuns', 'battersFaced', 'outsPitching', 'pitchesThrown', 'balls',
       'strikes', 'hitBatsmen', 'balks', 'wildPitches', 'pickoffsPitching',
       'rbiPitching', 'gamesFinishedPitching', 'inheritedRunners',
       'inheritedRunnersScored', 'catchersInterferencePitching',
       'sacBuntsPitching', 'sacFliesPitching', 'saves', 'holds', 'blownSaves',
       'assists', 'putOuts', 'errors', 'chances']

null = np.nan
true = True
false = False

env = mlb.make_env() # initialize the environment
iter_test = env.iter_test() # iterator which loops over each date in test set

for (test_df, sample_prediction_df) in iter_test: # make predictions here
    
    sample_prediction_df = sample_prediction_df.reset_index(drop=True)
    
    # creat dataset
    sample_prediction_df['playerId'] = sample_prediction_df['date_playerId']\
                                        .map(lambda x: int(x.split('_')[1]))
    # Dealing with missing values
    if test_df['rosters'].iloc[0] == test_df['rosters'].iloc[0]:
        test_rosters = pd.DataFrame(eval(test_df['rosters'].iloc[0]))
    else:
        test_rosters = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in rosters.columns:
            if col == 'playerId': continue
            test_rosters[col] = np.nan
            
    if test_df['playerBoxScores'].iloc[0] == test_df['playerBoxScores'].iloc[0]:
        test_scores = pd.DataFrame(eval(test_df['playerBoxScores'].iloc[0]))
    else:
        test_scores = pd.DataFrame({'playerId': sample_prediction_df['playerId']})
        for col in scores.columns:
            if col == 'playerId': continue
            test_scores[col] = np.nan
    test_scores = test_scores.groupby('playerId').sum().reset_index()
    test = sample_prediction_df[['playerId']].copy()
    test = test.merge(players[players_cols], on='playerId', how='left')
    test = test.merge(test_rosters[rosters_cols], on='playerId', how='left')
    test = test.merge(test_scores[scores_cols], on='playerId', how='left')

    test['label_playerId'] = test['playerId'].map(player2num)
    test['label_primaryPositionName'] = test['primaryPositionName'].map(position2num)
    test['label_teamId'] = test['teamId'].map(teamid2num)
    test['label_status'] = test['status'].map(status2num)
    
    test_X = test[feature_cols]
    
    # predict
    pred1 = model1.predict(test_X)
    pred2 = model2.predict(test_X)
    pred3 = model3.predict(test_X)
    pred4 = model4.predict(test_X)
    
    # merge submission
    sample_prediction_df['target1'] = np.clip(pred1, 0, 100)
    sample_prediction_df['target2'] = np.clip(pred2, 0, 100)
    sample_prediction_df['target3'] = np.clip(pred3, 0, 100)
    sample_prediction_df['target4'] = np.clip(pred4, 0, 100)
    sample_prediction_df = sample_prediction_df.fillna(0.)
    del sample_prediction_df['playerId']
    
    env.predict(sample_prediction_df)"""

"players_cols = ['playerId', 'primaryPositionName']\nrosters_cols = ['playerId', 'teamId', 'status']\nscores_cols = ['playerId', 'battingOrder', 'gamesPlayedBatting', 'flyOuts',\n       'groundOuts', 'runsScored', 'doubles', 'triples', 'homeRuns',\n       'strikeOuts', 'baseOnBalls', 'intentionalWalks', 'hits', 'hitByPitch',\n       'atBats', 'caughtStealing', 'stolenBases', 'groundIntoDoublePlay',\n       'groundIntoTriplePlay', 'plateAppearances', 'totalBases', 'rbi',\n       'leftOnBase', 'sacBunts', 'sacFlies', 'catchersInterference',\n       'pickoffs', 'gamesPlayedPitching', 'gamesStartedPitching',\n       'completeGamesPitching', 'shutoutsPitching', 'winsPitching',\n       'lossesPitching', 'flyOutsPitching', 'airOutsPitching',\n       'groundOutsPitching', 'runsPitching', 'doublesPitching',\n       'triplesPitching', 'homeRunsPitching', 'strikeOutsPitching',\n       'baseOnBallsPitching', 'intentionalWalksPitching', 'hitsPitching',\n       'hitByPitchPitching', 'atBatsPitching'