In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path
import seaborn as sns

In [3]:
import os
import sys

In [4]:
# chaging directory

In [5]:
root, *_ = !pwd

In [6]:
if root.endswith('notebooks'):
    os.chdir('../')
    root, *_ = !pwd
    sys.path.append('src/')

In [7]:
os.getcwd()

'/home/med/projects/competitions/mlb-kaggle'

In [8]:
from dotenv import load_dotenv
load_dotenv()

True

In [9]:
RAW_PATH = Path(os.environ['RAW_PATH'])

In [10]:
PROCESSED_PATH = Path(os.environ['ROOT_DIR']) / 'processed'

In [11]:
os.listdir(RAW_PATH)

['seasons.csv',
 'teams.csv',
 'example_sample_submission.csv',
 'train_updated.csv',
 'train.csv',
 'mlb-player-digital-engagement-forecasting.zip',
 'awards.csv',
 'players.csv',
 'example_test.csv',
 'mlb',
 'train_updated.csv.zip']

In [12]:
os.listdir(PROCESSED_PATH)

['target.csv', 'context_raw_data.csv', 'raw_data.csv']

In [13]:
dates_cols = ['date', 'mlbDebutDate', 'DOB', 'rosterDate', 'pstatsDate']

In [14]:
df = pd.read_csv(PROCESSED_PATH / 'raw_data.csv',
                 parse_dates=dates_cols)

### players data

In [15]:
# not all players are in the private test set
# the players in the test set are saved in the players.csv file

In [16]:
_players = pd.read_csv(RAW_PATH / 'players.csv')

In [17]:
_players['playerForTestSetAndFuturePreds'].fillna(False, inplace=True)

In [18]:
player_in_test = _players.query('playerForTestSetAndFuturePreds')['playerId']

In [19]:
# player_in_test holds only the players in the test set

### filter players

In [20]:
from data.util import filter_by_date, filter_by_id

In [21]:
# # filter only test playersex
df = filter_by_id(df, player_in_test)

In [22]:
# if you want to run on a sample,
# change sample_size to the number of players in the sample
sample_size = None

In [23]:
if sample_size is not None and sample_size > 0:
    df['target_avg'] = df[[f'target{i}' for i in range(1, 5)]].mean(axis=1)
    playerList = df.groupby(['playerId'])['target_avg'].std().sort_values(ascending=False).reset_index()
    df = filter_by_id(df, playerList['playerId'].head(sample_size))
    df.drop('target_avg', axis=1, inplace=True)

In [24]:
df.playerId.nunique()

1187

In [25]:
df.head()

Unnamed: 0,playerId,target1,target2,target3,target4,date,home,positionName,pstatsDate,battingOrder,...,weekday,inSeason,seasonPart,DOB,mlbDebutDate,birthCountry,weight,primaryPositionName,height,playerBMI
0,405395,0.151872,21.449416,0.112952,22.990196,2018-01-01,,,NaT,,...,0,False,Offseason,1980-01-16,2001-04-02,Dominican Republic,106.59412,First Base,1.905,29.372661
1,405395,0.136406,10.978585,0.096403,15.669173,2018-01-02,,,NaT,,...,1,False,Offseason,1980-01-16,2001-04-02,Dominican Republic,106.59412,First Base,1.905,29.372661
2,405395,0.06446,9.22619,0.101464,23.172829,2018-01-03,,,NaT,,...,2,False,Offseason,1980-01-16,2001-04-02,Dominican Republic,106.59412,First Base,1.905,29.372661
3,405395,0.024217,10.170965,0.102448,35.359116,2018-01-04,,,NaT,,...,3,False,Offseason,1980-01-16,2001-04-02,Dominican Republic,106.59412,First Base,1.905,29.372661
4,405395,0.157556,4.294307,0.046517,12.347789,2018-01-05,,,NaT,,...,4,False,Offseason,1980-01-16,2001-04-02,Dominican Republic,106.59412,First Base,1.905,29.372661


In [26]:
list(df.columns)

['playerId',
 'target1',
 'target2',
 'target3',
 'target4',
 'date',
 'home',
 'positionName',
 'pstatsDate',
 'battingOrder',
 'groundOuts',
 'runsScored',
 'homeRuns',
 'strikeOuts',
 'baseOnBalls',
 'intentionalWalks',
 'hits',
 'hitByPitch',
 'stolenBases',
 'groundIntoTriplePlay',
 'plateAppearances',
 'totalBases',
 'rbi',
 'catchersInterference',
 'pickoffs',
 'gamesPlayedPitching',
 'gamesStartedPitching',
 'completeGamesPitching',
 'winsPitching',
 'lossesPitching',
 'groundOutsPitching',
 'runsPitching',
 'strikeOutsPitching',
 'atBatsPitching',
 'caughtStealingPitching',
 'inningsPitched',
 'saveOpportunities',
 'battersFaced',
 'outsPitching',
 'balks',
 'pickoffsPitching',
 'gamesFinishedPitching',
 'inheritedRunners',
 'catchersInterferencePitching',
 'sacBuntsPitching',
 'saves',
 'holds',
 'blownSaves',
 'assists',
 'putOuts',
 'chances',
 'SLG',
 'runsScored__date__maxNorm',
 'homeRuns__date__maxNorm',
 'hits__date__maxNorm',
 'SLG__date__maxNorm',
 'rbi__date__maxNor

### preprocessing with pipelines

In [27]:
from pipeline.core import FunctionTransformer, Pipeline, PdColumnTransformer, PdFeatureUnion
from pipeline.continuous import PdScaleNorm, FilterContinuousFeatures, MedianFillNaN
from pipeline.stats import StatisticGen, LagGen, FeaturesTable
from pipeline.categories import Categorify
from pipeline.season import join_season_info
from pipeline.players import join_players_info
from pipeline.core import forward_fill, gen_hardcoded_features, fillna

In [28]:
inplace_ffill_features = ['numberOfFollowers',
                          'teamFollowers',
                          'status',
                          'teamId',
                          'rosterDate',
                          'pstatsDate']

In [29]:
# transformer for filling nan values with past valid values
# transformer for generating some hardcoded features
inplace_fill_forward_tmf = FunctionTransformer(forward_fill,
                                               kw_args={'features': inplace_ffill_features,
                                                        'on': ['playerId', 'year']})
harcoded_feat_tmf = FunctionTransformer(gen_hardcoded_features)

In [30]:
# creating pipeline
ffill_and_gen_harcoded_feat = Pipeline([('inplace_fill_forward', inplace_fill_forward_tmf),
                                        ('hardcoded_feat', harcoded_feat_tmf)])

In [31]:
# features to use for target preprocessing
target_cols = ['target1', 'target2', 'target3', 'target4']
index_cols = ['date', 'playerId']
pp_cols = target_cols + index_cols

In [32]:
# categories encoding
categories = ['playerId', 'seasonPart', 'year',
              'primaryPositionName',
              
              'status', 'teamId', 'weekday',
              'opponentTeamId']

In [33]:
categories_tmf = PdColumnTransformer([(category, Categorify(add_nan=True), category)
                                       for category in categories])

In [34]:
# for the statistics, we will compute them and then resuse them
stats_gen_tmf = StatisticGen(stats=['mean', 'median', 'std'], windows=[30], drop_index=False)
target_stats_tmf = PdColumnTransformer([('gen_stats', stats_gen_tmf, pp_cols)])
target_stats_table = target_stats_tmf.fit_transform(df)


target_stats_table.head()

Unnamed: 0,playerId,date,target1__mean__30d,target1__median__30d,target1__std__30d,target2__mean__30d,target2__median__30d,target2__std__30d,target3__mean__30d,target3__median__30d,target3__std__30d,target4__mean__30d,target4__median__30d,target4__std__30d
0,405395,2018-01-01,0.151872,0.151872,,21.449417,21.449417,,0.112952,0.112952,,22.990196,22.990196,
1,405395,2018-01-02,0.144139,0.144139,0.010936,16.214001,16.214001,7.403996,0.104677,0.104677,0.011702,19.329685,19.329685,5.176745
2,405395,2018-01-03,0.11758,0.136406,0.046648,13.88473,10.978585,6.609544,0.103606,0.101464,0.00848,20.610733,22.990196,4.28049
3,405395,2018-01-04,0.094239,0.100433,0.060248,12.956289,10.574775,5.707194,0.103317,0.101956,0.006948,24.297829,23.081512,8.160501
4,405395,2018-01-05,0.106902,0.136406,0.059365,11.223893,10.170965,6.279732,0.091957,0.101464,0.026104,21.90782,22.990196,8.860362


In [35]:
## merge computed statistics
reuse_computed_stats = Pipeline([('merge_stats', FeaturesTable(target_stats_table, lags=[45], clip_max=True)),
                                 ('fillnan', FunctionTransformer(fillna)),
                                ])

In [36]:
# for continuous features, we will filter them and then fill nan values and normalize them

median_fillnan_columns = ['numberOfFollowers', 'teamFollowers',
                          'playerDebutAge', 'playerAge', 'playerTSinceDebut']
fillnan_median_tmf = MedianFillNaN(median_fillnan_columns)

continuous_feat_tmf = Pipeline([('get_cont_feat', FilterContinuousFeatures(ignore_features=categories +
                                                                           pp_cols +
                                                                           ['target_avg'])),
                                ('fillnan_median', fillnan_median_tmf),
                                ('normalize', PdScaleNorm()),
                                ('fillnan', FunctionTransformer(fillna, kw_args={'fill_value': -1})),
                               ])

In [37]:
from pipeline.core import FilterFeatures

In [38]:
player_tv_cont_features = FilterFeatures(['runsScored', 'homeRuns',
                                          'hits', 'SLG',
                                          'rbi',
                                          'plateAppearances',
                                          'runsPitching',
                                          'playerId', 'date']).fit(df)._features

In [39]:
player_tv_cont_features

['playerId',
 'date',
 'runsScored',
 'homeRuns',
 'hits',
 'plateAppearances',
 'rbi',
 'runsPitching',
 'SLG',
 'playerTeamrunsScored',
 'playerTeamhomeRuns',
 'playerTeamhits',
 'playerTeamrunsPitching',
 'opponentTeamrunsScored',
 'opponentTeamhomeRuns',
 'opponentTeamhits',
 'opponentTeamrunsPitching']

In [40]:
## moving stats for players stats
# get the rest of the continuous features
stats_gen_cont_features = StatisticGen(stats=['sum', 'max'],
                             windows=[7],
                             drop_index=True)
_mov_cont_feat_tmf = Pipeline([('fillnan', FunctionTransformer(fillna)),
                               ('cont_stats', stats_gen_cont_features),
                                ('normalize', PdScaleNorm()),
                                ])
mov_cont_feat_tmf = PdColumnTransformer([('cont', _mov_cont_feat_tmf, player_tv_cont_features)]
                                 )

In [41]:
# putting all features generation steps together
all_features_tmf = PdFeatureUnion([('stats', reuse_computed_stats), 
                                   ('cont_feat', continuous_feat_tmf), 
                                   ('cats', categories_tmf),
                                   ('mov_feat', mov_cont_feat_tmf),
                                  ])

In [42]:
# main pipeline with all steps 
pipeline = Pipeline([('core_features', ffill_and_gen_harcoded_feat),
                     ('features_generation', all_features_tmf)])

In [43]:
pipeline

Pipeline(steps=[('core_features',
                 Pipeline(steps=[('inplace_fill_forward',
                                  FunctionTransformer(func=<function forward_fill at 0x7fa6ae5719d0>,
                                                      kw_args={'features': ['numberOfFollowers',
                                                                            'teamFollowers',
                                                                            'status',
                                                                            'teamId',
                                                                            'rosterDate',
                                                                            'pstatsDate'],
                                                               'on': ['playerId',
                                                                      'year']})),
                                 ('hardcoded_feat',
                                  FunctionTrans

### modeling 

In [44]:
from omegaconf import OmegaConf
hp = {'epochs': 10,
      'batch_size': 512,
      'max_emb_sz': 20,
      'encoder_dim': 512,
      'depth': 2,
      'emb_dropout': 0.5,
      'feature_reduction_dim': 32,
      'dropout': 0.5,
      'decrease_factor': 0.5,
      'drop_decrease_factor': 1.,
      'lr': 0.001,
      'wd': 3e-5,
      'scale_output': True}

In [45]:
from train.cont_emb_stack import run_emb_model_fn
from evaluate.metrics import compute_metrics

  rank_zero_deprecation(


In [46]:
def filter_by_idx(df, idx):
    return df.loc[idx, :].reset_index(drop=True)
periods=['Reg Season 1st Half',
                             'All-Star Break',
                             'Reg Season 2nd Half',
                             'Postseason']
def permute_dataset(df, 
                    reduce_size: int = 0.2,
                    increase_size: int = 2.5,
                    periods=periods, seed=2):
    np.random.seed(seed)
    to_keep = df.seasonPart.isin(periods)
    to_sample = ~to_keep
    
    sample_df  = df.loc[to_sample, :].sample(frac=reduce_size,
                                             random_state=seed)
    outputX = df.loc[to_keep, :]
    increase_df = outputX.sample(frac=increase_size, random_state=seed, replace=True)
    
    outputX = pd.concat([sample_df, outputX, increase_df])
    outputX.sort_values(by=['playerId', 'date'], inplace=True)
    return outputX.index

In [47]:
def check_categories(train_data, valid_data, categories):
    for cat in categories:
        tcat = train_data[cat].unique()
        tval = valid_data[cat].unique()
        assert np.isin(tval, tcat).all(), cat

In [48]:
# train_data = permute_dataset(train_data)

In [49]:
# valid_data[categories]

In [None]:
test_dates = ['2021-04-01', '2021-05-01',
              '2021-06-01', '2021-07-01',
              '2021-08-01']
output = []

cv_prediction = []
for start_test_date, end_test_date in zip(test_dates[:-1], test_dates[1:]):
    train_idx = df['date'] < start_test_date
    valid_idx = df['date'].between(start_test_date, end_test_date, inclusive=False)
    
    print(f'evaluating from {start_test_date} to {end_test_date}')
    
    print('training pipeline')
    train_data = df.loc[train_idx, :]
    train_idx = permute_dataset(train_data)
    pipeline.fit(filter_by_idx(df, train_idx))
    print('executing pipeline')
    features = pipeline.transform(df)
    cont_features = features.columns.drop(categories).to_list()
    
    
    train_data = filter_by_idx(features, train_idx)
    valid_data = filter_by_idx(features, valid_idx)
    
    # add the target to features ds
    train_data[target_cols + ['date']] = df.loc[train_idx, target_cols + ['date']].to_numpy()
    valid_data[target_cols + ['date']] = df.loc[valid_idx, target_cols + ['date']].to_numpy()
    
    check_categories(train_data, valid_data, categories)
    config_hp = {'hp': hp,
                 'seed': 2021,
                 'features': cont_features,
                 'categories': categories,
                 'target_cols': target_cols}
    config_hp = OmegaConf.create(config_hp)
    fold_output = run_emb_model_fn(config_hp, train_data, valid_data)
    
    _prediction = fold_output.prediction[:]

    fold_prediction = pd.DataFrame(_prediction, columns=['yhat1', 'yhat2', 'yhat3', 'yhat4'])
    
    fold_prediction[pp_cols] = df.loc[valid_idx, pp_cols].to_numpy()
    output.append(fold_output)
    cv_prediction.append(fold_prediction)
    print('score')
    print(compute_metrics(fold_prediction))

cv_prediction = pd.concat(cv_prediction, axis=0, ignore_index=True)

evaluating from 2021-04-01 to 2021-05-01
training pipeline
executing pipeline


GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name         | Type           | Params
------------------------------------------------
0 | emb          | EmbeddingLayer | 24.3 K
1 | output_layer | Sequential     | 230 K 
------------------------------------------------
254 K     Trainable params
0         Non-trainable params
254 K     Total params
1.018     Total estimated model params size (MB)


RegressionEmbModel(
  (emb): EmbeddingLayer(
    (dropout): Dropout(p=0.5, inplace=False)
    (emb): ModuleDict(
      (playerId): Embedding(1187, 20)
      (seasonPart): Embedding(7, 3)
      (year): Embedding(4, 2)
      (primaryPositionName): Embedding(9, 3)
      (status): Embedding(14, 4)
      (teamId): Embedding(31, 6)
      (weekday): Embedding(7, 3)
      (opponentTeamId): Embedding(33, 6)
    )
  )
  (output_layer): Sequential(
    (0): Linear(in_features=190, out_features=512, bias=True)
    (1): LeakyReLU(negative_slope=0.2)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): LeakyReLU(negative_slope=0.2)
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=256, out_features=4, bias=True)
  )
)


Validation sanity check: 0it [00:00, ?it/s]

  value = torch.tensor(value, device=device, dtype=torch.float)


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

loading model to best score
best score = 1.5207170248031616
score
{'mae_mean': 1.5207169597575627, 'mae_1': 1.1763733763852404, 'mae_2': 2.3430143461165542, 'mae_3': 0.9374234175738508, 'mae_4': 1.6260566989546057}
evaluating from 2021-05-01 to 2021-06-01
training pipeline


In [None]:
# from train.core import Ensemble, predict_recursive
# ensemble = Ensemble([output], pipeline)

In [None]:
# %%time
# raw_train_fr = raw_train_df[df.columns].copy()

# recursive_prediction = []
# for date, val_date_df in raw_valid_df.groupby('date'):
#     val_date_df = val_date_df[df.columns].reset_index(drop=True)
#     _recurive_pred, raw_train_fr = predict_recursive(val_date_df,
#                                                      raw_train_fr,
#                                                      ensemble,
#                                                      n_days=90 + 60 + 10,
#                                                      target_cols=target_cols)
#     recursive_prediction.append(_recurive_pred)

# recursive_prediction = pd.concat(recursive_prediction, ignore_index=True)
# recursive_prediction.sort_values(['playerId', 'date'], inplace=True)
# recursive_prediction.reset_index(drop=True, inplace=True)
# recursive_prediction.rename(columns={f: f.replace('target', 'yhat')
#                                      for f in target_cols}, inplace=True)
# recursive_prediction[target_cols] = raw_valid_df[target_cols].to_numpy()

In [None]:
# last_mean_prediction = (raw_valid_df.set_index('date').groupby(['playerId'])[target_cols]
#                         .rolling(window=7, min_periods=1).mean().reset_index())
# last_mean_prediction

# hardest_ids = [660271]
# hardest_idx = raw_valid_df['playerId'].isin(hardest_ids)

# yhat_cols = ['yhat1', 'yhat2', 'yhat3', 'yhat4']
# valid_test_prediction_with_last_mean = valid_test_prediction.copy(deep=True)
# valid_test_prediction_with_last_mean.loc[hardest_idx, yhat_cols] = last_mean_prediction.loc[hardest_idx, target_cols].to_numpy()

# valid_test_prediction_with_last_mean

In [None]:
# valid_test_prediction.loc[hardest_idx].head()

In [None]:
# valid_test_prediction_with_last_mean.loc[hardest_idx].head()

In [None]:
# raw_cv_prediction = cv_prediction.copy(deep=True)

In [None]:
cv_prediction = filter_by_id(cv_prediction, player_in_test)

In [None]:
# metrics from 2021-03-01 up to 2021-04-30
compute_metrics(cv_prediction)

In [None]:
# # metrics from 2021-03-01 up to 2021-04-30
# compute_metrics(valid_test_prediction_with_last_mean)

In [None]:
(cv_prediction.groupby(cv_prediction.date.dt.month)
 .apply(lambda x: pd.Series(compute_metrics(x))))

In [None]:
# (valid_test_prediction_with_last_mean.groupby(valid_test_prediction.date.dt.month)
#  .apply(lambda x: pd.Series(compute_metrics(x))))

In [None]:
# (april_set_with_last_mean.groupby(['playerId'])
#  .apply(lambda x: pd.Series(compute_metrics(x)))
#  .sort_values(by='mae_mean', ascending=False))

In [None]:
(cv_prediction.groupby(['playerId'])
 .apply(lambda x: pd.Series(compute_metrics(x)))
 .sort_values(by='mae_mean', ascending=False))

In [None]:
hardest_id = filter_by_id(cv_prediction, [660271])

In [None]:
(hardest_id.groupby(['playerId', hardest_id.date.dt.month])
 .apply(lambda x: pd.Series(compute_metrics(x)))
 .sort_values(by='mae_mean', ascending=False))

In [None]:
# # metrics from 2021-03-01 up to 2021-04-30
# compute_metrics(recursive_prediction)

In [None]:
# # metrics from 2021-03-01 up to 2021-03-31
# compute_metrics(valid_rc_prediction)

In [None]:
# # metrics from 2021-04-01 up to 2021-04-30
# compute_metrics(test_rc_prediction)

In [None]:
def plot_pred(df, playerId, index=1):
    pdf = df.query(f'playerId == {playerId}')
    
    f, ax = plt.subplots(1, 1, figsize=(20, 5))
    
    tname = f'target{index}'
    yhat_name = f'yhat{index}'
    
    ax.plot('date', tname, '-o', data=pdf, label=tname)
    ax.plot('date', yhat_name, '-o', data=pdf, label=tname)
    
    ax.legend()
    

def plot_feature_vs_target(df, feature, playerId, index=1):
    pdf = df.query(f'raw_playerId == {playerId}')
    
    f, ax = plt.subplots(1, 1, figsize=(20, 5))
    
    tname = f'target{index}'
    twinx = ax.twinx()
    ax.plot('date', tname, '-o', data=pdf, label=tname)
    twinx.plot('date', feature, '-o', data=pdf, label=feature, color='red')
    twinx.legend()
    ax.set_title(f'{tname} vs {feature}')
    

In [None]:
for index in range(1, 5):
    plot_pred(hardest_id, playerId=660271, index=index)

In [None]:
import numpy as np
import pandas as pd


def permutation_importance(predict_fn, data,
                           features,
                           target,
                           score_func,
                           times: int = 1):

    def _score(data):
        _prediction = predict_fn(data)
        prediction = data[target].copy()
        prediction[['yhat1', 'yhat2', 'yhat3', 'yhat4']] = _prediction
        error = score_func(prediction)
        return error['mae_mean']

    base_score = _score(data)
    fi = []

    for feature in features:
        permuted_data = data.copy()
        permuted_data[feature] = np.random.permutation(permuted_data[feature])
        feature_score = _score(permuted_data)
        feature_importance = {'feature': feature,
                              'score': feature_score,
                              'importance': feature_score-base_score,
                              }
        fi.append(feature_importance)
    fi = pd.DataFrame(fi)
    fi['importance'] *= 100
    fi.sort_values(by='importance', inplace=True, ascending=False)
    fi.reset_index(drop=True, inplace=True)
    return fi

In [None]:
fi_data = valid_data

In [None]:
fi_data = train_data.sample(frac=0.2).reset_index(drop=True)

In [None]:
all_features_list = list(features.columns)

In [None]:
%%time
fi_reg = permutation_importance(output.predict_fn, valid_data, 
                                all_features_list, target=target_cols, 
                                score_func=compute_metrics,
                                times=5)

In [None]:
fi_reg

In [None]:
valid_data['raw_playerId'] = raw_valid_df['playerId']

In [None]:
for index in range(1, 5):
    plot_feature_vs_target(valid_data, 'playerTeamoutsPitching__date__maxNorm', playerId=660271, index=index)

In [None]:
output_path = 'prediction/drop_lstm'
os.makedirs(output_path, exist_ok=True)

In [None]:
# valid_test_prediction.to_csv(os.path.join(output_path, 'april_to_jul.csv'), index=False)