In [2]:
%load_ext autoreload
%autoreload 2


In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path
import seaborn as sns


In [4]:
import os
import sys

In [5]:
# chaging directory

In [6]:
root, *_ = !pwd

In [7]:
if root.endswith('notebooks'):
    os.chdir('../')
    root, *_ = !pwd
    sys.path.append('src/')

In [8]:
os.getcwd()

'/home/med/projects/competitions/mlb-kaggle'

In [9]:
from dotenv import load_dotenv
load_dotenv()

True

In [10]:
RAW_PATH = Path(os.environ['RAW_PATH'])

In [11]:
PROCESSED_PATH = Path(os.environ['ROOT_DIR']) / 'processed'

In [12]:
os.listdir(RAW_PATH)

['seasons.csv',
 'teams.csv',
 'example_sample_submission.csv',
 'train_updated.csv',
 'train.csv',
 'mlb-player-digital-engagement-forecasting.zip',
 'awards.csv',
 'players.csv',
 'example_test.csv',
 'mlb',
 'train_updated.csv.zip']

In [13]:
os.listdir(PROCESSED_PATH)

['target.csv', 'context_raw_data.csv', 'raw_data.csv']

In [14]:
raw_df = pd.read_csv(RAW_PATH / 'train_updated.csv',
                 parse_dates=['date'], nrows=150)

In [15]:
raw_df.set_index('date', inplace=True)

### teams 

In [45]:
from data.ingest_data.core import unpack_dataframe, has_duplicates, normalize_with_max

In [18]:
fields = unpack_dataframe(raw_df, fields={'teamBoxScores': 'teams'})

In [46]:
fields

{'teams':        home  teamId  gamePk    gameDate           gameTimeUTC  flyOuts  \
 0         1     109  529410  2018-03-29  2018-03-30T02:10:00Z        4   
 1         0     114  529409  2018-03-29  2018-03-30T02:10:00Z        4   
 2         1     121  529419  2018-03-29  2018-03-29T17:10:00Z        2   
 3         1     139  529406  2018-03-29  2018-03-29T20:00:00Z        2   
 4         1     140  529411  2018-03-29  2018-03-29T19:35:00Z        9   
 ...     ...     ...     ...         ...                   ...      ...   
 14481     0     139  633265  2021-07-16  2021-07-16T23:20:00Z        6   
 14482     1     144  633265  2021-07-16  2021-07-16T23:20:00Z        6   
 14483     1     145  633309  2021-07-16  2021-07-17T00:10:00Z        1   
 14484     1     113  633263  2021-07-16  2021-07-16T23:10:00Z        2   
 14485     1     120  633305  2021-07-16  2021-07-16T23:05:00Z        3   
 
        groundOuts  runsScored  doubles  triples  ...  balks  wildPitches  \
 0          

In [47]:
teams = fields['teams'].copy(deep=True)

In [48]:
team_stats_features = ['runsScored', 'homeRuns', 'strikeOuts', 'hits', 'runsPitching',
                       'homeRunsPitching', 'outsPitching','rbiPitching']

rank_features = ['runsScored', 'homeRuns', 'outsPitching']

def preprocess_teams(teams: pd.DataFrame):
    to_keep = ['teamId', 'date']
    # we drop teamId because this info is in roster]
    agg_teams = (teams.groupby(['teamId', 'date'])[team_stats_features]
                  .sum().reset_index())

    teams = (teams.drop_duplicates(subset=['teamId', 'date'], keep='last')
              .loc[:, to_keep])
    
    teams = teams.merge(agg_teams, on=['teamId', 'date'], how='left')
    assert not has_duplicates(teams, on=['teamId', 'date']), 'team stats include duplicates'
    return teams

In [63]:
def ingest_team_stats(teams: pd.DataFrame):
    teams = preprocess_teams(teams)
    teams = normalize_with_max(teams, on=['date'],
                                features=rank_features)
    return teams

def join_team_stats_to_pstats(pstats: pd.DataFrame,
                              teams: pd.DataFrame):
    features = teams.columns.drop(['date', 'teamId'])
    print(features)
    player_features = list('playerTeam' + features)
    away_features = list('opponentTeam' + features)
    
    # for the player team
    pstats = pstats.merge(teams, on=['teamId', 'date'], how='left')
    pstats.rename(columns={old: new
                           for old, new in zip(features, player_features)},
                  inplace=True)
    
    # for the away team
    teams.rename(columns={'teamId': 'opponentTeamId'}, inplace=True)
    pstats = pstats.merge(teams, on=['opponentTeamId', 'date'], how='left')
    pstats.rename(columns={old: new
                           for old, new in zip(features, away_features)},
                  inplace=True)
    
    return pstats

In [64]:
teams = ingest_team_stats(teams)

In [65]:
join_team_stats_to_pstats(teams , teams)

Index(['runsScored', 'homeRuns', 'strikeOuts', 'hits', 'runsPitching',
       'homeRunsPitching', 'outsPitching', 'rbiPitching',
       'runsScored__date__maxNorm', 'homeRuns__date__maxNorm',
       'outsPitching__date__maxNorm'],
      dtype='object')
['playerTeamrunsScored', 'playerTeamhomeRuns', 'playerTeamstrikeOuts', 'playerTeamhits', 'playerTeamrunsPitching', 'playerTeamhomeRunsPitching', 'playerTeamoutsPitching', 'playerTeamrbiPitching', 'playerTeamrunsScored__date__maxNorm', 'playerTeamhomeRuns__date__maxNorm', 'playerTeamoutsPitching__date__maxNorm'] ['opponentTeamrunsScored', 'opponentTeamhomeRuns', 'opponentTeamstrikeOuts', 'opponentTeamhits', 'opponentTeamrunsPitching', 'opponentTeamhomeRunsPitching', 'opponentTeamoutsPitching', 'opponentTeamrbiPitching', 'opponentTeamrunsScored__date__maxNorm', 'opponentTeamhomeRuns__date__maxNorm', 'opponentTeamoutsPitching__date__maxNorm']


### transactions

In [15]:
raw_df.columns

Index(['nextDayPlayerEngagement', 'games', 'rosters', 'playerBoxScores',
       'teamBoxScores', 'transactions', 'standings', 'awards', 'events',
       'playerTwitterFollowers', 'teamTwitterFollowers'],
      dtype='object')

In [None]:
from data.ingest_data.core import unpack_dataframe, has_duplicates, normalize_with_max

In [None]:
raw_df=raw_df.iloc[: 150]

In [None]:
fields = unpack_dataframe(raw_df, fields={'transactions': 'transactions'})

In [None]:
def ingest_transactions(transactions: pd.DataFrame):
    transactions = transactions.dropna(subset=['playerId'])
    transactions.loc[:, 'playerId'] = transactions.loc[:, 'playerId'].astype(np.int64)
    transactions = transactions.loc[:, ['playerId', 'toTeamId', 'typeDesc', 'date']]
    transactions.rename(columns={'toTeamId': 'MoveToTeamId',
                                 'typeDesc': 'transDesc'}, inplace=True)
    transactions.drop_duplicates(subset=['date', 'playerId'], keep='last', inplace=True)
    transactions.reset_index(drop=True, inplace=True)
    assert not has_duplicates(transactions), 'transactions has duplicates'
    return transactions

In [None]:
transactions = fields['transactions'].copy(deep=True)

In [29]:
transactions.set_index(['playerId', 'date']).loc[645302, '2018-09-04']

  transactions.set_index(['playerId', 'date']).loc[645302, '2018-09-04']


Unnamed: 0_level_0,Unnamed: 1_level_0,transactionId,playerName,fromTeamId,fromTeamName,toTeamId,toTeamName,effectiveDate,resolutionDate,typeCode,typeDesc,description
playerId,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
645302.0,2018-09-04,479110,Victor Robles,,,120,Washington Nationals,2018-09-04,2018-09-04,CU,Recalled,Washington Nationals recalled CF Victor Robles.
645302.0,2018-09-04,479252,Victor Robles,,,120,Washington Nationals,2018-09-04,2018-09-04,CU,Recalled,Washington Nationals recalled CF Victor Robles.
645302.0,2018-09-04,477011,Victor Robles,,,120,Washington Nationals,2018-09-04,2018-09-04,CU,Recalled,Washington Nationals recalled CF Victor Robles.
645302.0,2018-09-04,477923,Victor Robles,,,120,Washington Nationals,2018-09-04,2018-09-04,CU,Recalled,Washington Nationals recalled CF Victor Robles.
645302.0,2018-09-04,377804,Victor Robles,552.0,Syracuse Mets,120,Washington Nationals,2018-09-04,,CU,Recalled,Washington Nationals recalled CF Victor Robles...


In [30]:
transactions = ingest_transactions(transactions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


### events 

In [21]:
from data.ingest_data.core import unpack_dataframe, has_duplicates, normalize_with_max

In [22]:
fields = unpack_dataframe(raw_df, fields={'events': 'events'})

In [53]:
import gc

def get_first_item(x):
    assert len(x) == 1, 'the are multiple values'
    return x.iloc[0]

def ingest_events(events: pd.DataFrame):
    def concat_text(x):
        return ' EndEvent '.join(x)

    to_keep = ['date', 'hitterId', 'pitcherId', 'description']
    to_drop = [f for f in events.columns 
               if f not in to_keep]
    events.drop(to_drop, inplace=True, axis=1)
    gc.collect()
    events = events.dropna(subset=['description'])
    
    hitter_events = events.groupby(['date', 'hitterId'])['description'].apply(concat_text)
    pitcher_events = events.groupby(['date', 'pitcherId'])['description'].apply(concat_text)
    
    hitter_events = hitter_events.reset_index()
    pitcher_events = pitcher_events.reset_index()
    
    hitter_events.rename(columns={'hitterId': 'playerId'}, inplace=True)
    pitcher_events.rename(columns={'pitcherId': 'playerId'}, inplace=True)
    
    text_events = pd.concat([hitter_events, pitcher_events], ignore_index=True)
    text_events = text_events.groupby(['date', 'playerId'])['description'].apply(concat_text)
    
    return text_events.reset_index()
    

In [47]:
pd.options.display.max_rows=100
pd.options.display.max_columns=100

In [57]:
events = fields['events'].copy(deep=True)

In [58]:
text_events = ingest_events(events)

In [60]:
text_events

Unnamed: 0,date,playerId,description
0,2018-03-29,134181,Adrian Beltre flies out to center fielder Jake...
1,2018-03-29,400085,Ichiro Suzuki grounds out to first baseman Yon...
2,2018-03-29,400284,Offensive Substitution: Pinch-hitter Chase Utl...
3,2018-03-29,405395,Albert Pujols homers (1) on a fly ball to left...
4,2018-03-29,407822,Mound Visit. EndEvent Pitching Change: Jorge D...
...,...,...,...
23193,2018-05-30,660271,"With Jeimer Candelario batting, Leonys Martin ..."
23194,2018-05-30,664056,Defensive Substitution: Orlando Arcia replaces...
23195,2018-05-30,664068,Pat Venditte is now pitching right-handed. End...
23196,2018-05-30,664701,Pitching Change: Ben Taylor replaces Evan Mars...
