# Testing out the data cleaning and transformation pipeline

In [1]:
import sys
import logging

import pandas as pd

from nba_survival.data.endpoints import (
    PlayByPlay,
    Scoreboard,
    TeamStats,
    GameRotation,
    TeamLineups,
    TeamGameLog
)
from nba_survival.data.core import gen_pipeline

# Route INFO-and-above log records to stdout so they appear in the cell output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
)

## Load the data

### Team rotations

In [2]:
# Rotation data for a single game, read through the GameRotation endpoint.
rotation = GameRotation(
    GameID="0021800006",
    output_dir="../nba-data/2018-19",
)
rotation.load()

# Pull the home and away rotation data sets (home first, then away).
homerotation, awayrotation = (
    rotation.get_data("HomeTeam"),
    rotation.get_data("AwayTeam"),
)

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/gamerotation/data_0021800006.json...


### Team game log

In [3]:
# One game-log endpoint per team: home first, then visitor.
hlog, vlog = (
    TeamGameLog(TeamID=1610612753, output_dir="../nba-data/2018-19"),
    TeamGameLog(TeamID=1610612748, output_dir="../nba-data/2018-19"),
)

# Load in the same home -> visitor order so the log output stays ordered.
hlog.load()
vlog.load()

homegamelog, visitorgamelog = hlog.get_data(), vlog.get_data()

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamgamelog/data_1610612753.json...
INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamgamelog/data_1610612748.json...


### Lineup stats

In [4]:
# Lineup endpoints for both teams (constructed away first, then home).
awaylineup, homelineup = (
    TeamLineups(TeamID=1610612748, output_dir="../nba-data/2018-19"),
    TeamLineups(TeamID=1610612753, output_dir="../nba-data/2018-19"),
)

# The home lineup is loaded before the away lineup.
homelineup.load()
awaylineup.load()

# Extract the "Lineups" data set from each endpoint.
homelineupstats, awaylineupstats = (
    homelineup.get_data("Lineups"),
    awaylineup.get_data("Lineups"),
)

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamdashlineups/data_1610612753.json...
INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamdashlineups/data_1610612748.json...


### Play by play

In [5]:
# Play-by-play events for the same game used in the rotation cell.
pbp = PlayByPlay(
    GameID="0021800006",
    output_dir="../nba-data/2018-19",
)
pbp.load()

# Data returned by the endpoint; passed to the pipeline as `pbp`.
pbp_df = pbp.get_data()

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/playbyplayv2/data_0021800006.json...


### Scoreboard data

In [6]:
# Scoreboard for the game date.
scoreboard = Scoreboard(
    GameDate="10/17/2018",
    output_dir="../nba-data/2018-19",
)
scoreboard.load()

# Two data sets are needed downstream: the game header and the last meeting.
header, last_meeting = (
    scoreboard.get_data("GameHeader"),
    scoreboard.get_data("LastMeeting"),
)

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/scoreboardv2/data_20181017.json...


### Team-level stats

In [7]:
# Season-level team stats for the home and away teams.
homestats, awaystats = (
    TeamStats(TeamID=1610612753, Season="2018-19", output_dir="../nba-data/2018-19"),
    TeamStats(TeamID=1610612748, Season="2018-19", output_dir="../nba-data/2018-19"),
)

# Load home first, then away.
homestats.load()
awaystats.load()

hometeamstats, awayteamstats = homestats.get_data(), awaystats.get_data()

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamestimatedmetrics/data_2018-19.json...
INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamestimatedmetrics/data_2018-19.json...


## Generate the pipeline

In [8]:
# Assemble the transformation flow from the loaded inputs. The per-team frames
# (stats, game logs, lineup stats) are stacked home-then-away into a single
# frame each, with a fresh 0..n-1 index.
flow = gen_pipeline(
    pbp=pbp_df,
    header=header,
    stats=pd.concat([hometeamstats, awayteamstats], ignore_index=True),
    last_meeting=last_meeting,
    gamelog=pd.concat([homegamelog, visitorgamelog], ignore_index=True),
    lineup_stats=pd.concat([homelineupstats, awaylineupstats], ignore_index=True),
    home_rotation=homerotation,
    away_rotation=awayrotation,
)

## Run the pipeline

In [9]:
# Execute the flow; the returned object is kept so that individual task
# results can be looked up from it afterwards.
output = flow.run()

[2021-01-31 05:47:04+0000] INFO - prefect.FlowRunner | Beginning Flow run for 'Transform raw NBA data'
INFO:prefect.FlowRunner:Beginning Flow run for 'Transform raw NBA data'
[2021-01-31 05:47:04+0000] INFO - prefect.TaskRunner | Task 'Add surival time': Starting task run...
INFO:prefect.TaskRunner:Task 'Add surival time': Starting task run...
[2021-01-31 05:47:04+0000] INFO - prefect.TaskRunner | Task 'Add surival time': Finished task run for task with final state: 'Success'
INFO:prefect.TaskRunner:Task 'Add surival time': Finished task run for task with final state: 'Success'
[2021-01-31 05:47:04+0000] INFO - prefect.TaskRunner | Task 'Backfill margin': Starting task run...
INFO:prefect.TaskRunner:Task 'Backfill margin': Starting task run...
[2021-01-31 05:47:04+0000] INFO - prefect.TaskRunner | Task 'Backfill margin': Finished task run for task with final state: 'Success'
INFO:prefect.TaskRunner:Task 'Backfill margin': Finished task run for task with final state: 'Success'
[2021-01-

In [10]:
# Find the task named "Add lineup plus minus" (first match), then pull that
# task's result out of the flow run output.
lineup_task = flow.get_tasks(name="Add lineup plus minus")[0]
final = output.result[lineup_task].result

In [11]:
# Preview the first rows of the transformed output.
final.head()

Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,VISITORDESCRIPTION,...,HOME_W_PCT,VISITOR_W_PCT,HOME_GAMES_IN_LAST_3_DAYS,VISITOR_GAMES_IN_LAST_3_DAYS,HOME_GAMES_IN_LAST_5_DAYS,VISITOR_GAMES_IN_LAST_5_DAYS,HOME_GAMES_IN_LAST_7_DAYS,VISITOR_GAMES_IN_LAST_7_DAYS,HOME_LINEUP_PLUS_MINUS,VISITOR_LINEUP_PLUS_MINUS
0,21800006,2,12,0,1,7:17 PM,12:00,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,-1.0
1,21800006,4,10,0,1,7:17 PM,12:00,Jump Ball Vucevic vs. Whiteside: Tip to McGruder,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,-1.0
2,21800006,74,2,6,1,7:18 PM,11:45,,,MISS McGruder 7' Driving Layup,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,-1.0
3,21800006,75,4,0,1,7:18 PM,11:45,Gordon REBOUND (Off:0 Def:1),,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,-1.0
4,21800006,9,2,1,1,7:18 PM,11:35,MISS Fournier 25' 3PT Jump Shot,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,-1.0
