# Testing out the data cleaning and transformation pipeline

In [1]:
import sys
import logging

import pandas as pd

from nba_survival.data.endpoints import (
    PlayByPlay,
    Scoreboard,
    TeamStats,
    GameRotation,
    TeamLineups,
    TeamGameLog
)
from nba_survival.data.core import gen_pipeline

# Send INFO-and-above log records to stdout so they appear in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

## Load the data

### Team rotations

In [2]:
# Rotation data for game 0021800359, split into home and away tables.
rotation = GameRotation(
    output_dir="../nba-data/2018-19",
    GameID="0021800359",
)
rotation.load()

homerotation, awayrotation = (
    rotation.get_data(dataset) for dataset in ("HomeTeam", "AwayTeam")
)

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/gamerotation/data_0021800359.json...


In [3]:
# Preview the first ten home-team rotation rows.
homerotation.head(10)

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_CITY,TEAM_NAME,PERSON_ID,PLAYER_FIRST,PLAYER_LAST,IN_TIME_REAL,OUT_TIME_REAL,PLAYER_PTS,PT_DIFF,USG_PCT
0,21800359,1610612761,Toronto,Raptors,101139,CJ,Miles,5880.0,7200.0,0,-2.0,0.0
1,21800359,1610612761,Toronto,Raptors,101139,CJ,Miles,7550.0,9490.0,0,2.0,0.077
2,21800359,1610612761,Toronto,Raptors,101139,CJ,Miles,22260.0,25190.0,0,8.0,0.083
3,21800359,1610612761,Toronto,Raptors,200768,Kyle,Lowry,0.0,4770.0,0,-6.0,0.2
4,21800359,1610612761,Toronto,Raptors,200768,Kyle,Lowry,7200.0,18990.0,4,8.0,0.138
5,21800359,1610612761,Toronto,Raptors,200768,Kyle,Lowry,25190.0,28268.0,3,3.0,0.091
6,21800359,1610612761,Toronto,Raptors,201586,Serge,Ibaka,0.0,4770.0,4,-6.0,0.3
7,21800359,1610612761,Toronto,Raptors,201586,Serge,Ibaka,10230.0,20100.0,14,2.0,0.217
8,21800359,1610612761,Toronto,Raptors,201980,Danny,Green,0.0,5880.0,0,-4.0,0.087
9,21800359,1610612761,Toronto,Raptors,201980,Danny,Green,10650.0,20410.0,5,6.0,0.152


### Team game log

In [4]:
# Game logs for both teams in the matchup: home first (1610612761),
# then visitor (1610612755).
hlog, vlog = (
    TeamGameLog(TeamID=team_id, output_dir="../nba-data/2018-19")
    for team_id in (1610612761, 1610612755)
)
for team_log in (hlog, vlog):
    team_log.load()

homegamelog, visitorgamelog = hlog.get_data(), vlog.get_data()

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamgamelog/data_1610612761.json...
INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamgamelog/data_1610612755.json...


In [5]:
# Preview the first ten rows of the home team's game log.
homegamelog.head(10)

Unnamed: 0,Team_ID,Game_ID,GAME_DATE,MATCHUP,WL,W,L,W_PCT,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS
0,1610612761,21801214,"APR 09, 2019",TOR @ MIN,W,58,24,0.707,240,46,...,0.706,7,47,54,24,10,5,18,17,120
1,1610612761,21801195,"APR 07, 2019",TOR vs. MIA,W,57,24,0.704,265,39,...,0.852,4,43,47,26,7,5,11,17,117
2,1610612761,21801180,"APR 05, 2019",TOR @ CHA,L,56,24,0.7,240,43,...,0.8,5,38,43,27,2,5,14,17,111
3,1610612761,21801169,"APR 03, 2019",TOR @ BKN,W,56,23,0.709,240,44,...,0.857,17,35,52,29,9,1,7,16,115
4,1610612761,21801156,"APR 01, 2019",TOR vs. ORL,W,55,23,0.705,240,42,...,0.818,6,33,39,31,9,8,14,20,121
5,1610612761,21801142,"MAR 30, 2019",TOR @ CHI,W,54,23,0.701,240,46,...,0.778,6,41,47,35,10,4,11,15,124
6,1610612761,21801125,"MAR 28, 2019",TOR @ NYK,W,53,23,0.697,240,40,...,0.773,6,38,44,32,8,6,8,28,117
7,1610612761,21801110,"MAR 26, 2019",TOR vs. CHI,W,52,23,0.693,240,42,...,1.0,2,37,39,28,13,7,15,16,112
8,1610612761,21801098,"MAR 24, 2019",TOR vs. CHA,L,51,23,0.689,240,45,...,1.0,3,32,35,29,6,11,13,17,114
9,1610612761,21801083,"MAR 22, 2019",TOR vs. OKC,L,51,22,0.699,240,39,...,0.857,11,35,46,20,7,7,21,24,109


### Lineup stats

In [6]:
# Lineup dashboards for the visitor (1610612755) and home (1610612761) teams.
awaylineup, homelineup = (
    TeamLineups(TeamID=team_id, output_dir="../nba-data/2018-19")
    for team_id in (1610612755, 1610612761)
)
for lineup in (homelineup, awaylineup):
    lineup.load()

# Only the "Lineups" table of each dashboard is used downstream.
homelineupstats, awaylineupstats = (
    dashboard.get_data("Lineups") for dashboard in (homelineup, awaylineup)
)

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamdashlineups/data_1610612761.json...
INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamdashlineups/data_1610612755.json...


In [7]:
# Preview the first ten home-team lineup rows.
homelineupstats.head(10)

Unnamed: 0,GROUP_SET,GROUP_ID,GROUP_NAME,GP,W,L,W_PCT,MIN,FGM,FGA,...,REB_RANK,AST_RANK,TOV_RANK,STL_RANK,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK
0,Lineups,-200768-201586-201980-1627775-1627783-,K. Lowry - S. Ibaka - D. Green - P. McCaw - P....,1,1,0,1.0,20.3,21.0,32.0,...,1,1,763,11,1,1,754,3,1,1
1,Lineups,-200768-201586-201980-202695-1627783-,K. Lowry - S. Ibaka - D. Green - K. Leonard - ...,33,24,9,0.727,16.6,15.1,31.1,...,2,4,762,6,12,748,760,2,3,66
2,Lineups,-200768-201188-201980-1627783-1627832-,K. Lowry - M. Gasol - D. Green - P. Siakam - F...,2,2,0,1.0,14.9,16.5,29.0,...,5,2,732,2,4,693,763,4,2,2
3,Lineups,-200768-201188-201586-201980-1627832-,K. Lowry - M. Gasol - S. Ibaka - D. Green - F....,1,1,0,1.0,12.7,13.0,24.0,...,7,3,752,11,189,693,740,12,5,10
4,Lineups,-200768-201980-202685-202695-1627783-,K. Lowry - D. Green - J. Valanciunas - K. Leon...,12,9,3,0.75,11.7,11.4,21.6,...,8,8,750,10,91,687,753,6,6,26
5,Lineups,-200768-201188-201980-202695-1627783-,K. Lowry - M. Gasol - D. Green - K. Leonard - ...,14,10,4,0.714,11.5,11.1,20.8,...,12,6,735,9,90,676,734,11,7,59
6,Lineups,-201586-201980-202695-1627783-1627832-,S. Ibaka - D. Green - K. Leonard - P. Siakam -...,17,13,4,0.765,11.0,9.8,19.2,...,10,13,734,130,18,686,752,10,8,58
7,Lineups,-201975-202391-1626181-1628384-1628449-,J. Meeks - J. Lin - N. Powell - O. Anunoby - C...,1,1,0,1.0,10.9,14.0,23.0,...,3,5,698,11,22,763,761,12,4,3
8,Lineups,-200768-201980-202328-1627783-1628384-,K. Lowry - D. Green - G. Monroe - P. Siakam - ...,1,0,1,0.0,10.7,8.0,21.0,...,11,40,752,249,189,762,710,1,11,711
9,Lineups,-201586-202391-1626181-1627832-1628384-,S. Ibaka - J. Lin - N. Powell - F. VanVleet - ...,1,1,0,1.0,10.0,7.0,18.0,...,13,14,698,42,4,1,710,12,15,630


### Play by play

In [8]:
# Play-by-play events for the same game as the rotation data above.
pbp = PlayByPlay(
    output_dir="../nba-data/2018-19",
    GameID="0021800359",
)
pbp.load()
pbp_df = pbp.get_data()

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/playbyplayv2/data_0021800359.json...


In [9]:
# Preview the first ten play-by-play events.
pbp_df.head(10)

Unnamed: 0,GAME_ID,EVENTNUM,EVENTMSGTYPE,EVENTMSGACTIONTYPE,PERIOD,WCTIMESTRING,PCTIMESTRING,HOMEDESCRIPTION,NEUTRALDESCRIPTION,VISITORDESCRIPTION,...,PLAYER2_TEAM_NICKNAME,PLAYER2_TEAM_ABBREVIATION,PERSON3TYPE,PLAYER3_ID,PLAYER3_NAME,PLAYER3_TEAM_ID,PLAYER3_TEAM_CITY,PLAYER3_TEAM_NICKNAME,PLAYER3_TEAM_ABBREVIATION,VIDEO_AVAILABLE_FLAG
0,21800359,2,12,0,1,8:16 PM,12:00,,,,...,,,0,0,,,,,,0
1,21800359,4,10,0,1,8:16 PM,12:00,Jump Ball Ibaka vs. Embiid: Tip to Simmons,,,...,76ers,PHI,5,1627732,Ben Simmons,1610613000.0,Philadelphia,76ers,PHI,1
2,21800359,8,1,58,1,8:17 PM,11:38,,,Embiid 8' Turnaround Hook Shot (2 PTS),...,,,0,0,,,,,,1
3,21800359,9,2,1,1,8:17 PM,11:24,MISS Lowry 27' 3PT Jump Shot,,,...,,,0,0,,,,,,1
4,21800359,10,4,0,1,8:17 PM,11:22,,,Embiid REBOUND (Off:0 Def:1),...,,,0,0,,,,,,1
5,21800359,11,2,2,1,8:17 PM,11:19,,,MISS Redick 25' 3PT Running Jump Shot,...,,,0,0,,,,,,1
6,21800359,12,4,0,1,8:17 PM,11:16,Leonard REBOUND (Off:0 Def:1),,,...,,,0,0,,,,,,1
7,21800359,13,2,79,1,8:18 PM,10:53,MISS Ibaka 21' Pullup Jump Shot,,,...,,,0,0,,,,,,1
8,21800359,15,4,0,1,8:18 PM,10:49,Siakam REBOUND (Off:1 Def:0),,,...,,,0,0,,,,,,1
9,21800359,16,2,72,1,8:18 PM,10:48,MISS Siakam 2' Putback Layup,,,...,,,0,0,,,,,,1


### Scoreboard data

In [10]:
# Scoreboard for the game date; it covers every game played that day.
scoreboard = Scoreboard(
    output_dir="../nba-data/2018-19",
    GameDate="12/05/2018",
)
scoreboard.load()

header, last_meeting = (
    scoreboard.get_data(table) for table in ("GameHeader", "LastMeeting")
)

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/scoreboardv2/data_20181205.json...


In [11]:
# Preview the game header rows for the date.
header.head(10)

Unnamed: 0,GAME_DATE_EST,GAME_SEQUENCE,GAME_ID,GAME_STATUS_ID,GAME_STATUS_TEXT,GAMECODE,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,LIVE_PERIOD,LIVE_PC_TIME,NATL_TV_BROADCASTER_ABBREVIATION,HOME_TV_BROADCASTER_ABBREVIATION,AWAY_TV_BROADCASTER_ABBREVIATION,LIVE_PERIOD_TIME_BCAST,ARENA_NAME,WH_STATUS
0,2018-12-05T00:00:00,1,21800355,3,Final,20181205/GSWCLE,1610612739,1610612744,2018,4,,,FSO,NBCSBA,Q4 -,Quicken Loans Arena,1
1,2018-12-05T00:00:00,2,21800356,3,Final,20181205/DENORL,1610612753,1610612743,2018,5,,,FSFL,ALT,Q5 -,Amway Center,1
2,2018-12-05T00:00:00,3,21800357,3,Final,20181205/WASATL,1610612737,1610612764,2018,4,,,FSSE-ATL,NBCSWA,Q4 -,State Farm Arena,1
3,2018-12-05T00:00:00,4,21800358,3,Final,20181205/OKCBKN,1610612751,1610612760,2018,4,,,YES,FSOK,Q4 -,Barclays Center,1
4,2018-12-05T00:00:00,5,21800359,3,Final,20181205/PHITOR,1610612761,1610612755,2018,4,,ESPN,SN1+RDS2,NBCSP,Q4 - ESPN,Scotiabank Arena,1
5,2018-12-05T00:00:00,6,21800360,3,Final,20181205/LACMEM,1610612763,1610612746,2018,4,,,FSSE-MEM,FSPT,Q4 -,FedExForum,1
6,2018-12-05T00:00:00,7,21800361,3,Final,20181205/DETMIL,1610612749,1610612765,2018,4,,,FSWIS,FSD,Q4 -,Fiserv Forum,1
7,2018-12-05T00:00:00,8,21800362,3,Final,20181205/CHAMIN,1610612750,1610612766,2018,4,,,FSNTH,FSSE-CHA,Q4 -,Target Center,1
8,2018-12-05T00:00:00,9,21800363,3,Final,20181205/DALNOP,1610612740,1610612742,2018,4,,,FSNO,FSSW-DAL,Q4 -,Smoothie King Center,1
9,2018-12-05T00:00:00,10,21800364,3,Final,20181205/SASLAL,1610612747,1610612759,2018,4,,ESPN,SPECSN,,Q4 - ESPN,STAPLES Center,1


In [12]:
# Preview the last-meeting rows for each game on the date.
last_meeting.head(10)

Unnamed: 0,GAME_ID,LAST_GAME_ID,LAST_GAME_DATE_EST,LAST_GAME_HOME_TEAM_ID,LAST_GAME_HOME_TEAM_CITY,LAST_GAME_HOME_TEAM_NAME,LAST_GAME_HOME_TEAM_ABBREVIATION,LAST_GAME_HOME_TEAM_POINTS,LAST_GAME_VISITOR_TEAM_ID,LAST_GAME_VISITOR_TEAM_CITY,LAST_GAME_VISITOR_TEAM_NAME,LAST_GAME_VISITOR_TEAM_CITY1,LAST_GAME_VISITOR_TEAM_POINTS
0,21800355,41700404,2018-06-08T00:00:00,1610612739,Cleveland,Cavaliers,CLE,85,1610612744,Golden State,Warriors,GSW,108
1,21800356,21800276,2018-11-23T00:00:00,1610612753,Orlando,Magic,ORL,87,1610612743,Denver,Nuggets,DEN,112
2,21800357,21701184,2018-04-06T00:00:00,1610612737,Atlanta,Hawks,ATL,103,1610612764,Washington,Wizards,WAS,97
3,21800358,21700697,2018-01-23T00:00:00,1610612751,Brooklyn,Nets,BKN,108,1610612760,Oklahoma City,Thunder,OKC,109
4,21800359,21800100,2018-10-30T00:00:00,1610612761,Toronto,Raptors,TOR,129,1610612755,Philadelphia,76ers,PHI,112
5,21800360,21800266,2018-11-23T00:00:00,1610612763,Memphis,Grizzlies,MEM,107,1610612746,LA,Clippers,LAC,112
6,21800361,21700913,2018-02-28T00:00:00,1610612749,Milwaukee,Bucks,MIL,87,1610612765,Detroit,Pistons,DET,110
7,21800362,21700241,2017-11-20T00:00:00,1610612750,Minnesota,Timberwolves,MIN,102,1610612766,Charlotte,Hornets,CHA,118
8,21800363,21701061,2018-03-20T00:00:00,1610612740,New Orleans,Pelicans,NOP,115,1610612742,Dallas,Mavericks,DAL,105
9,21800364,21800082,2018-10-27T00:00:00,1610612747,Los Angeles,Lakers,LAL,106,1610612759,San Antonio,Spurs,SAS,110


### Team-level stats

In [13]:
# Season-level estimated metrics, requested for the home (1610612761) and
# visitor (1610612755) teams.
# NOTE(review): the log output shows both loads reading the same
# season-wide file, and the preview of `hometeamstats` lists every team --
# confirm whether `TeamID` is expected to filter the result.
homestats, awaystats = (
    TeamStats(TeamID=team_id, Season="2018-19", output_dir="../nba-data/2018-19")
    for team_id in (1610612761, 1610612755)
)
for team_stats in (homestats, awaystats):
    team_stats.load()

hometeamstats, awayteamstats = homestats.get_data(), awaystats.get_data()

INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamestimatedmetrics/data_2018-19.json...
INFO:nba_survival.data.endpoints.base:Reading existing file ../nba-data/2018-19/teamestimatedmetrics/data_2018-19.json...


In [14]:
# Preview the first ten rows of the team-level stats.
hometeamstats.head(10)

Unnamed: 0,TEAM_NAME,TEAM_ID,GP,W,L,W_PCT,MIN,E_OFF_RATING,E_DEF_RATING,E_NET_RATING,...,MIN_RANK,E_OFF_RATING_RANK,E_DEF_RATING_RANK,E_NET_RATING_RANK,E_AST_RATIO_RANK,E_OREB_PCT_RANK,E_DREB_PCT_RANK,E_REB_PCT_RANK,E_TM_TOV_PCT_RANK,E_PACE_RANK
0,Atlanta Hawks,1610612737,82,29,53,0.354,3971.0,105.5,111.2,-5.7,...,7,23,27,26,16,6,20,16,30,1
1,New Orleans Pelicans,1610612740,82,33,49,0.402,3951.0,108.6,110.4,-1.8,...,24,12,24,20,7,9,19,9,22,2
2,Los Angeles Lakers,1610612747,82,37,45,0.451,3956.0,105.2,107.3,-2.1,...,18,24,15,23,14,17,22,18,28,3
3,Sacramento Kings,1610612758,82,39,43,0.476,3946.0,108.3,108.7,-0.4,...,27,16,20,16,15,12,26,24,5,3
4,Milwaukee Bucks,1610612749,82,60,22,0.732,3956.0,111.5,103.0,8.5,...,18,3,1,1,10,27,1,3,6,5
5,Oklahoma City Thunder,1610612760,82,49,33,0.598,3971.0,107.6,104.7,2.9,...,7,17,5,11,27,3,7,6,8,6
6,LA Clippers,1610612746,82,48,34,0.585,3966.0,109.7,109.0,0.7,...,11,10,21,14,20,19,25,17,20,7
7,Philadelphia 76ers,1610612755,82,51,31,0.622,3961.0,110.4,107.1,3.3,...,15,7,14,7,5,8,6,2,24,8
8,Washington Wizards,1610612764,82,32,50,0.39,3986.0,108.6,111.3,-2.7,...,2,13,28,25,9,25,29,29,11,9
9,Brooklyn Nets,1610612751,82,42,40,0.512,3996.0,106.9,107.0,-0.1,...,1,19,13,15,23,10,21,15,25,10


## Generate the pipeline

In [15]:
# Assemble the transformation flow from the raw tables loaded above.
#
# Both TeamStats objects read the same league-wide file, so each of
# `hometeamstats` and `awayteamstats` already holds every team. Concatenating
# them as-is would give the pipeline two identical rows per team, so exact
# duplicate rows are dropped before the frame is passed in.
team_stats_all = (
    pd.concat([hometeamstats, awayteamstats])
    .drop_duplicates()
    .reset_index(drop=True)
)

flow = gen_pipeline(
    pbp=pbp_df,
    header=header,
    stats=team_stats_all,
    last_meeting=last_meeting,
    # Game logs and lineup stats are per team, so a plain concat is enough.
    gamelog=pd.concat([homegamelog, visitorgamelog], ignore_index=True),
    lineup_stats=pd.concat([homelineupstats, awaylineupstats], ignore_index=True),
    home_rotation=homerotation,
    away_rotation=awayrotation,
)

## Run the pipeline

In [16]:
# Execute every task in the flow locally; the log output below shows Prefect's
# FlowRunner/TaskRunner progress. `output` is the run's final state object,
# whose per-task results are read in the next cell.
output = flow.run()

[2021-01-31 06:04:29+0000] INFO - prefect.FlowRunner | Beginning Flow run for 'Transform raw NBA data'
INFO:prefect.FlowRunner:Beginning Flow run for 'Transform raw NBA data'
[2021-01-31 06:04:29+0000] INFO - prefect.TaskRunner | Task 'Add surival time': Starting task run...
INFO:prefect.TaskRunner:Task 'Add surival time': Starting task run...
[2021-01-31 06:04:29+0000] INFO - prefect.TaskRunner | Task 'Add surival time': Finished task run for task with final state: 'Success'
INFO:prefect.TaskRunner:Task 'Add surival time': Finished task run for task with final state: 'Success'
[2021-01-31 06:04:29+0000] INFO - prefect.TaskRunner | Task 'Backfill margin': Starting task run...
INFO:prefect.TaskRunner:Task 'Backfill margin': Starting task run...
[2021-01-31 06:04:29+0000] INFO - prefect.TaskRunner | Task 'Backfill margin': Finished task run for task with final state: 'Success'
INFO:prefect.TaskRunner:Task 'Backfill margin': Finished task run for task with final state: 'Success'
[2021-01-

In [17]:
# Look up the "Add lineup plus minus" task and take its result from the
# flow run; this is the frame inspected in the cells below.
lineup_task = flow.get_tasks(name="Add lineup plus minus")[0]
final = output.result[lineup_task].result

In [18]:
# Columns of the transformed frame to inspect, grouped by theme.
features = [
    # In-game state
    "TIME", "SCOREMARGIN",
    # Lineup quality
    "HOME_LINEUP_PLUS_MINUS", "VISITOR_LINEUP_PLUS_MINUS",
    # Team strength
    "HOME_NET_RATING", "VISITOR_NET_RATING",
    "HOME_W_PCT", "VISITOR_W_PCT",
    # Head-to-head history
    "LAST_GAME_WIN",
    # Schedule density
    "HOME_GAMES_IN_LAST_3_DAYS", "VISITOR_GAMES_IN_LAST_3_DAYS",
    "HOME_GAMES_IN_LAST_5_DAYS", "VISITOR_GAMES_IN_LAST_5_DAYS",
    "HOME_GAMES_IN_LAST_7_DAYS", "VISITOR_GAMES_IN_LAST_7_DAYS",
]

In [19]:
# Preview the selected feature columns of the transformed frame.
final[features].head(10)

Unnamed: 0,TIME,SCOREMARGIN,HOME_LINEUP_PLUS_MINUS,VISITOR_LINEUP_PLUS_MINUS,HOME_NET_RATING,VISITOR_NET_RATING,HOME_W_PCT,VISITOR_W_PCT,LAST_GAME_WIN,HOME_GAMES_IN_LAST_3_DAYS,VISITOR_GAMES_IN_LAST_3_DAYS,HOME_GAMES_IN_LAST_5_DAYS,VISITOR_GAMES_IN_LAST_5_DAYS,HOME_GAMES_IN_LAST_7_DAYS,VISITOR_GAMES_IN_LAST_7_DAYS
0,0,0,3.2,3.6,6.5,3.3,0.8,0.68,1,1.0,0.0,2.0,1.0,3.0,2.0
1,0,0,3.2,3.6,6.5,3.3,0.8,0.68,1,1.0,0.0,2.0,1.0,3.0,2.0
2,22,-2,3.2,3.6,6.5,3.3,0.8,0.68,1,1.0,0.0,2.0,1.0,3.0,2.0
3,36,-2,3.2,3.6,6.5,3.3,0.8,0.68,1,1.0,0.0,2.0,1.0,3.0,2.0
4,38,-2,3.2,3.6,6.5,3.3,0.8,0.68,1,1.0,0.0,2.0,1.0,3.0,2.0
5,41,-2,3.2,3.6,6.5,3.3,0.8,0.68,1,1.0,0.0,2.0,1.0,3.0,2.0
6,44,-2,3.2,3.6,6.5,3.3,0.8,0.68,1,1.0,0.0,2.0,1.0,3.0,2.0
7,67,-2,3.2,3.6,6.5,3.3,0.8,0.68,1,1.0,0.0,2.0,1.0,3.0,2.0
8,71,-2,3.2,3.6,6.5,3.3,0.8,0.68,1,1.0,0.0,2.0,1.0,3.0,2.0
9,72,-2,3.2,3.6,6.5,3.3,0.8,0.68,1,1.0,0.0,2.0,1.0,3.0,2.0
