### NBA Play-By-Play Data: EDA

In [None]:
import os
import numpy as np
import pandas as pd

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)
# None lifts the per-cell width limit so long text values are not truncated.
pd.set_option("display.max_colwidth", None)

# Loads the lab_black IPython extension (presumably auto-formats cells with Black;
# the extension has to be installed in the kernel's environment).
%load_ext lab_black

What would it look like if you plotted the score of a game over time?

In [None]:
# Identifier columns shown next to the player columns when inspecting plays.
id_fields = [
    "GameType",
    "Date",
    "Quarter",
    "SecLeft",
    "AwayTeam",
    "HomeTeam",
]

In [None]:
# Columns that hold a player name; the order here is the column order used
# whenever this list is used to select columns.
player_field = [
    "Shooter", "Assister", "Blocker",
    "Fouler", "Rebounder", "ViolationPlayer",
    "FreeThrowShooter", "TurnoverPlayer", "TurnoverCauser",
]

In [None]:
# Load the 2019-20 play-by-play file.
df = pd.read_csv("NBA_PBP_2019-20.csv")

# Join the away and home play text into a single "Play" column, treating a
# missing description as an empty string (presumably only one side is filled
# in per row).
away_text = df["AwayPlay"].fillna("")
home_text = df["HomePlay"].fillna("")
df["Play"] = away_text + home_text

In [None]:
# Any non-empty FreeThrowNum is replaced by the tag "1"; missing values stay "".
free_throw_tag = df["FreeThrowNum"].fillna("").apply(
    lambda label: "1" if len(label) > 0 else label
)
# "shot" is the ShotType text followed by the free-throw tag (either may be "").
shot_type_text = df["ShotType"].fillna("")
df["shot"] = shot_type_text + free_throw_tag

# Fold the field-goal and free-throw columns into one outcome column ...
field_goal_outcome = df["ShotOutcome"].fillna("")
free_throw_outcome = df["FreeThrowOutcome"].fillna("")
df["outcome"] = field_goal_outcome + free_throw_outcome

# ... and one scorer column.
field_goal_shooter = df["Shooter"].fillna("")
free_throw_shooter = df["FreeThrowShooter"].fillna("")
df["scorer"] = field_goal_shooter + free_throw_shooter

In [None]:
# The first character of "shot" is read as the point value (free-throw rows
# were tagged "1" when "shot" was built); an empty or non-numeric label counts
# as 0. Written as a vectorised expression so this cell does not depend on the
# get_points helper, which is only defined in a later cell and is therefore
# undefined here on a fresh kernel.
df["points"] = (
    pd.to_numeric(df["shot"].str[0], errors="coerce").fillna(0).astype("int64")
)

In [None]:
def get_points(shot):
    """Return the point value encoded in the first character of ``shot``.

    Parameters
    ----------
    shot : str or any
        A shot label whose first character is expected to be a digit.
        Missing values (NaN / None) and empty strings are accepted.

    Returns
    -------
    int
        ``int(shot[0])`` when that is possible, otherwise 0.
    """
    try:
        return int(shot[0])
    except (TypeError, ValueError, LookupError):
        # TypeError: value is not subscriptable (NaN float, None).
        # LookupError: nothing at position 0 (empty string).
        # ValueError: first character is not a digit.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return 0

### Aggregate plays to the game level

In [None]:
# Group the plays by their URL value; a single game is later selected by
# matching this same column, so URL acts as the per-game key.
games = df.groupby("URL")

In [None]:
# NOTE(review): summarise_game is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be run before this one.

# Per-game header fields, each reduced with the group maximum.
game_columns = [
    "GameType",
    "HomeTeam",
    "AwayTeam",
    "HomeScore",
    "AwayScore",
    "Date",
    "Time",
]
# "max" is passed by name: handing the builtin max to groupby.agg is deprecated
# in recent pandas and the string keeps the same reduction.
game_info = games.agg({column: "max" for column in game_columns})

# summarise_game returns a one-row frame per game, so apply() produces a
# two-level (URL, row number) index; drop the inner level to index by URL only.
game_leaders = games.apply(summarise_game).droplevel(1)

summary = game_info.merge(game_leaders, left_index=True, right_index=True)

# to_csv does not create missing directories.
os.makedirs("data", exist_ok=True)
summary.to_csv("data/2019_20_season_games.csv")

In [None]:
# The 20 players who appear most often as a game's top scorer.
summary["TopScorer"].value_counts().head(20)

In [None]:
# Take an independent copy of the plays of one game, selected by its URL.
game_id = "/boxscores/202010110MIA.html"
game = df.loc[df["URL"] == game_id].copy()

In [None]:
def _leader(totals):
    """Return ``(label, value)`` of the largest entry in ``totals``, or ``(None, 0)`` if it is empty."""
    if totals.empty:
        return None, 0
    ranked = totals.sort_values(ascending=False)
    # .iloc is explicitly positional; ranked[0] on a name-labelled index relies
    # on pandas' deprecated integer-key fallback.
    return ranked.index[0], ranked.iloc[0]


def summarise_game(plays):
    """Summarise a set of plays into its leading scorer, assister and rebounder.

    Parameters
    ----------
    plays : pandas.DataFrame
        Play rows. Columns read: ``outcome``, ``scorer``, ``points``,
        ``Assister``, ``ShotOutcome``, ``Rebounder`` and ``ReboundType``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns ``TopScorer``,
        ``TopScorerPoints``, ``TopAssister``, ``TopAssisterTotal``,
        ``TopRebounder`` and ``TopRebounderTotal``. A category with no
        qualifying rows yields a missing name and a total of 0.
    """
    # Leading scorer: points summed over rows whose outcome is 'make'.
    top_scorer, top_scorer_total = _leader(
        plays.query("outcome == 'make'").groupby("scorer")["points"].sum()
    )

    # Leading assister: number of non-null ShotOutcome rows per Assister.
    top_assister, top_assist_total = _leader(
        plays.groupby("Assister")["ShotOutcome"].count()
    )

    # Leading rebounder: rows credited to 'Team' are excluded.
    top_rebounder, top_rebound_total = _leader(
        plays.query("Rebounder != 'Team'").groupby("Rebounder")["ReboundType"].count()
    )

    # Built index-oriented and transposed so the result is one row whose
    # columns are the keys below.
    box_score_summary = pd.DataFrame.from_dict(
        {
            "TopScorer": top_scorer,
            "TopScorerPoints": top_scorer_total,
            "TopAssister": top_assister,
            "TopAssisterTotal": top_assist_total,
            "TopRebounder": top_rebounder,
            "TopRebounderTotal": top_rebound_total,
        },
        orient="index",
    ).T

    return box_score_summary

In [None]:
# Points for the selected game, taken from the first character of ShotType.
# NOTE(review): this overwrites "points" using ShotType alone, so rows with no
# ShotType (get_points returns 0 for missing values) score 0 here, whereas
# df["points"] was derived from the combined "shot" column that also tags free
# throws — confirm that dropping free-throw points is intended.
game["points"] = game["ShotType"].apply(get_points)

In [None]:
# Made shots only: the self-comparison is False for NaN, so it removes rows
# without a ShotType before the outcome filter is applied.
shot_columns = ["HomeScore", "AwayScore", "Shooter", "ShotType", "ShotOutcome", "points"]
game.query("ShotType == ShotType and ShotOutcome == 'make'")[shot_columns].head()

In [None]:
# Identifier and player columns for rows that have an assister:
# `Assister == Assister` is False only for NaN, so rows without one are dropped.
df[id_fields + player_field].query("Assister == Assister")

In [None]:
# First five rows, every column except the first one.
df.iloc[:5, 1:]

In [None]:
# Columns whose name contains the substring "er".
df.filter(like="er")

In [None]:
# Last five rows of the frame.
df.tail()

In [None]:
# Columns whose name contains the substring "Player".
df.filter(like="Player")

In [None]:
# First five rows of the frame.
df.head()

In [None]:
# Load the 2020-21 play-by-play file. This rebinds `df`, so the 2019-20 frame
# and the columns derived on it are no longer reachable through that name.
# "Play" joins the away and home play text, with missing text treated as "".
df = pd.read_csv("NBA_PBP_2020-21.csv").assign(
    Play=lambda pbp: pbp["AwayPlay"].fillna("") + pbp["HomePlay"].fillna("")
)

In [None]:
# TODO: this could be turned into an event stream; it only needs a unique identifier for each player.

In [None]:
# Print the first 110 plays as "Q<quarter> - MM:SS - <play text>", where MM:SS
# is SecLeft split into minutes and seconds, each zero-padded to two characters.
for _, play_row in df.head(110).iterrows():
    minutes, seconds = divmod(play_row["SecLeft"], 60)
    clock = ":".join(str(part).zfill(2) for part in (minutes, seconds))
    print(f"Q{play_row['Quarter']} - {clock} - {play_row['Play']}")