In [None]:
%reload_ext autoreload
%autoreload 2
import os
import sys
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go

sys.path.insert(0, "./../../src/")
from data_loader import load_data

## Loading the data

In [None]:
# Load the data set through the project's `load_data` helper (imported from
# `data_loader`). Later cells index the result like a pandas DataFrame, using
# columns such as "season_name", "GAME_ID", "EVENTNUM" and "WCTIMESTRING".
# NOTE(review): what load_data reads and how it combines files is not visible
# here -- check src/data_loader.py.
seasons = load_data()

In [None]:
# Debugging aid, left disabled: read a single raw play-by-play CSV directly
# instead of going through load_data().
# tmp_df = pd.read_csv("./../../data/raw/2017-18_pbp.csv")
# display(tmp_df)

# Render the loaded frame as a first sanity check.
display(seasons)

In [None]:
# Restrict the exploration below to a single season.
is_2018_19 = seasons["season_name"].eq("2018-19")
seasons_subset = seasons.loc[is_2018_19]

# Preview the first rows of the selected season.
display(seasons_subset.head())

## Visualizations 
Below you will find a visualization for each of the features found in the data set.

In [None]:
# Distribution of the EVENTMSGTYPE column for the selected season.
event_type_trace = go.Histogram(x=seasons_subset["EVENTMSGTYPE"])

fig = go.Figure(data=[event_type_trace])
fig.update_layout(
    title_text="Histogram of Event Types",
    xaxis_title="Event type name",
    yaxis_title="Frequency",
)
fig.show()

In [None]:
# Distribution of the EVENTMSGACTIONTYPE column for the selected season.
event_action_type_trace = go.Histogram(x=seasons_subset["EVENTMSGACTIONTYPE"])

fig = go.Figure(data=[event_action_type_trace])
fig.update_layout(
    title_text="Histogram of Event Action Types",
    xaxis_title="Event action type name",
    yaxis_title="Frequency",
)
fig.show()

In [None]:
# Number of (non-null) EVENTNUM entries recorded for each game.
# Selecting the column *before* counting avoids counting every other column of
# the play-by-play frame only to throw those counts away; the resulting Series
# (indexed by GAME_ID, named "EVENTNUM") is the same.
events_per_game_count = seasons_subset.groupby("GAME_ID")["EVENTNUM"].count()

# Histogram of the number of events per game
fig = go.Figure()
fig.add_trace(go.Histogram(x=events_per_game_count))
fig.update_layout(
    title_text="Histogram of events per game",
    xaxis_title="Number of events",
    yaxis_title="Count",
)
fig.show()

# Median and standard deviation of the per-game event counts, in that order.
print(events_per_game_count.median(), events_per_game_count.std())

## Finding the average game duration
As well as the shortest and longest games

In [None]:
# Number of non-null WCTIMESTRING entries per game.
# NOTE(review): despite its name this is a row count, not a duration, and no
# visible later cell reads it -- consider renaming or dropping it.
# The column is selected before counting so only that one column is aggregated.
duration_of_each_game = seasons_subset.groupby("GAME_ID")["WCTIMESTRING"].count()

In [None]:
# Split the rows by whether WCTIMESTRING is a well-formed 12-hour wall-clock
# time ("H:MM AM/PM"), then rewrite the well-formed ones as 24-hour "HH:MM".

# Raw string, so `\s` reaches the regex engine instead of being an invalid
# Python escape sequence. Hours are limited to 1-12 (optionally zero-padded)
# because that is all `%I` can parse; an hour of 0/00 would otherwise pass the
# check here and then make the conversion below raise.
correct_time_format_regex = r"^\s?(?:0[1-9]|1[0-2]|[1-9]):[0-5][0-9]\s?(?:AM|PM|am|pm)$"

# na=False: a missing timestamp counts as malformed. Without it the mask
# contains NaN and boolean indexing rejects it.
correct_t_mask = seasons["WCTIMESTRING"].str.match(correct_time_format_regex, na=False)

# .copy() gives an independent frame, so the column assignment below does not
# write to a slice of `seasons` (SettingWithCopyWarning).
correct_t_df = seasons[correct_t_mask].copy()
wrong_t_df = seasons[~correct_t_mask]

print(
    f"The number of wrong entries in df {len(wrong_t_df['WCTIMESTRING'])}"
    f", of those unique {len(wrong_t_df['WCTIMESTRING'].unique())}"
)
display(correct_t_df)

# convert times into 24h format
# The pattern above tolerates optional surrounding whitespace and a missing
# space before AM/PM; normalise to exactly "H:MM AM" first so every accepted
# value matches the strptime format.
normalized_times = (
    correct_t_df["WCTIMESTRING"]
    .str.strip()
    .str.replace(r"\s*(AM|PM|am|pm)$", r" \1", regex=True)
)
correct_t_df["WCTIMESTRING"] = pd.to_datetime(
    normalized_times, format="%I:%M %p"
).dt.strftime("%H:%M")

In [None]:
# Collect each game's time strings into one list per GAME_ID. The rows are
# sorted by EVENTNUM first, and groupby keeps the row order within each group,
# so every list follows event order.
events_in_order = correct_t_df.sort_values("EVENTNUM")
grouped_time_df = events_in_order.groupby(["GAME_ID"]).agg({'WCTIMESTRING': list})
grouped_time_df

In [None]:
def _game_start_end(time_strings):
    """Return ``(start, end)`` timestamps for one game's wall-clock times.

    Parameters
    ----------
    time_strings : list of str
        Zero-padded 24-hour "HH:MM" strings belonging to a single game.

    Returns
    -------
    tuple of pandas.Timestamp
        Earliest and latest time of the game. Both are parsed with the
        "%H:%M" format (so they carry pandas' default date); ``end`` is moved
        one day forward when the game is judged to have crossed midnight.

    Notes
    -----
    Deliberately crude handling of edge cases and wrong values: times between
    "10:59" and "23:59" (string comparison, which is safe for zero-padded
    "HH:MM") are treated as the same day, everything else as after midnight,
    and the two groups are sorted separately so after-midnight times end up
    last.
    """
    until_midnight = []
    after_midnight = []
    for time_str in time_strings:
        if "10:59" <= time_str <= "23:59":
            until_midnight.append(time_str)
        else:
            after_midnight.append(time_str)
    ordered = sorted(until_midnight) + sorted(after_midnight)

    start_time = pd.to_datetime(ordered[0], format="%H:%M")
    end_time = pd.to_datetime(ordered[-1], format="%H:%M")

    if start_time.hour > 10 and end_time.hour < 10:
        # add one day, if clock goes over midnight
        end_time = end_time + timedelta(hours=24)
    return start_time, end_time


# Compute all start/end pairs first and assign each column in one go. This
# creates proper datetime columns directly instead of pre-filling float NaN
# columns and writing Timestamps into them cell by cell, which relies on an
# implicit dtype change that recent pandas versions warn about.
game_bounds = [_game_start_end(times) for times in grouped_time_df["WCTIMESTRING"]]
grouped_time_df["game_start_time"] = pd.to_datetime([start for start, _ in game_bounds])
grouped_time_df["game_end_time"] = pd.to_datetime([end for _, end in game_bounds])

# Duration as whole minutes (fractional part truncated by the int cast).
grouped_time_df["game_duration"] = (
    (grouped_time_df["game_end_time"] - grouped_time_df["game_start_time"])
    .dt.total_seconds()
    .div(60)
    .astype(int)
)
display(grouped_time_df)

In [None]:
# Drop implausible durations: keep only games lasting more than 0 and fewer
# than 300 minutes, since no game lasts that long.
# NOTE(review): this rebinds grouped_time_df, so every later cell only sees
# the filtered games.
duration_minutes = grouped_time_df["game_duration"]
grouped_time_df = grouped_time_df[duration_minutes.gt(0) & duration_minutes.lt(300)]

# Histogram of the remaining game durations
duration_trace = go.Histogram(x=grouped_time_df["game_duration"])
fig = go.Figure(data=[duration_trace])
fig.update_layout(
    title_text="Distribution of game durations",
    xaxis_title="Game duration in minutes",
    yaxis_title="Count",
)
fig.show()

In [None]:
# Inspect games whose duration is below 300 minutes.
# NOTE(review): the histogram cell rebinds grouped_time_df to rows with
# 0 < game_duration < 300, so on a top-to-bottom run this selection keeps every
# remaining row and contains no implausible games. The disabled line below is
# the stricter "non-positive duration" check; it would need the unfiltered
# frame to find anything.
# wrong_durations_df = grouped_time_df[grouped_time_df["game_duration"] <= 0]
below_300_minutes = grouped_time_df["game_duration"].lt(300)
wrong_durations_df = grouped_time_df.loc[below_300_minutes]
display(wrong_durations_df)

In [None]:
# Show the full list of time strings for the last game in wrong_durations_df.
wrong_durations_df["WCTIMESTRING"].iloc[-1]