In [None]:
%reload_ext autoreload
%autoreload 2
import os 
import sys
import pandas as pd
import numpy as np
from datetime import datetime
import plotly 
import plotly.graph_objects as go

sys.path.insert(0, "./../../src/")
from data_loader import load_data

## Loading the data

In [None]:
# Load the data set through the project's `data_loader.load_data` helper.
# NOTE(review): later cells index `seasons` by column name (e.g. "season_name",
# "GAME_ID"), so it is presumably a pandas DataFrame — confirm against data_loader.
seasons = load_data()

In [None]:
# Debugging aid (disabled): read one raw season CSV directly and inspect it,
# bypassing load_data().
# tmp_df = pd.read_csv("./../../data/raw/2017-18_pbp.csv")
# display(tmp_df)

# Rich notebook rendering of everything load_data() returned.
display(seasons)

In [None]:
# Restrict the analysis to the rows whose season_name is "2018-19".
is_selected_season = seasons["season_name"] == "2018-19"
seasons_subset = seasons.loc[is_selected_season]

# Preview the first rows of the selection.
display(seasons_subset.head())

## Visualizations 
Below you will find a visualization for each of the features found in the data set.

In [None]:
# Distribution of the EVENTMSGTYPE column within the selected season.
event_type_trace = go.Histogram(x=seasons_subset["EVENTMSGTYPE"])

fig = go.Figure(data=[event_type_trace])
fig.update_layout(
    title_text="Histogram of Event Types",
    xaxis_title="Event type name",
    yaxis_title="Frequency",
)
fig.show()

In [None]:
# Distribution of the EVENTMSGACTIONTYPE column within the selected season.
event_action_type_trace = go.Histogram(x=seasons_subset["EVENTMSGACTIONTYPE"])

fig = go.Figure(data=[event_action_type_trace])
fig.update_layout(
    title_text="Histogram of Event Action Types",
    xaxis_title="Event action type name",
    yaxis_title="Frequency",
)
fig.show()

In [None]:
# Number of events per game: count the non-null EVENTNUM entries of each GAME_ID.
# Selecting the column before counting avoids counting every other column
# only to throw the results away.
events_per_game_count = seasons_subset.groupby("GAME_ID")["EVENTNUM"].count()

# Histogram of the number of events per game.
# (The title and x-axis label previously described "Event Action Types",
# a leftover from the preceding cell that did not match the plotted data.)
fig = go.Figure()
fig.add_trace(go.Histogram(x=events_per_game_count))
fig.update_layout(
    title_text="Histogram of Events per Game",
    xaxis_title="Number of events in a game",
    yaxis_title="Frequency",
)
fig.show()

## Finding the average game duration
This section also looks at the shortest and the longest games.

In [None]:
# Per-game count of non-null WCTIMESTRING entries.
# Note: despite the variable name, this is a row count, not an elapsed time.
timestamps_by_game = seasons_subset.groupby("GAME_ID")["WCTIMESTRING"]
duration_of_each_game = timestamps_by_game.count()

In [None]:
# Split the rows by whether WCTIMESTRING is a well-formed 12-hour clock time
# ("h:mm AM/PM"), then convert the well-formed ones to 24-hour "HH:MM" strings.
# NOTE(review): this cell filters the full `seasons` frame, whereas the cells
# above work on `seasons_subset` — confirm that all seasons are intended here.

# Raw string so that `\s` reaches the regex engine instead of being treated as
# an (invalid) string escape. The hour part accepts exactly what `%I` can
# parse (1-12, optionally zero-padded); "0" / "00" would pass a looser pattern
# but make pd.to_datetime raise further down.
correct_time_format_regex = r"^\s?(0?[1-9]|1[0-2]):[0-5][0-9]\s?(?:AM|PM|am|pm)$"

# na=False: missing values count as "wrong" instead of yielding NaN, which
# cannot be used for boolean indexing or negated with `~`.
correct_t_mask = seasons["WCTIMESTRING"].str.match(correct_time_format_regex, na=False)

# .copy() so that the column assignment below modifies an independent frame
# rather than a slice of `seasons` (avoids SettingWithCopyWarning).
correct_t_df = seasons[correct_t_mask].copy()
wrong_t_df = seasons[~correct_t_mask]

print(f"The number of wrong entries in df {len(wrong_t_df['WCTIMESTRING'])}"
      f", of those unique {len(wrong_t_df['WCTIMESTRING'].unique())}")
display(correct_t_df)

# The pattern tolerates optional whitespace (" 7:05PM"), but the strptime
# format needs exactly "h:mm AM": trim the ends and force a single space
# before the AM/PM marker prior to parsing.
normalized_times = (
    correct_t_df["WCTIMESTRING"]
    .str.strip()
    .str.replace(r"\s*([AaPp][Mm])$", r" \1", regex=True)
)

# convert times into 24h format
correct_t_df["WCTIMESTRING"] = (
    pd.to_datetime(normalized_times, format="%I:%M %p").dt.strftime("%H:%M")
)

In [None]:
# Earliest and latest wall-clock time recorded for each game.
# WCTIMESTRING holds zero-padded "HH:MM" strings at this point, so the
# lexicographic min/max coincide with the chronological ones within one day.
#
# The string aliases "min"/"max" are used instead of np.min/np.max: with the
# numpy callables the resulting column labels ("amin"/"amax" vs "min"/"max")
# depend on the installed numpy/pandas versions. The labels are pinned to
# "amin"/"amax" explicitly because the following cell reads exactly those.
tmp_df_23 = (
    correct_t_df
    .groupby(["GAME_ID"])
    .agg({"WCTIMESTRING": ["min", "max"]})
    .rename(columns={"min": "amin", "max": "amax"}, level=1)
)
tmp_df_23

In [None]:
# TODO: handle games that run past midnight. Because only the per-game
# min/max "HH:MM" strings are available here, such a game gets a first
# timestamp shortly after 00:00 and a last one shortly before 24:00, which
# shows up as an implausibly long (close to 24 h) duration.
game_time_bounds = tmp_df_23["WCTIMESTRING"]

# Elapsed wall-clock time per game as a Timedelta series (last - first timestamp).
game_duration_s = (
    pd.to_datetime(game_time_bounds["amax"], format="%H:%M")
    - pd.to_datetime(game_time_bounds["amin"], format="%H:%M")
)

# The axis is labelled in minutes, so plot plain float minutes rather than
# raw Timedelta values.
game_duration_min = game_duration_s.dt.total_seconds() / 60

# Histogram of game durations
fig = go.Figure()
fig.add_trace(go.Histogram(x=game_duration_min))
fig.update_layout(
    title_text="Distribution of game durations",
    xaxis_title="Game duration in minutes",
    yaxis_title="Frequency",
)
fig.show()