## Game timestamp standardization
In this notebook I try to standardize the timestamps in the data set so that they can be used for in-game analysis.

In [None]:
%reload_ext autoreload
%autoreload 2
import os 
import sys
import pandas as pd
import numpy as np
from datetime import datetime
import plotly 
import plotly.graph_objects as go

sys.path.insert(0, "./../../src/")
from data_loader import load_data

In [None]:
%%time

# Set force_recompute=True when the data should be completely reloaded.
# NOTE(review): single_df=True presumably returns all seasons as one DataFrame - confirm in data_loader.
seasons = load_data(
    force_recompute=False,
    single_df=True,
)
display(seasons)

## Load optimization
This section is not related to this use case, but I need it for the wiki, so here I describe how I optimized space and computation time.

In [None]:
# This cell is an optional experiment that is switched off by default.
# Set the flag to True to run it (it replaces the former always-false `if 2 == 3:` guard).
RUN_LOAD_OPTIMIZATION = False

if RUN_LOAD_OPTIMIZATION:
    # dtypes before the conversion, for comparison with the output at the end of the cell
    display(seasons.dtypes)

    # Number of distinct values per column (missing values included in the count).
    for col in seasons:
        print(col, seasons[col].nunique(dropna=False))

    # Columns that are converted to the pandas `category` dtype.
    categories_arr = [
        'EVENTMSGACTIONTYPE', 'EVENTMSGTYPE', 'GAME_ID', 'PERIOD', 'PERSON1TYPE',
        'PERSON2TYPE', 'PERSON3TYPE', 'PLAYER1_ID', 'PLAYER1_NAME',
        'PLAYER1_TEAM_ABBREVIATION', 'PLAYER1_TEAM_CITY', 'PLAYER1_TEAM_ID',
        'PLAYER1_TEAM_NICKNAME', 'PLAYER2_ID', 'PLAYER2_NAME',
        'PLAYER2_TEAM_ABBREVIATION', 'PLAYER2_TEAM_CITY', 'PLAYER2_TEAM_ID',
        'PLAYER2_TEAM_NICKNAME', 'PLAYER3_ID', 'PLAYER3_NAME',
        'PLAYER3_TEAM_ABBREVIATION', 'PLAYER3_TEAM_CITY', 'PLAYER3_TEAM_ID',
        'PLAYER3_TEAM_NICKNAME', 'season_name', 'home_shot_distance',
        'visitor_shot_distance',
    ]
    # NOTE: this overwrites the listed columns of `seasons` in place, so every cell
    # executed afterwards sees the category dtypes.
    seasons[categories_arr] = seasons[categories_arr].astype('category')

    # Disabled snippet kept for reference (dumps the converted frame to disk).
    # `pickle` is not imported in the imports cell, so add `import pickle` before enabling it.
    #
    # display(seasons)
    # with open("tmp_test_category.pkl", "wb") as file:
    #     pickle.dump(seasons, file)

    # dtypes after the conversion
    display(seasons.dtypes)

## Finding seasons which have corrupt timestamps

In [None]:
# Number of non-null WCTIMESTRING entries for every GAME_ID.
# The column is selected before aggregating so that only this one column is counted,
# instead of counting every column of the frame and discarding all but one.
# NOTE(review): despite the variable name, this is a count of rows, not a length of time.
duration_of_each_game = seasons.groupby("GAME_ID")["WCTIMESTRING"].count()
display(duration_of_each_game)

In [None]:
# A value counts as correctly formatted when it is, in order: at most one leading
# whitespace character, an hour written as 00-09, 10-12 or a single digit, a colon,
# minutes 00-59, at most one whitespace character, and one of AM / PM / am / pm.
# Raw string: `\s` is not a valid Python string escape, so a plain literal raises an
# invalid-escape-sequence warning. The pattern itself is unchanged.
correct_time_format_regex = r"^\s?([0][0-9]|[1][0-2]|[0-9]):[0-5][0-9]\s?(?:AM|PM|am|pm)$"

# na=False: missing timestamps are not matched, so they end up in the "wrong" set.
# Without it the mask contains NaN, and both the boolean indexing and `~mask` break.
correct_t_mask = seasons["WCTIMESTRING"].str.match(correct_time_format_regex, na=False)
correct_t_df = seasons[correct_t_mask]
wrong_t_df = seasons[~correct_t_mask]

# Rows per season in each set (non-null GAME_ID values); only the needed column is counted.
wrong_data_grouped_game = wrong_t_df.groupby("season_name")["GAME_ID"].count()
correct_data_grouped_game = correct_t_df.groupby("season_name")["GAME_ID"].count()

# Replacing string values for plotting purposes: every "-" in the season label becomes "-20".
# regex=False keeps this a literal replacement regardless of the pandas default.
# NOTE(review): this assumes the part after the dash belongs to the 2000s - confirm that
# the data contains no season ending before 2000.
correct_data_grouped_game.index = correct_data_grouped_game.index.str.replace("-", "-20", regex=False)
wrong_data_grouped_game.index = wrong_data_grouped_game.index.str.replace("-", "-20", regex=False)

display(wrong_data_grouped_game)
display(correct_data_grouped_game)

In [None]:
# One trace per per-season count series, added in this order: wrong values first, then correct ones.
season_counts_by_label = (
    ("Wrong count", wrong_data_grouped_game),
    ("Correct count", correct_data_grouped_game),
)

fig = go.Figure()
for trace_label, season_counts in season_counts_by_label:
    # x: season labels from the index, y: the counts for those seasons
    scatter_trace = go.Scatter(
        x=season_counts.index,
        y=season_counts.values,
        name=trace_label,
    )
    fig.add_trace(scatter_trace)

# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title_text="Count of wrong and right WCTIMESTRING values for each season",
    xaxis_title="Season",
    yaxis_title="Count",
)
fig.show()