In [37]:
import duckdb
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import polars as pl
from pandas.api.types import infer_dtype

In [26]:
# Using a duckdb query to avoid the "ValueError" raised by pd.read_json()
def duck_json(file_str: str) -> pd.DataFrame:
    """Load a JSON file into a pandas DataFrame through DuckDB's ``read_json``.

    Parameters
    ----------
    file_str : str
        Path of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        The rows of the file. The source path is stored on the frame and can
        be read back as ``df.name``.
    """
    # The path has to reach DuckDB as a quoted SQL string literal. Left bare,
    # only identifier-like names such as shots.json get through; anything with
    # a slash, dash, space or quote breaks the query. Doubling single quotes
    # is the standard SQL escape.
    path_literal = "'" + str(file_str).replace("'", "''") + "'"
    df = duckdb.query(
        f'''SELECT * FROM read_json({path_literal}, auto_detect=True, sample_size=100000)'''
    ).to_df()
    # Bypass DataFrame.__setattr__: a plain `df.name = ...` would overwrite a
    # data column that happens to be called "name" instead of tagging the frame.
    object.__setattr__(df, 'name', file_str)
    return df


# Read the four raw tables in a fixed order, then keep them together in a list
# so later cells can loop over every frame.
shots, games, points, matches = (
    duck_json(f'{table}.json') for table in ('shots', 'games', 'points', 'matches')
)

df_list = [shots, games, points, matches]

In [27]:
# Map each frame's `.name` (the source file it was loaded from) to its
# (rows, columns) shape. A comprehension keeps the loop variable out of the
# notebook's global namespace.
shapes = {frame.name: frame.shape for frame in df_list}

shapes

{'shots.json': (62249, 20),
 'games.json': (2128, 12),
 'points.json': (11732, 17),
 'matches.json': (100, 17)}

In [38]:
# infer_dtype looks at the values actually stored in a column and reports their
# kind (e.g. 'integer', 'string', 'mixed').
# TODO: turn this into a helper so it can be reused for the other frames.
shots_dtypes = {column: infer_dtype(shots[column]) for column in shots.columns}

shots_dtypes

{'match_id': 'floating',
 'user_id': 'floating',
 'sid': 'integer',
 'pid': 'integer',
 'set_id': 'integer',
 'game_id': 'integer',
 'player': 'string',
 'shot_type': 'string',
 'hit_court_side': 'string',
 'hit_type': 'string',
 'hit_location_long': 'string',
 'hit_location_lat': 'string',
 'hit_location': 'mixed',
 'hit_velocity': 'mixed',
 'hit_wing': 'string',
 'net_type': 'string',
 'bounce_court_side': 'string',
 'bounce_location_long': 'string',
 'bounce_location_lat': 'string',
 'bounce_location': 'mixed'}

In [41]:
# Print the per-column non-null counts and dtypes of the shots frame.
shots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62249 entries, 0 to 62248
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   match_id              62249 non-null  float64
 1   user_id               52796 non-null  float64
 2   sid                   62249 non-null  int64  
 3   pid                   62249 non-null  int64  
 4   set_id                62249 non-null  int64  
 5   game_id               62249 non-null  int64  
 6   player                62249 non-null  object 
 7   shot_type             62249 non-null  object 
 8   hit_court_side        62249 non-null  object 
 9   hit_type              62249 non-null  object 
 10  hit_location_long     62249 non-null  object 
 11  hit_location_lat      62249 non-null  object 
 12  hit_location          62249 non-null  object 
 13  hit_velocity          62249 non-null  object 
 14  hit_wing              62249 non-null  object 
 15  net_type           

In [45]:
# Leave out the unhashable (presumably list-valued) columns, which cannot be
# passed through pd.unique. DataFrame.drop already returns a new frame, so
# `shots` stays untouched without taking a separate deep copy first.
nolist_shots = shots.drop(columns=['hit_location', 'hit_velocity', 'bounce_location'])

# Distinct values observed in every remaining column.
shots_unique_vals = {
    column: pd.unique(nolist_shots[column]) for column in nolist_shots.columns
}

# I recommend turning on output scrolling for this dictionary
shots_unique_vals

{'match_id': array([9.64239483e+47, 1.40705126e+48, 2.70115592e+47, 1.29562332e+48,
        5.26074506e+47, 3.23485238e+47, 9.00750802e+47, 2.45769257e+47,
        8.47861948e+47, 8.86922998e+47, 2.38088724e+47, 9.33140447e+46,
        1.08648182e+48, 5.31353632e+47, 7.21253803e+47, 1.04062547e+48,
        6.06385600e+47, 8.27383837e+47, 6.15434493e+47, 8.93286255e+45,
        3.41374592e+47, 1.18943352e+48, 3.45165938e+47, 1.18834967e+48,
        1.02639901e+48, 1.41251282e+48, 9.24518323e+47, 4.16780906e+47,
        4.13281520e+47, 1.13366570e+48, 1.26188679e+48, 9.37193324e+47,
        6.94698592e+47, 4.80016873e+47, 4.11120413e+47, 9.60882826e+47,
        9.24521909e+47, 7.26846270e+47, 1.04266754e+48, 2.69780713e+47,
        7.52182052e+46, 5.58556317e+47, 1.44765392e+48, 1.59578755e+47,
        6.69112456e+47, 3.67155304e+47, 2.12065335e+47, 1.29137449e+48,
        6.78517162e+47, 3.98888137e+47, 1.27391318e+48, 2.66504147e+47,
        2.18345738e+47, 8.75823036e+47, 6.70885915e+

In [51]:
# Look for shots flagged as 'net' whose bounce is recorded on the opposite side
# of the court from where they were hit. Both directions are checked (near -> far
# and far -> near); this assumes both columns use the same 'near'/'far' labels.
# An empty dataframe suggests net_type == 'net' only marks shots that hit the
# net and did not go over.
hit_net = shots['net_type'] == 'net'
near_to_far = (shots['hit_court_side'] == 'near') & (shots['bounce_court_side'] == 'far')
far_to_near = (shots['hit_court_side'] == 'far') & (shots['bounce_court_side'] == 'near')
shots[hit_net & (near_to_far | far_to_near)]

Unnamed: 0,match_id,user_id,sid,pid,set_id,game_id,player,shot_type,hit_court_side,hit_type,hit_location_long,hit_location_lat,hit_location,hit_velocity,hit_wing,net_type,bounce_court_side,bounce_location_long,bounce_location_lat,bounce_location
