In [37]:
import duckdb
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import polars as pl
from pandas.api.types import infer_dtype

In [16]:
# Replace the JSON decoder stored at `pd.io.json._json.loads`.
# NOTE(review): this is a private pandas attribute, so whether pandas still
# reads it may depend on the installed pandas version -- confirm.
#
# Only one decoder can be installed at a time, because every assignment to
# this attribute replaces the previous one. Alternatives to the one below:
#   lambda s, *a, **kw: json.loads(s)        -> standard-library decoder
#   lambda s, *a, **kw: simplejson.loads(s)  -> simplejson, no normalising
import json

import simplejson

# Decode with simplejson and normalise (unnest) nested JSON in the same step.
# Extra positional/keyword arguments supplied by the caller are accepted and
# ignored.
pd.io.json._json.loads = lambda s, *a, **kw: pd.json_normalize(simplejson.loads(s))

In [26]:
# Load a JSON file into pandas through a DuckDB query.
def duck_json(file_str):
    """Read a JSON file with DuckDB's `read_json` and return it as a DataFrame.

    Parameters
    ----------
    file_str : str
        Path of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        The query result, with `file_str` stored on it as the `name`
        attribute so later cells can label the frame by its source file.
    """
    # Embed the path as a quoted SQL string literal (single quotes doubled)
    # rather than pasting it bare into the query, so paths containing
    # slashes, spaces, dashes or quotes are still read as one string.
    path_literal = "'" + str(file_str).replace("'", "''") + "'"
    df = duckdb.query(
        f'''SELECT * FROM read_json({path_literal}, auto_detect=True, sample_size=100000)'''
    ).to_df()
    # Plain Python attribute on this object, not a column of the frame.
    df.name = file_str
    return df


# Load the four JSON files in a fixed order, one DataFrame per file.
shots, games, points, matches = map(
    duck_json,
    ('shots.json', 'games.json', 'points.json', 'matches.json'),
)

# All loaded frames, in load order, for cells that iterate over them.
df_list = [shots, games, points, matches]

In [27]:
# Map each frame's `name` label to its (rows, columns) shape.
shapes = {frame.name: frame.shape for frame in df_list}

shapes

{'shots.json': (62249, 20),
 'games.json': (2128, 12),
 'points.json': (11732, 17),
 'matches.json': (100, 17)}

In [38]:
# Run pandas' `infer_dtype` on every column of `shots`, keyed by column name.
shots_dtypes = {name: infer_dtype(values) for name, values in shots.items()}

shots_dtypes

{'match_id': 'floating',
 'user_id': 'floating',
 'sid': 'integer',
 'pid': 'integer',
 'set_id': 'integer',
 'game_id': 'integer',
 'player': 'string',
 'shot_type': 'string',
 'hit_court_side': 'string',
 'hit_type': 'string',
 'hit_location_long': 'string',
 'hit_location_lat': 'string',
 'hit_location': 'mixed',
 'hit_velocity': 'mixed',
 'hit_wing': 'string',
 'net_type': 'string',
 'bounce_court_side': 'string',
 'bounce_location_long': 'string',
 'bounce_location_lat': 'string',
 'bounce_location': 'mixed'}

In [41]:
# Print the per-column non-null counts and storage dtypes of `shots`.
shots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62249 entries, 0 to 62248
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   match_id              62249 non-null  float64
 1   user_id               52796 non-null  float64
 2   sid                   62249 non-null  int64  
 3   pid                   62249 non-null  int64  
 4   set_id                62249 non-null  int64  
 5   game_id               62249 non-null  int64  
 6   player                62249 non-null  object 
 7   shot_type             62249 non-null  object 
 8   hit_court_side        62249 non-null  object 
 9   hit_type              62249 non-null  object 
 10  hit_location_long     62249 non-null  object 
 11  hit_location_lat      62249 non-null  object 
 12  hit_location          62249 non-null  object 
 13  hit_velocity          62249 non-null  object 
 14  hit_wing              62249 non-null  object 
 15  net_type           

In [45]:
# `hit_location`, `hit_velocity` and `bounce_location` hold list values, so
# they are left out here. `drop` returns a new DataFrame and leaves `shots`
# untouched, so no separate deep copy is needed beforehand.
nolist_shots = shots.drop(columns=['hit_location', 'hit_velocity', 'bounce_location'])

# Distinct values observed in each remaining column, keyed by column name.
shots_unique_vals = {
    col: pd.unique(values) for col, values in nolist_shots.items()
}

shots_unique_vals

{'match_id': array([9.64239483e+47, 1.40705126e+48, 2.70115592e+47, 1.29562332e+48,
        5.26074506e+47, 3.23485238e+47, 9.00750802e+47, 2.45769257e+47,
        8.47861948e+47, 8.86922998e+47, 2.38088724e+47, 9.33140447e+46,
        1.08648182e+48, 5.31353632e+47, 7.21253803e+47, 1.04062547e+48,
        6.06385600e+47, 8.27383837e+47, 6.15434493e+47, 8.93286255e+45,
        3.41374592e+47, 1.18943352e+48, 3.45165938e+47, 1.18834967e+48,
        1.02639901e+48, 1.41251282e+48, 9.24518323e+47, 4.16780906e+47,
        4.13281520e+47, 1.13366570e+48, 1.26188679e+48, 9.37193324e+47,
        6.94698592e+47, 4.80016873e+47, 4.11120413e+47, 9.60882826e+47,
        9.24521909e+47, 7.26846270e+47, 1.04266754e+48, 2.69780713e+47,
        7.52182052e+46, 5.58556317e+47, 1.44765392e+48, 1.59578755e+47,
        6.69112456e+47, 3.67155304e+47, 2.12065335e+47, 1.29137449e+48,
        6.78517162e+47, 3.98888137e+47, 1.27391318e+48, 2.66504147e+47,
        2.18345738e+47, 8.75823036e+47, 6.70885915e+

In [46]:
# Show every row of `shots` whose game_id equals 34.
shots.loc[shots['game_id'] == 34]

Unnamed: 0,match_id,user_id,sid,pid,set_id,game_id,player,shot_type,hit_court_side,hit_type,hit_location_long,hit_location_lat,hit_location,hit_velocity,hit_wing,net_type,bounce_court_side,bounce_location_long,bounce_location_lat,bounce_location
27920,5.585563e+47,304391.0,1497,267,4,34,guest,none,near,feed,service_box,ad,"[-0.735584259, 9.5906073014, 3.3222968578]","[6.25856, 6.25856, 6.25856]",right,over,far,no_mans_land,ad,"[0.8793808222, 18.4152686131, 0.0]"
27921,5.585563e+47,308768.0,1498,267,4,34,host,first_serve,far,first_serve,no_mans_land,deuce,"[-1.1933859587, 24.6628364927, 2.8416075706]","[6.368470936, -34.4229051764, -2.7097574589]",top,over,near,service_box,deuce,"[2.2044792175, 6.4193863869, 0.0]"
27922,5.585563e+47,304391.0,1499,267,4,34,guest,return,near,ground_stroke,no_mans_land,deuce,"[3.976195097, 0.9253126266, 0.9291023016]","[-3.6967930133, 15.2540312636, 5.843856568]",right,over,far,no_mans_land,deuce,"[-0.9548494816, 20.4052047729, 0.0]"
27923,5.585563e+47,308768.0,1500,267,4,34,host,serve_plus_one,far,ground_stroke,out,deuce,"[-0.9739964008, 25.384285391, 1.331418395]","[4.2750089894, -22.4459109084, 3.365089892]",right,over,near,no_mans_land,deuce,"[3.1602272987, 3.899992466, 0.0]"
27924,5.585563e+47,304391.0,1501,268,4,34,guest,none,near,feed,service_box,deuce_alley,"[4.6187291145, 5.3944212082, 2.7534303665]","[4.69392, 4.69392, 4.69392]",right,over,far,service_box,ad,"[1.2595108747, 14.8317966461, 0.0]"
27925,5.585563e+47,308768.0,1502,268,4,34,host,first_serve,far,first_serve,no_mans_land,ad,"[0.8300427794, 22.5970482237, 1.8463392258]","[-4.678241251, -34.0389472901, -3.9631822738]",top,net,far,service_box,deuce,"[-0.6645260453, 15.8967018064, 0.0]"
27926,5.585563e+47,308768.0,1503,268,4,34,host,first_serve,far,first_serve,no_mans_land,ad,"[0.6659927368, 23.6611065964, 2.4008395672]","[-4.7111207238, -22.4517358947, 0.3244040382]",top,over,near,service_box,ad,"[-2.790088892, 7.3592247963, 0.0]"
27927,5.585563e+47,304391.0,1504,268,4,34,guest,return,near,ground_stroke,no_mans_land,ad_alley,"[-4.6384348869, 0.4143119934, 0.6960549355]","[6.8872871295, 12.5036508404, 7.4138423357]",left,over,far,no_mans_land,ad_out,"[6.3859579143, 19.388299942, 0.0]"
27928,5.585563e+47,308768.0,1505,269,4,34,host,first_serve,far,first_serve,no_mans_land,deuce,"[-0.8615190983, 23.4778236753, 2.8073174953]","[4.5862654993, -24.9024782431, -2.4043495956]",top,over,near,service_box,deuce,"[1.6619086266, 9.9026794434, 0.0]"
27929,5.585563e+47,304391.0,1506,269,4,34,guest,return,near,ground_stroke,service_box,deuce,"[2.0933594704, 6.359534848, -0.0426934958]","[-5.0245667178, 14.0640243662, 4.1389856031]",right,over,far,service_box,deuce,"[-2.0954909325, 17.5424575806, 0.0]"
