# # Data Extraction Verification

In [1]:
import pandas as pd
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [6]:
# Location of the raw shots CSV; a relative path, so it resolves against the
# kernel's current working directory.
RAW_DATA_PATH = "../data/raw/lebron_shots_raw.csv"

In [7]:
# Load the raw shot extract into `df`, failing fast with a clear message when
# the file is missing.
# os.path.isfile (rather than os.path.exists) also rejects a directory at this
# path, which would otherwise pass the check and only fail inside pd.read_csv.
if not os.path.isfile(RAW_DATA_PATH):
    raise FileNotFoundError(f"✗ File not found: {RAW_DATA_PATH}")

print(f"✓ File exists: {RAW_DATA_PATH}")
df = pd.read_csv(RAW_DATA_PATH)

✓ File exists: ../data/raw/lebron_shots_raw.csv


In [8]:
# Summary of the loaded frame: banner, row/column counts, then the column names.
banner = "=" * 70
n_rows, n_cols = df.shape

print(banner)
print("EXTRACTION VERIFICATION")
print(banner)
print(f"Total rows (shots): {n_rows:,}")
print(f"Total columns: {n_cols}")
print("\nColumn names:")
display(df.columns.tolist())

EXTRACTION VERIFICATION
Total rows (shots): 36,780
Total columns: 26

Column names:


['GRID_TYPE',
 'GAME_ID',
 'GAME_EVENT_ID',
 'PLAYER_ID',
 'PLAYER_NAME',
 'TEAM_ID',
 'TEAM_NAME',
 'PERIOD',
 'MINUTES_REMAINING',
 'SECONDS_REMAINING',
 'EVENT_TYPE',
 'ACTION_TYPE',
 'SHOT_TYPE',
 'SHOT_ZONE_BASIC',
 'SHOT_ZONE_AREA',
 'SHOT_ZONE_RANGE',
 'SHOT_DISTANCE',
 'LOC_X',
 'LOC_Y',
 'SHOT_ATTEMPTED_FLAG',
 'SHOT_MADE_FLAG',
 'GAME_DATE',
 'HTM',
 'VTM',
 'api_extraction_season',
 'api_extraction_season_type']

In [9]:
# Preview the first five rows of the loaded frame.
print("\nSample shots data:")
sample_rows = df.head(n=5)
display(sample_rows)


Sample shots data:


Unnamed: 0,GRID_TYPE,GAME_ID,GAME_EVENT_ID,PLAYER_ID,PLAYER_NAME,TEAM_ID,TEAM_NAME,PERIOD,MINUTES_REMAINING,SECONDS_REMAINING,...,SHOT_DISTANCE,LOC_X,LOC_Y,SHOT_ATTEMPTED_FLAG,SHOT_MADE_FLAG,GAME_DATE,HTM,VTM,api_extraction_season,api_extraction_season_type
0,Shot Chart Detail,20300014,20,2544,LeBron James,1610612739,Cleveland Cavaliers,1,8,56,...,15.0,158.0,0.0,1,1,20031029,SAC,CLE,2003,Regular Season
1,Shot Chart Detail,20300014,28,2544,LeBron James,1610612739,Cleveland Cavaliers,1,7,44,...,13.0,-131.0,2.0,1,1,20031029,SAC,CLE,2003,Regular Season
2,Shot Chart Detail,20300014,35,2544,LeBron James,1610612739,Cleveland Cavaliers,1,7,1,...,16.0,163.0,5.0,1,1,20031029,SAC,CLE,2003,Regular Season
3,Shot Chart Detail,20300014,39,2544,LeBron James,1610612739,Cleveland Cavaliers,1,6,29,...,14.0,-118.0,92.0,1,0,20031029,SAC,CLE,2003,Regular Season
4,Shot Chart Detail,20300014,54,2544,LeBron James,1610612739,Cleveland Cavaliers,1,4,48,...,5.0,54.0,22.0,1,1,20031029,SAC,CLE,2003,Regular Season


In [10]:
# Count rows per `api_extraction_season` value, ordered by season.
print("\nShot distribution by season:")
season_column = df["api_extraction_season"]
season_counts = season_column.value_counts().sort_index()
display(season_counts)


Shot distribution by season:


api_extraction_season
2003    1492
2004    1684
2005    2130
2006    2020
2007    1917
2008    1925
2009    1739
2010    1858
2011    1671
2012    1786
2013    1693
2014    1823
2015    1833
2016    1728
2017    2090
2018    1095
2019    1685
2020     937
2021    1221
2022    1514
2023    1368
2024    1358
2025     213
Name: count, dtype: int64

In [11]:
# Frequency of each SHOT_TYPE value, most common first.
print("\nShot type distribution:")
shot_type_counts = df["SHOT_TYPE"].value_counts()
display(shot_type_counts)


Shot type distribution:


SHOT_TYPE
2PT Field Goal    27952
3PT Field Goal     8828
Name: count, dtype: int64

In [12]:
# Frequency of each SHOT_MADE_FLAG value (the made/missed split).
print("\nMade vs Missed:")
made_flag_counts = df["SHOT_MADE_FLAG"].value_counts()
display(made_flag_counts)


Made vs Missed:


SHOT_MADE_FLAG
1    18567
0    18213
Name: count, dtype: int64

In [13]:
# Per-column dtypes of the loaded frame (no explicit dtypes were passed to read_csv).
print("\nData types:")
column_dtypes = df.dtypes
display(column_dtypes)


Data types:


GRID_TYPE                      object
GAME_ID                         int64
GAME_EVENT_ID                   int64
PLAYER_ID                       int64
PLAYER_NAME                    object
TEAM_ID                         int64
TEAM_NAME                      object
PERIOD                          int64
MINUTES_REMAINING               int64
SECONDS_REMAINING               int64
EVENT_TYPE                     object
ACTION_TYPE                    object
SHOT_TYPE                      object
SHOT_ZONE_BASIC                object
SHOT_ZONE_AREA                 object
SHOT_ZONE_RANGE                object
SHOT_DISTANCE                 float64
LOC_X                         float64
LOC_Y                         float64
SHOT_ATTEMPTED_FLAG             int64
SHOT_MADE_FLAG                  int64
GAME_DATE                       int64
HTM                            object
VTM                            object
api_extraction_season           int64
api_extraction_season_type     object
dtype: objec