# Fetching Data

This notebook includes code that fetches data from balldontlie API and then saves it as a csv file.

In [2]:
import requests
import pandas as pd
import time
import os

In [55]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [275]:
def make_request(endpoint, next_cursor=0, params=None, verbose=False):
    root = "https://api.balldontlie.io/v1/"
    api_key = os.environ["BALLDONTLIE_API_KEY"]
    headers = {"Authorization": api_key}
    if params is None: params = {}
    # This API uses cursor based pagination.
    # The cursor should be initialized to 0 so that the first requests will fetch the first page.
    # If there is more than one page to the request, the response will include a "next_cursor" attribute in 
    # the meta data JSON.
    # In the next request, set the cursor parameter to this number to get the next page
    if verbose: print("Starting Loop")
    df_list = []
    request_count = 0
    while next_cursor is not None:
        if verbose: print("Making request... ")
        response = requests.get(root + endpoint, headers=headers, params=params)
        request_count += 1
        if response.status_code != 200:
            print(f"Request failed: {response.status_code}")
            print(response.json())
            break
        if verbose: print(f"Request Succeeded - {response.status_code}" )
        res = response.json()
        data = res["data"]
        if data:
            df_list.append(pd.json_normalize(data))
        meta_data = res.get("meta", None)
        if meta_data is None:  # Enpoint doesn't support pagination, no need to loop
            break
        next_cursor = meta_data.get("next_cursor")
        if next_cursor is None:  # Last page reached, no need to loop
            break
        params.update({"cursor": next_cursor})
        # Max 60 requests per minute, sleep if necessary
        if request_count % 60 == 0:
            print("Max 60 requests per minute, sleeping for 60 seconds..")
            time.sleep(60)
    # Concatenate all collected data into one DataFrame
    if df_list:
        return pd.concat(df_list)
    else:
        return pd.DataFrame()

In [None]:
def get_recent_games(home_team_id, away_team_id):
    """
    Get a list game ids for the 20 most recent games played for each team specified.

    ---Params---
    home_team_id: int
    away_team_id: int

    ---Returns---
     a tuple of 2 lists. ---> ([home team game ids], [away team game ids])
    """

    # Ensure that the ids are integers
    home_team_id = int(home_team_id)
    away_team_id = int(away_team_id)

    # Get todays date
    today = date.today()                                                          # Get today
    today = today.strftime("%Y-%m-%d")
    one_year_ago = date.today() - timedelta(days=365)                              # Get last-year-today
    one_year_ago = one_year_ago.strftime("%Y-%m-%d")  # convert to format yyyy-mm-dd

    # get home team recent games
    recent_games_home = pd.DataFrame()
    
    print(today)
    print(one_year_ago)
    res = make_request("games", params={"end_date": today,
                                        "start_date": one_year_ago,
                                        "team_ids[]": [home_team_id, 0],  # No idea how requests is bulding the query string, but the api is throwing a "invalid value" error when there's only one value, so need to pass a dummy value of 0 to get it to work
                                        "per_page": "100"})
    res = res.sort_values("date", ascending=False)
    return res["home_team"]
    res = res[res["home_team.id"].eq(home_team_id)]

    # recent_games_home = pd.concat(recent_games_home, res)
    # recent_games_home = recent_games_home.head(20)
    # game_ids_home = list(recent_games_home["id"].values)

In [245]:
df = get_recent_games(12, 0)
df

2025-06-17
2024-06-17


Unnamed: 0,id,date,season,status,period,time,postseason,home_team_score,visitor_team_score,datetime,home_team,visitor_team
2,18444562,2025-06-16,2024,Final,4,Final,True,120,109,2025-06-17T00:30:00.000Z,"{'id': 21, 'conference': 'West', 'division': '...","{'id': 12, 'conference': 'East', 'division': '..."
1,18444561,2025-06-13,2024,Final,4,Final,True,104,111,2025-06-14T00:30:00.000Z,"{'id': 12, 'conference': 'East', 'division': '...","{'id': 21, 'conference': 'West', 'division': '..."
0,18444560,2025-06-11,2024,Final,4,Final,True,116,107,2025-06-12T00:30:00.000Z,"{'id': 12, 'conference': 'East', 'division': '...","{'id': 21, 'conference': 'West', 'division': '..."
99,18444559,2025-06-08,2024,Final,4,Final,True,123,107,2025-06-09T00:00:00.000Z,"{'id': 21, 'conference': 'West', 'division': '...","{'id': 12, 'conference': 'East', 'division': '..."
98,18444558,2025-06-05,2024,Final,4,Final,True,110,111,2025-06-06T00:30:00.000Z,"{'id': 21, 'conference': 'West', 'division': '...","{'id': 12, 'conference': 'East', 'division': '..."
...,...,...,...,...,...,...,...,...,...,...,...,...
4,15907496,2024-10-30,2024,Final,5,Final,False,135,132,2024-10-30T23:00:00.000Z,"{'id': 12, 'conference': 'East', 'division': '...","{'id': 2, 'conference': 'East', 'division': 'A..."
3,15907479,2024-10-28,2024,Final,4,Final,False,119,115,2024-10-28T23:00:00.000Z,"{'id': 22, 'conference': 'East', 'division': '...","{'id': 12, 'conference': 'East', 'division': '..."
2,15907474,2024-10-27,2024,Final,5,Final,False,114,118,2024-10-27T19:30:00.000Z,"{'id': 12, 'conference': 'East', 'division': '...","{'id': 23, 'conference': 'East', 'division': '..."
1,15907458,2024-10-25,2024,Final,4,Final,False,123,98,2024-10-25T23:30:00.000Z,"{'id': 20, 'conference': 'East', 'division': '...","{'id': 12, 'conference': 'East', 'division': '..."


In [254]:
# df = get_recent_games(12, 0)
df["home_team"].index

Index([ 2,  1,  0, 99, 98, 97, 96, 95, 94, 93,
       ...
        9,  8,  7,  6,  5,  4,  3,  2,  1,  0],
      dtype='int64', length=103)

In [239]:
df

In [151]:
game_data = make_request("games", params={"cursor":0, "per_page":1, "dates":["2025-06-16","2025-06-15","2025-06-14","2025-06-13","2025-06-12"]}, verbose=True)
game_data

Starting Loop
Making request... 
Request Succeeded - 200
Making request... 
Request Succeeded - 200
Making request... 
Request Succeeded - 200


Unnamed: 0,id,date,season,status,period,time,postseason,home_team_score,visitor_team_score,datetime,home_team,visitor_team
0,18444561,2025-06-13,2024,Final,4,Final,True,104,111,2025-06-14T00:30:00.000Z,"{'id': 12, 'conference': 'East', 'division': '...","{'id': 21, 'conference': 'West', 'division': '..."
0,18444562,2025-06-16,2024,Final,4,Final,True,120,109,2025-06-17T00:30:00.000Z,"{'id': 21, 'conference': 'West', 'division': '...","{'id': 12, 'conference': 'East', 'division': '..."


### game data

This data is only used for EDA purposes

In [None]:
game_data = make_request("games", params={"cursor":0, "per_page":100, "seasons":[2022,2023,2024]}, record_path=None, verbose=True)

Success!


In [141]:
game_data.head()

Unnamed: 0,id,date,season,status,period,time,postseason,home_team_score,visitor_team_score,datetime,home_team,visitor_team
0,18444561,2025-06-13,2024,Final,4,Final,True,104,111,2025-06-14T00:30:00.000Z,"{'id': 12, 'conference': 'East', 'division': '...","{'id': 21, 'conference': 'West', 'division': '..."
0,18444562,2025-06-16,2024,Final,4,Final,True,120,109,2025-06-17T00:30:00.000Z,"{'id': 21, 'conference': 'West', 'division': '...","{'id': 12, 'conference': 'East', 'division': '..."


In [17]:
game_data.set_index("id", inplace=True)

In [None]:
game_data.to_csv("data/games.csv")

### stats data

This is the data used to build the model. Individual player stats for every NBA game since 1979.

The first block of code is just to get the meta data. To see how many pages of data we are going to have to request (as balldontlie API serves a maximum of 100 rows of data per request.)

## BELOW CODE NEEDS TO BE REFACTORED TO FIT THE NEW API CHANGES

To do later, we don't need to rebuild the model right now

In [None]:
# there are over 11000 pages!
all_stats_data_meta = make_request("stats", params={"page":1, "per_page":100}, record_path=None)

all_stats_data_meta

This code gets all 11483 pages of data from the API and saves it in a dataframe

In [None]:
stats_data = pd.DataFrame()

for i in range(1, 11483):
    # Print what page we're on every 10 pages to keep track of progress
    if i % 10 == 0:
        print(i)
        
    # Make sure not to exceed 60 API requests per minute (balldontlie API is free but limits request per minite)
    time.sleep(1.1)
    
    # Make the request and append to the dataframe
    new_data = make_request("stats", params={"page":i, "per_page":100}, record_path="data")
    stats_data = stats_data.append(new_data)
print("Done!")

This code saves the data to a csv file. It's commented out to not accidentally overwrite the current csv file as it took hours to pull all the data.

In [None]:
stats_data.set_index("id", inplace=True)

# stats_data.to_csv("data/stats_raw.csv")

In [None]:
for i in range(2, 11483):
    if i%100 == 0: print(i)
    time.sleep(1.5)
    new_data = make_request("stats", params={"page":i, "per_page":100}, record_path="data")
    all_stats_data = all_stats_data.append(new_data)
print("Done!")