In [1]:
# Necessary Libraries
import sys
import time
from datetime import datetime, timedelta

import boto3
import numpy as np
import pandas as pd
from nba_api.stats.endpoints import (
    AllTimeLeadersGrids,
    CommonTeamRoster,
    commonplayerinfo,
    leaguegamefinder,
)
from nba_api.stats.static import teams
from scipy.stats import chi2_contingency, ttest_ind

In [None]:
# Lookup from a team's full name to its three-letter abbreviation.
team_names = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Cleveland Cavaliers": "CLE",
    "New Orleans Pelicans": "NOP",
    "Chicago Bulls": "CHI",
    "Dallas Mavericks": "DAL",
    "Denver Nuggets": "DEN",
    "Golden State Warriors": "GSW",
    "Houston Rockets": "HOU",
    "LA Clippers": "LAC",
    "Los Angeles Lakers": "LAL",
    "Miami Heat": "MIA",
    "Milwaukee Bucks": "MIL",
    "Minnesota Timberwolves": "MIN",
    "Brooklyn Nets": "BKN",
    "New York Knicks": "NYK",
    "Orlando Magic": "ORL",
    "Indiana Pacers": "IND",
    "Philadelphia 76ers": "PHI",
    "Phoenix Suns": "PHX",
    "Portland Trail Blazers": "POR",
    "Sacramento Kings": "SAC",
    "San Antonio Spurs": "SAS",
    "Oklahoma City Thunder": "OKC",
    "Toronto Raptors": "TOR",
    "Utah Jazz": "UTA",
    "Memphis Grizzlies": "MEM",
    "Washington Wizards": "WAS",
    "Detroit Pistons": "DET",
    "Charlotte Hornets": "CHA",
}

In [None]:
def read_csv_from_s3(bucket_name, file_key):
    """Load a CSV object stored in S3 into a pandas DataFrame.

    Args:
        bucket_name: Name of the S3 bucket holding the object.
        file_key: Key of the CSV object inside the bucket.

    Returns:
        The parsed DataFrame, or None when fetching or parsing fails (the
        error is printed instead of being raised).
    """
    client = boto3.client('s3')
    try:
        s3_object = client.get_object(Bucket=bucket_name, Key=file_key)
        # The response body is handed straight to pandas for parsing.
        return pd.read_csv(s3_object['Body'])
    except Exception as err:
        print(f"Error reading CSV file from S3: {err}")
    return None

In [None]:
def write_tuples_to_s3(tuples_list, bucket_name, file_key):
    """Write a list of tuples to an S3 object as plain text, one tuple per line.

    Every field of a tuple is converted with str() and the fields are joined
    with single spaces. Note that an element which is a bare string is iterated
    character by character, so headers should be passed as one-element tuples.

    Args:
        tuples_list: Iterable of tuples (or other iterables) to serialize.
        bucket_name: Name of the destination S3 bucket.
        file_key: Key of the object to create or overwrite.

    Returns:
        None. Success or failure is reported with a printed message; errors
        are not raised.
    """
    client = boto3.client('s3')
    try:
        lines = []
        for record in tuples_list:
            lines.append(' '.join(str(field) for field in record))
        payload = '\n'.join(lines).encode('utf-8')

        client.put_object(Bucket=bucket_name, Key=file_key, Body=payload)

        print(f"Data written to s3://{bucket_name}/{file_key} successfully.")
    except Exception as err:
        print(f"Error writing data to S3: {err}")

In [None]:
def get_team_ids():
    """Build a lookup from team full name to team id.

    Returns:
        dict mapping each record's "full_name" to its 'id', built from the
        records returned by teams.get_teams().
    """
    return {entry["full_name"]: entry['id'] for entry in teams.get_teams()}

In [2]:
# Collects all games for the specified teams from a start date onward
# (optionally bounded by an end date).
def get_all_games(start_year = "2017-9-1", end_year = None, teams = "all"):
    """Fetch each team's game log and keep the games inside the date window.

    Args:
        start_year: Earliest game date to keep, as a "%Y-%m-%d" string
            (despite the name, this is a full date rather than a year).
        end_year: Optional latest game date to keep (inclusive), as a
            "%Y-%m-%d" string. None leaves the window open-ended.
        teams: "all" for every team returned by get_team_ids(), a single team
            full name, or an iterable of team full names.

    Returns:
        A list with one DataFrame per requested team, in request order. Each
        frame's "GAME_DATE" column is converted to datetimes.

    Raises:
        KeyError: If a requested team name is not a key of get_team_ids().
        ValueError: If start_year or end_year does not match "%Y-%m-%d".
    """
    start_date = datetime.strptime(start_year, "%Y-%m-%d")
    end_date = datetime.strptime(end_year, "%Y-%m-%d") if end_year is not None else None
    team_ids = get_team_ids()

    # Normalise the selection so a single code path handles every case; a lone
    # team name is wrapped so it is not iterated character by character.
    if isinstance(teams, str):
        selected_teams = list(team_ids) if teams == "all" else [teams]
    else:
        selected_teams = list(teams)

    dfs = []
    for team in selected_teams:
        team_id = team_ids[team]
        print(f"Attempting to collect {team}")
        gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team_id, timeout=60)
        games = gamefinder.get_data_frames()[0]
        games["GAME_DATE"] = pd.to_datetime(games["GAME_DATE"], format="%Y-%m-%d")

        in_window = games["GAME_DATE"] >= start_date
        if end_date is not None:
            in_window &= games["GAME_DATE"] <= end_date
        dfs.append(games[in_window])

        print(f"Successfully collected {team}")
        # Pause between requests so consecutive calls are throttled.
        time.sleep(2.5)
    return dfs

In [None]:
def mark_birthday_games(games_df, players_df, day_range):
    """Flag games played within `day_range` days of a team member's birthday.

    A game is flagged when the player's team is the home or visiting team and
    the game's calendar day (month and day, ignoring the year) falls within
    `day_range` days before or after the player's birthday. The window is built
    day by day, so it stays correct when it crosses a month or year boundary
    (for example a birthday on the 1st or the 31st).

    Args:
        games_df: DataFrame with a datetime "game_date" column and
            "home_team_abbrev" / "visit_team_abbrev" columns. It is modified
            in place: a "birthday_game" column (0 or 1) is added or reset.
        players_df: DataFrame with a "Team" column (team abbreviation) and a
            "Birthday" column holding "%Y-%m-%d" strings.
        day_range: Whole number of days on each side of the birthday to
            include in the window.

    Returns:
        The same `games_df` object, with "birthday_game" set to 1 for flagged
        games and 0 otherwise.
    """
    games_df["birthday_game"] = 0

    # Encode each game's calendar day as month * 100 + day so that membership
    # in a birthday window is a single vectorised isin() check.
    game_dates = games_df["game_date"]
    game_month_day = game_dates.dt.month * 100 + game_dates.dt.day

    span = int(day_range)
    for _, player_row in players_df.iterrows():
        player_team = player_row["Team"]
        player_birthday = datetime.strptime(player_row["Birthday"], "%Y-%m-%d")

        # Enumerate every calendar day in the window; timedelta arithmetic
        # takes care of rolling over into the neighbouring month.
        window_days = set()
        for offset in range(-span, span + 1):
            window_day = player_birthday + timedelta(days=offset)
            window_days.add(window_day.month * 100 + window_day.day)

        plays_for_team = (
            (games_df["home_team_abbrev"] == player_team)
            | (games_df["visit_team_abbrev"] == player_team)
        )
        games_df.loc[plays_for_team & game_month_day.isin(window_days), "birthday_game"] = 1

    return games_df

In [None]:
# Look up every candidate player through nba_api and keep those whose first or
# last season falls between 2017 and 2024.
filtered_player_birthdays = read_csv_from_s3("bttj-final-s3", "NBA_DOB_InSeason.csv")
active_player_birthdays = []
for request_number, player_id in enumerate(filtered_player_birthdays["id"], start=1):
    # Take a long pause before every 100th request.
    if request_number % 100 == 0:
        print("Starting Sleep")
        time.sleep(300)
    endpoint = commonplayerinfo.CommonPlayerInfo(player_id = player_id, timeout=60)
    player_info = endpoint.get_data_frames()[0]
    time.sleep(.5)
    try:
        started_in_window = 2017 <= player_info["FROM_YEAR"][0] <= 2024
        if started_in_window or (2017 <= player_info["TO_YEAR"][0] <= 2024):
            display_name = player_info['DISPLAY_FIRST_LAST'][0]
            print(f"Collected {display_name}")
            active_player_birthdays.append((
                display_name,
                player_info["TEAM_ABBREVIATION"][0],
                # Keep only the date part that precedes the "T" separator.
                player_info["BIRTHDATE"][0].split("T")[0],
                player_info["FROM_YEAR"][0],
                player_info["TO_YEAR"][0],
            ))
    except TypeError:
        # Presumably raised when a year value cannot be compared with an int
        # (e.g. it is missing); such players are skipped.
        continue

print("Completed Collection")

In [None]:
# Marks, for every game row, whether that row's team covered the spread
def team_covered(games_df, teams):
    """Add a "team_covered" column saying whether each row's team covered.

    For a row whose "TEAM_NAME" is one of `teams`, the team covered when it
    was the favorite and "favorite_covered" is 1, or when it was not the
    favorite and "underdog_covered" is 1.

    Args:
        games_df: DataFrame with "TEAM_NAME", "favorite", "favorite_covered"
            and "underdog_covered" columns; modified in place.
        teams: Iterable of team identifiers to evaluate, compared against
            "TEAM_NAME" and "favorite".

    Returns:
        The same `games_df`, with "team_covered" holding 1 or 0 for evaluated
        rows. Rows whose team is not in `teams` get no entry in the lookup and
        therefore come out as missing values.
    """
    games_df["team_covered"] = 0
    outcome_by_index = {}

    for abbrev in teams:
        team_rows = games_df[games_df["TEAM_NAME"] == abbrev]
        for row_label, row in team_rows.iterrows():
            # Pick the flag that applies to this team's role in the game.
            if row["favorite"] == abbrev:
                did_cover = row["favorite_covered"] == 1
            else:
                did_cover = row["underdog_covered"] == 1
            outcome_by_index[row_label] = 1 if did_cover else 0

    games_df["team_covered"] = games_df.index.map(outcome_by_index)

    return games_df

In [None]:
# Collect every team's games from September 1, 2017 onward and stack them into one frame
# This date was chosen because sports betting became legal in the NBA for the season starting after this date
all_games = pd.concat(get_all_games(start_year = "2017-9-1"))
all_games

In [None]:
# Keep only the columns needed downstream, replace full team names with their
# abbreviations, and remove the matchups listed in drop_games.
# NOTE(review): drop_games is not defined in any cell shown here - confirm where it comes from.
all_games_filtered_columns = (
    all_games[["TEAM_NAME", "GAME_DATE", "MATCHUP", "WL"]]
    .reset_index(drop=True)
    .assign(TEAM_NAME=lambda frame: frame["TEAM_NAME"].apply(lambda name: team_names[name]))
    .loc[lambda frame: ~frame["MATCHUP"].isin(drop_games["0"])]
)
all_games_filtered_columns

In [None]:
# Load the betting lines from S3, discard the leftover "Unnamed: 0" column and
# parse game_date (values that do not parse become NaT)
betting_lines = (
    read_csv_from_s3("bttj-final-s3", "api-data.csv")
    .drop(columns="Unnamed: 0")
    .assign(game_date=lambda frame: pd.to_datetime(frame["game_date"], format='%Y-%m-%d', errors="coerce"))
)

In [None]:
# Build the player birthday frame: attach each player's id by name, drop
# players without a team abbreviation, and renumber the rows
player_birthday_df = (
    pd.DataFrame(active_player_birthdays, columns = ["Name", "Team", "Birthday", "From_Year", "To_Year"])
    .merge(filtered_player_birthdays[["Name", "id"]], on = ["Name"])
    .loc[lambda frame: frame["Team"] != ""]
    .reset_index(drop=True)
)

player_birthday_df

In [None]:
# Attach each team's game record to the betting lines twice: once keyed on the
# visiting team and once on the home team. Rows with any missing value are dropped.
away_games = betting_lines.merge(
    all_games_filtered_columns,
    how = "left",
    left_on = ["game_date", "visit_team_abbrev"],
    right_on = ["GAME_DATE", "TEAM_NAME"],
).dropna()
home_games = betting_lines.merge(
    all_games_filtered_columns,
    how = "left",
    left_on = ["game_date", "home_team_abbrev"],
    right_on = ["GAME_DATE", "TEAM_NAME"],
).dropna()

# Stack home rows first, renumber, and drop the duplicate date column
merged_data = pd.concat([home_games, away_games], ignore_index=True).drop(columns=["GAME_DATE"])

In [None]:
# Flag games within one day of a player's birthday, then record whether each
# row's team covered the spread
birthday_games_df = mark_birthday_games(games_df=merged_data, players_df=player_birthday_df, day_range=1)
birthday_games_df = team_covered(
    birthday_games_df,
    birthday_games_df["home_team_abbrev"].unique(),
)

In [None]:
# Statistics
def chi_square_test_for_team(group):
    """Run a chi-square test of independence between covering and birthday games.

    Args:
        group: DataFrame with 'team_covered' and 'birthday_game' columns.

    Returns:
        Tuple (chi2 statistic, p-value, degrees of freedom, expected
        frequencies) as produced by scipy.stats.chi2_contingency on the
        cross-tabulation of the two columns.
    """
    observed = pd.crosstab(index=group['team_covered'], columns=group['birthday_game'])
    statistic, p_value, degrees_of_freedom, expected_freq = chi2_contingency(observed)
    return statistic, p_value, degrees_of_freedom, expected_freq

def t_test_for_team(group):
    """Compare win rates in birthday games against all other games.

    Args:
        group: DataFrame with a 'birthday_game' column (1 for birthday games,
            0 otherwise) and a 'WL' column holding "W" or "L".

    Returns:
        Tuple (t_statistic, p_value) from scipy.stats.ttest_ind, computed on
        the outcomes encoded as 1 for a win and 0 for a loss. Either value may
        be NaN when a sample is too small for the test.
    """
    # The encoding lives inside the function so it does not depend on a
    # notebook-level lookup being defined before this cell runs.
    outcome_codes = {"W": 1, "L": 0}
    encoded_outcomes = group['WL'].map(outcome_codes)

    # Rows with an unrecognised or missing outcome map to NaN and are dropped
    # so they cannot turn the whole test result into NaN.
    birthday_games = encoded_outcomes[group['birthday_game'] == 1].dropna()
    non_birthday_games = encoded_outcomes[group['birthday_game'] == 0].dropna()

    t_statistic, p_value = ttest_ind(birthday_games, non_birthday_games)
    return t_statistic, p_value

In [None]:
# Collect the significant findings as tuples; each tuple becomes one line of
# the results file.
full_results = []

# Group by 'TEAM_NAME' and apply chi-square test to check significance
results = birthday_games_df.groupby('TEAM_NAME').apply(chi_square_test_for_team)

# Record results for each team.
# Section headers are one-element tuples (note the trailing comma): a bare
# string would be split into single characters when the lines are written.
full_results.append(("Chi-Square",))
for team, result in results.items():
    chi2, p, dof, expected = result
    if p <= .1:
        full_results.append((team, "Chi-square Statistic:", chi2, "P-value:", p))

# Group by "Team Name" and apply t-test for each team
results = birthday_games_df.groupby('TEAM_NAME').apply(t_test_for_team)

# Record results for each team
full_results.append(("T-Test Team",))
for team, result in results.items():
    t_statistic, p_value = result
    if p_value <= .10:
        full_results.append(("Team:", team, "T-Statistic:", t_statistic, "P-value:", p_value))

# Group by "Team Name" and "Home Team abbrev (city)" and apply t-test for each team
results = birthday_games_df.groupby(['TEAM_NAME', 'home_team_abbrev']).apply(t_test_for_team)

# Record results for each team and city; infinite t-statistics are skipped
full_results.append(("T-Test Team and City",))
for (team, city), result in results.items():
    t_statistic, p_value = result
    if (p_value < .05) and (abs(t_statistic) != np.inf):
        full_results.append(("Team:", team, "City:", city, "T-Statistic:", t_statistic, "P-value:", p_value))

# Write results to s3
write_tuples_to_s3(full_results, "bttj-final-s3", "Results.txt")