In [1]:
import pandas as pd
import numpy as np
from functools import reduce
from nba_api.stats.endpoints import leaguegamelog
import helper_functions as hf
from IPython.display import clear_output
import os
import mysql.connector
from config import conn_host, conn_database, conn_user, conn_password

In [2]:
def connect_to_db():
    """Open and return a new MySQL connection.

    The host, database name, user and password are the values imported from
    the project's ``config`` module. Every call creates a fresh connection;
    closing it is the caller's responsibility.
    """
    connection_settings = {
        'host': conn_host,
        'database': conn_database,
        'user': conn_user,
        'password': conn_password,
    }
    return mysql.connector.connect(**connection_settings)

def execute_query(query, read_only = True):
    """Run a SQL statement against the project database.

    Parameters
    ----------
    query : str
        SQL text, executed verbatim (no parameter binding is performed here).
    read_only : bool, default True
        When True the statement is run through ``pd.read_sql_query`` and its
        result set is returned. When False the statement is executed on a
        cursor and committed.

    Returns
    -------
    pandas.DataFrame or None
        The query result for read-only statements. ``None`` for write
        statements, and also ``None`` when anything fails: errors are printed
        rather than raised, so callers should check for ``None``.
    """
    resp = None
    db = None
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db)
        else:
            mycursor = db.cursor()
            try:
                mycursor.execute(query)
                db.commit()
            finally:
                # Release the cursor even when execute/commit fails.
                mycursor.close()
    except Exception as e:
        # Best-effort contract: report the problem and fall through to return None.
        print(e)
    finally:
        # Close the connection on every path; previously a failing query left it open.
        if db is not None:
            try:
                db.close()
            except Exception as e:
                print(e)
    return resp

In [3]:
# Earliest season loaded from the database (lower bound in the SQL queries below).
# It is earlier than first_season, so older games are available as history.
first_data_season = 2002
# First season for which output rows are generated; also used in the output file name.
first_season = 2008
# Last season loaded from the database (upper bound in the SQL queries below).
last_season = 2022

# Number of most recent same-season games passed to hf.get_team_stats
# (used as `.iloc[-n_last_games:]` in get_team_previous_games).
n_last_games = 10
# NOTE(review): not referenced anywhere in the visible notebook - possibly meant for
# the head-to-head window, which is currently hard-coded; confirm before relying on it.
n_last_specific_games = 5

# Silences pandas' SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

In [4]:
# NOTE: execute_query prints the error and returns None when a query fails, so a
# failed load only surfaces later as an error on a None object.

# One row per game: home/away box-score totals, team names joined from `teams`,
# and betting odds, for seasons first_data_season..last_season, oldest game first.
season_games = execute_query(f"SELECT g.id, g.date, g.season, g.is_playoff, g.winner, g.home_id, ht.name as home_name, g.home_pts, g.home_fgm, g.home_fga, g.home_fg_pct, g.home_fg3m, g.home_fg3a, g.home_fg3_pct, g.home_ftm, g.home_fta, g.home_ft_pct, g.home_oreb, g.home_dreb, g.home_reb, g.home_ast, g.home_stl, g.home_blk, g.home_tov, g.home_pf, g.away_id, at.name as away_name, g.away_pts, g.away_fgm, g.away_fga, g.away_fg_pct, g.away_fg3m, g.away_fg3a, g.away_fg3_pct, g.away_ftm, g.away_fta, g.away_ft_pct, g.away_oreb, g.away_dreb, g.away_reb, g.away_ast, g.away_stl, g.away_blk, g.away_tov, g.away_pf, g.home_odds, g.away_odds FROM games AS g LEFT JOIN teams as ht ON g.home_id = ht.id LEFT JOIN teams as at ON g.away_id = at.id WHERE g.season >= {first_data_season} and g.season <= {last_season} ORDER BY g.date ASC")
# One row per player per game (box-score line plus the parent game's date, season,
# winner and team ids) over the same season range, oldest game first.
season_games_plyrs = execute_query(f"SELECT g.id as game_id, g.date, g.season, g.is_playoff, g.winner, g.home_id, g.away_id, pg.team_id, pg.player_id, pg.minutes, pg.pts, pg.fgm, pg.fga, pg.fg_pct, pg.fg3m, pg.fg3a, pg.fg3_pct, pg.ftm, pg.fta, pg.ft_pct, pg.oreb, pg.dreb, pg.reb, pg.ast, pg.stl, pg.blk, pg.tov, pg.pf, pg.plus_minus FROM playergames AS pg LEFT JOIN games as g on pg.game_id = g.id WHERE g.season >= {first_data_season} and g.season <= {last_season} ORDER BY g.date ASC")
# Full `teams` table; the columns id, name and abbreviation are the ones used in this notebook.
teams = execute_query(f"SELECT * FROM teams")



In [5]:
# Preview the first rows of the per-game table loaded above.
season_games.head()

Unnamed: 0,id,date,season,is_playoff,winner,home_id,home_name,home_pts,home_fgm,home_fga,...,away_oreb,away_dreb,away_reb,away_ast,away_stl,away_blk,away_tov,away_pf,home_odds,away_odds
0,20200001,2002-10-29,2002,0,H,1610612753,Orlando Magic,95,36,81,...,11,35,46,19,10,1,20,22,,
1,20200002,2002-10-29,2002,0,H,1610612758,Sacramento Kings,94,36,80,...,17,29,46,14,7,5,24,20,,
2,20200003,2002-10-29,2002,0,A,1610612747,Los Angeles Lakers,82,27,85,...,11,41,52,15,15,11,20,26,,
3,20200004,2002-10-30,2002,0,A,1610612738,Boston Celtics,96,36,77,...,12,35,47,29,6,5,18,19,,
4,20200005,2002-10-30,2002,0,H,1610612755,Philadelphia 76ers,95,31,86,...,6,29,35,19,11,6,17,25,,


In [6]:
# Preview the first rows of the per-player, per-game table loaded above.
season_games_plyrs.head()

Unnamed: 0,game_id,date,season,is_playoff,winner,home_id,away_id,team_id,player_id,minutes,...,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,plus_minus
0,20200001,2002-10-29,2002,0,H,1610612753,1610612755,1610612755,95,9,...,0.0,0,2,2,0,0,0,1,1,-4
1,20200001,2002-10-29,2002,0,H,1610612753,1610612755,1610612755,243,37,...,0.833,0,6,6,5,4,0,5,4,-1
2,20200001,2002-10-29,2002,0,H,1610612753,1610612755,1610612753,255,33,...,0.889,0,6,6,7,0,0,3,0,6
3,20200001,2002-10-29,2002,0,H,1610612753,1610612755,1610612753,270,14,...,0.0,0,1,1,1,1,0,0,1,2
4,20200001,2002-10-29,2002,0,H,1610612753,1610612755,1610612753,353,34,...,1.0,3,4,7,3,2,0,2,2,8


In [7]:
# Preview the first rows of the teams lookup table (id, name, abbreviation).
teams.head()

Unnamed: 0,id,name,abbreviation
0,1610612737,Atlanta Hawks,ATL
1,1610612738,Boston Celtics,BOS
2,1610612739,Cleveland Cavaliers,CLE
3,1610612740,New Orleans Pelicans,NOP
4,1610612741,Chicago Bulls,CHI


In [33]:
# Suffixes shared by the home_* / away_* columns of `season_games` that are
# re-labelled to team_* / opp_* when a game is viewed from one team's side.
_TEAM_COLUMN_SUFFIXES = (
    'id', 'name', 'pts', 'fgm', 'fga', 'fg_pct', 'fg3m', 'fg3a', 'fg3_pct',
    'ftm', 'fta', 'ft_pct', 'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk',
    'tov', 'pf', 'odds',
)


def _as_team_perspective(games, scenario):
    """Return a copy of `games` relabelled from one team's side ('H' = it was home, 'A' = away).

    home_*/away_* columns become team_*/opp_* and two columns are appended:
    `scenario` (the given flag) and `WL` ('W' when `winner` equals the flag, else 'L').
    """
    team_side, opp_side = ('home', 'away') if scenario == 'H' else ('away', 'home')
    column_map = {}
    for suffix in _TEAM_COLUMN_SUFFIXES:
        column_map[f'{team_side}_{suffix}'] = f'team_{suffix}'
        column_map[f'{opp_side}_{suffix}'] = f'opp_{suffix}'
    return (
        games
        .rename(columns=column_map)
        .assign(
            scenario=scenario,
            # Vectorised replacement for the former row-wise apply.
            WL=lambda frame: frame['winner'].eq(scenario).map({True: 'W', False: 'L'}),
        )
    )


def get_team_previous_games(season_games, season_games_plyrs, game_date, team_id, opp_id, teams_per, season, elo, scenario, game_id=None):
    """Build the pre-game feature list for one team from its earlier games.

    Parameters
    ----------
    season_games : pandas.DataFrame
        Per-game table with `date`, `season`, `winner` and home_*/away_* columns.
        It is not modified.
    season_games_plyrs : pandas.DataFrame
        Per-player game table, forwarded unchanged to ``hf.get_team_per_mean``.
    game_date
        Date of the game being described; only games strictly before it are used.
    team_id, opp_id
        Ids of the team being described and of its opponent in this game.
    teams_per : dict
        Mutated in place: ``teams_per[team_id]`` is overwritten with the value
        returned by ``hf.get_team_per_mean``.
    season
        Season of the game; season-level aggregates only use games of this season.
    elo
        The team's rating before the game, forwarded to ``hf.get_team_stats``.
    scenario : str
        'H' when the team plays this game at home, otherwise treated as away.
    game_id : optional
        Id of the game being described, forwarded to ``hf.get_team_per_mean``.
        When omitted, the notebook-level global ``game_id`` is used, which is what
        this function relied on implicitly before the parameter existed.

    Returns
    -------
    The value returned by ``hf.get_team_stats``, or ``None`` when the team has no
    earlier home game, no earlier away game, or fewer than ``n_last_games``
    earlier games in `season`.

    Raises
    ------
    NameError
        If `game_id` is not passed and no global ``game_id`` exists.
    """
    played_before = season_games['date'] < game_date
    home_previous_games = season_games.loc[(season_games['home_id'] == team_id) & played_before]
    away_previous_games = season_games.loc[(season_games['away_id'] == team_id) & played_before]

    if home_previous_games.empty or away_previous_games.empty:
        return None

    # Work on relabelled copies instead of renaming the `.loc` slices in place.
    home_previous_games = _as_team_perspective(home_previous_games, 'H')
    away_previous_games = _as_team_perspective(away_previous_games, 'A')

    previous_games = pd.concat(
        [home_previous_games, away_previous_games], axis=0, ignore_index=True
    ).sort_values('date')

    previous_season_games = previous_games.loc[previous_games['season'] == season]

    # Require a full window of same-season games (threshold tied to n_last_games
    # instead of a separate hard-coded 10).
    if len(previous_season_games.index) < n_last_games:
        return None

    last_n_games = previous_season_games.iloc[-n_last_games:, :]

    # Getting Previous A x B Matchups (across all loaded seasons).
    # NOTE(review): the window of 10 is hard-coded while `n_last_specific_games`
    # is defined but unused - confirm which size is intended.
    last_matchups = previous_games[previous_games['opp_id'] == opp_id].iloc[-10:, :]

    # Getting player information
    if game_id is None:
        # Backward-compatible fallback: read the notebook-level `game_id`.
        try:
            game_id = globals()['game_id']
        except KeyError:
            raise NameError("name 'game_id' is not defined") from None
    teams_per[team_id] = hf.get_team_per_mean(team_id, game_id, game_date, season, season_games_plyrs)

    # Season Win Percentage
    season_pct = hf.get_wl_pct(previous_season_games)[0]

    # H/A Win Percentage
    # NOTE(review): this uses every earlier home/away game in the loaded data,
    # not only those of `season` - confirm whether a season-only figure was intended.
    if scenario == 'H':
        ha_pct = hf.get_wl_pct(home_previous_games)[0]
    else:
        ha_pct = hf.get_wl_pct(away_previous_games)[0]

    # Matchup Win Percentage
    matchup_pct = hf.get_wl_pct(last_matchups)[0]

    # Calculating Current Streak
    streak = hf.current_streak(previous_season_games)

    stats_team = hf.get_team_stats(last_n_games, season_pct, teams_per[team_id], elo, matchup_pct, ha_pct, streak)

    return stats_team

In [34]:
def get_match_info(game_info, stats_team_a, stats_team_b, winner):
    """Assemble one flat output row for a game.

    The row is `game_info`, then `stats_team_a`, then `stats_team_b`, followed by
    a single label: 1 when `winner == 'A'`, otherwise 0. The inputs are not
    modified; a new list is returned.

    NOTE(review): in this notebook team A is the home side while `winner` holds
    'H'/'A', so the label is 1 when the away side (team B) won - confirm this is
    the intended target encoding.
    """
    features = game_info + stats_team_a + stats_team_b
    if winner == 'A':
        label = 1
    else:
        label = 0
    return features + [label]

In [35]:
print("Creating CSV file of all games...")

# Per-team state carried across the chronological pass over the games:
# teams_per[team] is overwritten inside get_team_previous_games with the value
# from hf.get_team_per_mean; elo_dic[team] is the rating maintained through
# hf.update_elo / hf.reset_season_elo, starting at 1500.
team_ids = teams['id'].tolist()
teams_per = dict.fromkeys(team_ids, 0)
elo_dic = dict.fromkeys(team_ids, 1500)

# id -> abbreviation lookup built once instead of scanning `teams` twice per game.
# The first row wins on duplicate ids, matching the former `.iloc[0]` lookup.
# An unknown team id now raises KeyError rather than IndexError.
team_abbreviations = teams.drop_duplicates(subset='id').set_index('id')['abbreviation'].to_dict()

matches_organized = []
matches_organized_playoffs = []
season = ''  # '' marks "no game processed yet"

# Only games from first_season onwards produce rows; earlier seasons stay in
# `season_games` so they remain available as history.
season_games_iterr = season_games.loc[season_games['season'] >= first_season].reset_index(drop=True)
total_games = len(season_games_iterr.index)

for i, g in season_games_iterr.iterrows():
    clear_output(wait=True)
    print(f"{i}/{total_games}")

    # Ratings are reset through the helper whenever the season changes.
    if season != '' and season != g['season']:
        hf.reset_season_elo(elo_dic)

    season = g['season']

    is_playoffs = g['is_playoff']

    # get_team_previous_games picks `game_id` up from the notebook namespace (it is
    # not passed as an argument), so it must be assigned before the calls below.
    game_id = g['id']
    game_date = g['date']

    # Team A is always the home side, team B the away side.
    team_a_id = g['home_id']
    team_b_id = g['away_id']

    team_a_abbv = team_abbreviations[team_a_id]
    team_b_abbv = team_abbreviations[team_b_id]

    winner = g['winner']

    # Ratings are read before the game so the features use pre-game values;
    # the update happens at the end of the iteration.
    team_a_pts = g['home_pts']
    team_b_pts = g['away_pts']
    elo_a = elo_dic[team_a_id]
    elo_b = elo_dic[team_b_id]

    team_a_odds, team_b_odds = g['home_odds'], g['away_odds']

    # NOTE(review): the `continue` statements below also skip hf.update_elo, so games
    # without enough history never affect the ratings - confirm this is intended.
    stats_team_a = get_team_previous_games(season_games, season_games_plyrs, game_date, team_a_id, team_b_id, teams_per, season, elo_a, 'H')
    if not stats_team_a:
        continue

    stats_team_b = get_team_previous_games(season_games, season_games_plyrs, game_date, team_b_id, team_a_id, teams_per, season, elo_b, 'A')
    if not stats_team_b:
        continue

    match_info = get_match_info([season, game_date, team_a_abbv, team_b_abbv, team_a_odds, team_b_odds], stats_team_a, stats_team_b, winner)
    if is_playoffs:
        matches_organized_playoffs.append(match_info)
    else:
        matches_organized.append(match_info)

    hf.update_elo(winner, elo_a, elo_b, elo_dic, team_a_id, team_b_id, team_a_pts, team_b_pts)

17828/17829


In [36]:
# Report how many regular-season and playoff rows were produced by the loop above.
print(f"Total matches: {len(matches_organized)}\nTotal playoffs matches: {len(matches_organized_playoffs)}")

Total matches: 14469
Total playoffs matches: 1171


In [37]:
def parse_df_to_csv(dataframe, columns, path, filename):
    """Write tabular rows to ``<path>/<filename>`` as CSV.

    Parameters
    ----------
    dataframe
        Row data accepted by ``pd.DataFrame`` (this notebook passes a list of
        row lists).
    columns : list of str
        Column names, in row order.
    path : str
        Target directory; created (including parents) when missing. An empty
        string writes into the current working directory.
    filename : str
        Name of the CSV file inside `path`.

    The DataFrame index is written as the first, unnamed CSV column.
    """
    if path:
        # exist_ok removes the window between checking for the directory and creating it.
        os.makedirs(path, exist_ok=True)
    final_df = pd.DataFrame(dataframe, columns=columns)
    final_df.to_csv(os.path.join(path, filename))

In [38]:
# Layout of one exported row: game identifiers and odds, the feature block for
# team A, the same block for team B, then the label.
game_columns = ['SEASON_ID', 'GAME_DATE', 'TEAM_A', 'TEAM_B', 'ODDS_A', 'ODDS_B']
team_a_columns = ['PTS_A', 'PTS_CON_A', 'FG_PCT_A', 'FG3_PCT_A', 'FT_PCT_A', 'REB_A', 'TOV_A',
                  'SEASON_A_PCT', 'PER_A', 'ELO_A', 'MATCHUP_A_PCT', 'HA_A_PCT', 'STREAK_A']
team_b_columns = ['PTS_B', 'PTS_CON_B', 'FG_PCT_B', 'FG3_PCT_B', 'FT_PCT_B', 'REB_B', 'TOV_B',
                  'SEASON_B_PCT', 'PER_B', 'ELO_B', 'MATCHUP_B_PCT', 'HA_B_PCT', 'STREAK_B']
columns = game_columns + team_a_columns + team_b_columns + ['WINNER']

# Both exports share one file name; regular-season and playoff rows go to separate folders.
# NOTE(review): the name ends at last_season - 1 although the queries load seasons
# up to last_season - confirm the intended label.
output_filename = f"{first_season}-{last_season - 1}.csv"
parse_df_to_csv(matches_organized, columns, '../data', output_filename)
parse_df_to_csv(matches_organized_playoffs, columns, '../data/playoffs', output_filename)