In [1]:
import pandas as pd
import numpy as np
from functools import reduce
import helper_functions as hf
from IPython.display import clear_output
import os
import mysql.connector
from config import conn_host, conn_database, conn_user, conn_password
import json

In [2]:
def connect_to_db():
    """Open and return a new MySQL connection built from the `config` settings."""
    credentials = {
        "host": conn_host,
        "database": conn_database,
        "user": conn_user,
        "password": conn_password,
    }
    return mysql.connector.connect(**credentials)

def execute_query(query, read_only = True):
    """Run a SQL statement against the project database.

    Parameters
    ----------
    query : str
        SQL text to execute.
    read_only : bool, default True
        When True the statement is run through ``pd.read_sql_query`` and its
        result is returned. When False the statement is executed on a cursor
        and committed; nothing is returned.

    Returns
    -------
    The object produced by ``pd.read_sql_query`` for read-only queries,
    otherwise ``None``. ``None`` is also returned when any step fails: the
    error is printed rather than raised (best-effort behaviour kept on purpose).
    """
    resp = None
    db = None
    try:
        db = connect_to_db()
        if read_only:
            resp = pd.read_sql_query(query, db)
        else:
            mycursor = db.cursor()
            try:
                mycursor.execute(query)
                db.commit()
            finally:
                # Release the cursor even when execute/commit raises.
                mycursor.close()
    except Exception as e:
        print(e)
    finally:
        # Always release the connection; previously a failing query skipped
        # db.close() and leaked the connection.
        if db is not None:
            try:
                db.close()
            except Exception as e:
                print(e)
    return resp

In [3]:
# Inclusive season range: interpolated into the SQL filters and the output CSV name.
first_season, last_season = 2012, 2022

# Window sizes. n_last_games bounds the recent-form slice and
# n_last_specific_games bounds the home-only / away-only slice;
# totals_n_last_games is not referenced by the cells shown in this notebook.
totals_n_last_games = 10
n_last_games = 10
n_last_specific_games = 5

pd.options.mode.chained_assignment = None  # default='warn'

In [4]:
# Games for the configured season range, oldest first.
season_games = execute_query(
    f"SELECT * from `nba-data`.games "
    f"WHERE season >= {first_season} and season <= {last_season} "
    f"ORDER BY date ASC;"
)

# One row per player per game, joined to the game's metadata, oldest first.
season_games_plyrs = execute_query(
    "SELECT g.id as game_id, g.date, g.season, g.is_playoff, g.winner, g.home_id, g.away_id, "
    "pg.team_id, pg.player_id, pg.minutes, pg.pts, "
    "pg.fgm, pg.fga, pg.fg_pct, pg.fg3m, pg.fg3a, pg.fg3_pct, pg.ftm, pg.fta, pg.ft_pct, "
    "pg.oreb, pg.dreb, pg.reb, pg.ast, pg.stl, pg.blk, pg.tov, pg.pf, pg.plus_minus "
    "FROM player_games AS pg LEFT JOIN games as g on pg.game_id = g.id "
    f"WHERE g.season >= {first_season} and g.season <= {last_season} "
    "ORDER BY g.date ASC"
)

# Team lookup table.
teams = execute_query("SELECT * FROM teams")



In [5]:
# Add per-game offensive/defensive ratings for each side. The helper receives
# the row plus the side code ('H' or 'A') through `args`; columns are created
# in the same order as before (home off, home def, away off, away def).
season_games['home_off_rtg'] = season_games.apply(hf.get_team_offensive_rating_game, axis=1, args=('H',))
season_games['home_def_rtg'] = season_games.apply(hf.get_team_defensive_rating_game, axis=1, args=('H',))

season_games['away_off_rtg'] = season_games.apply(hf.get_team_offensive_rating_game, axis=1, args=('A',))
season_games['away_def_rtg'] = season_games.apply(hf.get_team_defensive_rating_game, axis=1, args=('A',))

In [6]:
# Preview the games frame; the rendered output includes the four *_off_rtg / *_def_rtg columns.
season_games

Unnamed: 0,id,date,season,is_playoff,winner,home_id,home_team,home_pts,home_fgm,home_fga,...,away_stl,away_blk,away_tov,away_pf,home_odds,away_odds,home_off_rtg,home_def_rtg,away_off_rtg,away_def_rtg
0,21200001,2012-10-30,2012,0,H,1610612739,Cleveland Cavaliers,94,36,79,...,11,10,13,19,1.41,2.93,103.524229,92.511013,90.322581,101.075269
1,21200002,2012-10-30,2012,0,H,1610612748,Miami Heat,120,43,79,...,4,2,16,23,1.35,3.26,126.582278,112.869198,112.394958,126.050420
2,21200003,2012-10-30,2012,0,A,1610612747,Los Angeles Lakers,91,38,77,...,9,5,12,25,1.21,4.54,102.941176,111.990950,103.991597,95.588235
3,21200004,2012-10-31,2012,0,A,1610612761,Toronto Raptors,88,33,91,...,3,10,19,16,2.03,1.79,94.017094,96.153846,95.338983,93.220339
4,21200005,2012-10-31,2012,0,H,1610612755,Philadelphia 76ers,84,30,85,...,9,5,22,22,2.02,1.8,88.050314,78.616352,76.219512,85.365854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14126,42200401,2023-06-01,2022,1,H,1610612743,Denver Nuggets,104,40,79,...,5,4,8,15,1.28,3.86,114.285714,102.197802,99.147122,110.874200
14127,42200402,2023-06-04,2022,1,A,1610612743,Denver Nuggets,108,39,75,...,5,4,11,22,1.3,3.69,121.621622,125.000000,124.719101,121.348315
14128,42200403,2023-06-07,2022,1,A,1610612748,Miami Heat,94,34,92,...,3,5,14,18,2.38,1.62,100.427350,116.452991,118.736383,102.396514
14129,42200404,2023-06-09,2022,1,A,1610612748,Miami Heat,95,35,78,...,11,7,8,18,2.22,1.7,102.150538,116.129032,119.469027,105.088496


In [7]:
# Preview the per-player game rows loaded from the player_games/games join.
season_games_plyrs

Unnamed: 0,game_id,date,season,is_playoff,winner,home_id,away_id,team_id,player_id,minutes,...,ft_pct,oreb,dreb,reb,ast,stl,blk,tov,pf,plus_minus
0,21200001,2012-10-30,2012,0,H,1610612739,1610612764,1610612739,203079,28,...,0.75,0,2,2,0,3,0,3,0,13
1,21200001,2012-10-30,2012,0,H,1610612739,1610612764,1610612739,202681,35,...,0.80,0,6,6,3,0,1,4,4,23
2,21200001,2012-10-30,2012,0,H,1610612739,1610612764,1610612764,2731,25,...,0.50,5,2,7,0,0,4,1,1,-5
3,21200001,2012-10-30,2012,0,H,1610612739,1610612764,1610612739,2760,37,...,1.00,12,11,23,9,0,2,1,4,7
4,21200001,2012-10-30,2012,0,H,1610612739,1610612764,1610612739,2575,12,...,0.00,0,1,1,0,0,0,2,0,-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298802,42200405,2023-06-12,2022,1,H,1610612743,1610612748,1610612748,1629216,22,...,0.00,0,0,0,3,1,0,0,2,-14
298803,42200405,2023-06-12,2022,1,H,1610612743,1610612748,1610612748,1629312,3,...,0.00,0,1,1,0,0,0,0,1,0
298804,42200405,2023-06-12,2022,1,H,1610612743,1610612748,1610612748,1629622,32,...,1.00,1,7,8,1,0,1,1,3,2
298805,42200405,2023-06-12,2022,1,H,1610612743,1610612748,1610612748,200768,34,...,0.00,2,7,9,4,2,1,1,5,5


In [8]:
# Show the first rows of the teams lookup table (id, name, abbreviation).
teams.head()

Unnamed: 0,id,name,abbreviation
0,1610612737,Atlanta Hawks,ATL
1,1610612738,Boston Celtics,BOS
2,1610612739,Cleveland Cavaliers,CLE
3,1610612740,New Orleans Pelicans,NOP
4,1610612741,Chicago Bulls,CHI


In [9]:
# Map team id -> contents of that team's ELO JSON file. Later code indexes the
# loaded object by a stringified game date.
teams_elo_dict = dict()
teams_elo_path = 'teams_elo'

for _row_idx, team in teams.iterrows():
    # `with` closes each file as soon as it is parsed; the previous version
    # opened one handle per team and never closed any of them.
    with open(f"{teams_elo_path}/{team['abbreviation']}.json") as elo_file:
        teams_elo_dict[team['id']] = json.load(elo_file)

In [10]:
def get_match_info(game_info, stats_team_a, stats_team_b, winner, team_a_pts, team_b_pts):
    """Assemble one dataset row: game info, team A stats, team B stats, then the outcome.

    The pieces are joined with ``+`` from left to right, so the inputs
    themselves are not modified.
    """
    outcome = [winner, team_a_pts, team_b_pts]
    row = game_info + stats_team_a
    row = row + stats_team_b
    return (row + outcome)

def get_team_previous_games(season_games, season_games_plyrs, game_date, team_id, opp_id, teams_per, season, scenario, game_id=None):
    """Build the pre-game feature set for one team in one matchup.

    Parameters
    ----------
    season_games
        Games frame forwarded to ``hf.get_team_previous_games``.
    season_games_plyrs
        Per-player game rows forwarded to ``hf.get_team_per_mean``.
    game_date
        Date of the game being described.
    team_id, opp_id
        Ids of the team being described and of its opponent.
    teams_per : dict
        Mutated in place: ``teams_per[team_id]`` is overwritten with the value
        returned by ``hf.get_team_per_mean``.
    season
        Season identifier forwarded to the helpers.
    scenario
        ``'H'`` selects the home-only splits; any other value selects the
        away-only splits.
    game_id : optional
        Id of the game being described. When omitted, the notebook-level
        ``game_id`` global is used, which is what this function always relied
        on implicitly. Prefer passing it explicitly.

    Returns
    -------
    The value returned by ``hf.get_team_stats``, or ``None`` when the helper
    returns nothing or ``previous_season_games`` has fewer than 10 rows.

    Raises
    ------
    NameError
        If ``game_id`` is neither passed nor defined at notebook level.
    KeyError
        If ``teams_elo_dict`` has no entry for the team or for the date of its
        most recent game.
    """
    response = hf.get_team_previous_games(season_games, team_id, game_date, season)
    if not response:
        return None

    home_previous_games, away_previous_games, previous_games, previous_season_games, home_previous_season_games, away_previous_season_games = response

    # NOTE(review): threshold is a literal 10, independent of n_last_games — confirm intent.
    if len(previous_season_games.index) < 10:
        return None

    last_n_games = previous_season_games.iloc[-n_last_games:,:]

    # Get last game ELO (the ELO mapping is keyed by the stringified date)
    last_game_date = str(previous_season_games.iloc[-1,:]['date'])
    elo = teams_elo_dict[team_id][last_game_date]

    # Last n games pct
    pct_last_n_games = hf.get_wl_pct(last_n_games)[0]

    # Getting Previous A x B Matchups (at most the 10 latest; literal, see note above)
    last_matchups = previous_games[previous_games['opp_id'] == opp_id].iloc[-10:,:]

    # Getting player information
    if game_id is None:
        # Backward compatibility: callers that do not pass game_id get the
        # notebook-level value, exactly as before this parameter existed.
        if 'game_id' not in globals():
            raise NameError("name 'game_id' is not defined; pass game_id explicitly")
        game_id = globals()['game_id']
    teams_per[team_id] = hf.get_team_per_mean(team_id, game_id, game_date, season, season_games_plyrs)

    # Season Win Percentage
    season_pct = hf.get_wl_pct(previous_season_games)[0]

    # Last n/2 games pct and Season H/A Win Percentage
    if scenario == 'H':
        ha_pct_last_n_games = hf.get_wl_pct(home_previous_season_games.iloc[-n_last_specific_games:,:])[0]
        ha_pct = hf.get_wl_pct(home_previous_season_games)[0]
    else:
        ha_pct_last_n_games = hf.get_wl_pct(away_previous_season_games.iloc[-n_last_specific_games:,:])[0]
        ha_pct = hf.get_wl_pct(away_previous_season_games)[0]

    # Matchup Win Percentage
    matchup_pct = hf.get_wl_pct(last_matchups)[0]

    # Calculating Current Streak
    streak = hf.current_streak(previous_season_games)

    stats_team = hf.get_team_stats (last_n_games, season_pct, teams_per[team_id], elo, matchup_pct, ha_pct, streak, pct_last_n_games, ha_pct_last_n_games)

    return stats_team

In [11]:
print("Creating CSV file of all games...")

teams_per = dict()

matches_organized = []
matches_organized_playoffs = []
season = ''

# The frame is filtered to season >= first_season here, so the loop needs no
# per-row season check.
season_games_iterr = season_games.loc[season_games['season'] >= first_season].reset_index(drop=True)
total_games = len(season_games_iterr.index)

# Build the id -> abbreviation lookup once instead of scanning `teams` twice
# per game. The first occurrence of an id wins, as with the former .iloc[0].
team_abbv_by_id = teams.drop_duplicates(subset='id').set_index('id')['abbreviation'].to_dict()

for i, g in season_games_iterr.iterrows():
    clear_output(wait=True)
    print(f"{i}/{total_games}")

    season = g['season']

    is_playoffs = int(g['is_playoff'])

    # game_id must remain a notebook-level name: get_team_previous_games reads it.
    game_id = g['id']
    game_date = g['date']

    # Team A is the home side, team B the away side.
    team_a_id = g['home_id']
    team_b_id = g['away_id']

    team_a_abbv = team_abbv_by_id[team_a_id]
    team_b_abbv = team_abbv_by_id[team_b_id]

    winner = g['winner']

    # Final score, stored as the outcome columns of the row
    team_a_pts = g['home_pts']
    team_b_pts = g['away_pts']

    team_a_odds, team_b_odds = g['home_odds'], g['away_odds']

    # Skip games with falsy odds on either side
    if not team_a_odds or not team_b_odds:
        continue

    stats_team_a = get_team_previous_games(season_games, season_games_plyrs, game_date, team_a_id, team_b_id, teams_per, season, 'H')
    if not stats_team_a:
        continue

    stats_team_b = get_team_previous_games(season_games, season_games_plyrs, game_date, team_b_id, team_a_id, teams_per, season, 'A')
    if not stats_team_b:
        continue

    match_info = get_match_info([season, game_date, team_a_abbv, team_b_abbv, team_a_odds, team_b_odds], stats_team_a, stats_team_b, winner, team_a_pts, team_b_pts)
    # Playoff games are collected separately from the other games
    if is_playoffs:
        matches_organized_playoffs.append(match_info)
    else:
        matches_organized.append(match_info)

14130/14131


In [12]:
# Report how many rows ended up in each of the two result lists.
print(
    "Total matches: {}\nTotal playoffs matches: {}".format(
        *map(len, (matches_organized, matches_organized_playoffs))
    )
)

Total matches: 11490
Total playoffs matches: 918


In [13]:
def parse_df_to_csv(dataframe, columns, path, filename):
    """Build a DataFrame from row data, write it to ``path/filename`` as CSV, and return it.

    Parameters
    ----------
    dataframe
        Row data accepted by ``pd.DataFrame`` (the notebook passes a list of lists).
    columns
        Column labels for the resulting frame.
    path : str
        Target directory; created, including parents, when it does not exist.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The frame that was written. The CSV is written with ``to_csv``
        defaults, so the index is included in the file.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(path, exist_ok=True)
    final_df = pd.DataFrame(dataframe, columns=columns)
    final_df.to_csv("{}/{}".format(path, filename))
    return final_df

In [14]:
# Column labels for the exported rows, in the order get_match_info concatenates
# them: 6 game-info fields, 17 team A features, 17 team B features, 3 outcome
# fields. Team A is the home side and team B the away side.
# NOTE(review): the 17 per-team labels presumably follow the order of the list
# returned by hf.get_team_stats — verify against the helper.
columns = ['SEASON_ID', 'GAME_DATE', 'TEAM_A', 'TEAM_B', 'ODDS_A', 'ODDS_B',
           'PTS_A', 'PTS_CON_A', 'FG_PCT_A', 'FG3_PCT_A', 'FT_PCT_A', 'REB_A', 'TOV_A', 'SEASON_A_PCT', 'PER_A', 'ELO_A', 'MATCHUP_A_PCT', 'HA_A_PCT', 'STREAK_A', 'LAST_GAMES_PCT_A', 'HA_LAST_GAMES_PCT_A', 'OFF_RTG_A', 'DEF_RTG_A',
           'PTS_B', 'PTS_CON_B', 'FG_PCT_B', 'FG3_PCT_B', 'FT_PCT_B', 'REB_B', 'TOV_B', 'SEASON_B_PCT', 'PER_B', 'ELO_B', 'MATCHUP_B_PCT', 'HA_B_PCT', 'STREAK_B', 'LAST_GAMES_PCT_B', 'HA_LAST_GAMES_PCT_B', 'OFF_RTG_B', 'DEF_RTG_B',
           'WINNER', 'GAME_PTS_A', 'GAME_PTS_B']
# Non-playoff rows go to ../data; the resulting frame is kept as final_df.
final_df = parse_df_to_csv(matches_organized, columns, '../data', '{}-{}.csv'.format(first_season, last_season))
# Playoff rows go to ../data/playoffs; as the last expression, the returned
# frame is what the cell displays.
parse_df_to_csv(matches_organized_playoffs, columns, '../data/playoffs', '{}-{}.csv'.format(first_season, last_season))

Unnamed: 0,SEASON_ID,GAME_DATE,TEAM_A,TEAM_B,ODDS_A,ODDS_B,PTS_A,PTS_CON_A,FG_PCT_A,FG3_PCT_A,...,MATCHUP_B_PCT,HA_B_PCT,STREAK_B,LAST_GAMES_PCT_B,HA_LAST_GAMES_PCT_B,OFF_RTG_B,DEF_RTG_B,WINNER,GAME_PTS_A,GAME_PTS_B
0,2012,2013-04-20,NYK,BOS,1.27,3.82,103.8,96.1,0.4755,0.3965,...,0.25,0.341463,-1,0.4,0.2,107.244769,109.169321,H,85,78
1,2012,2013-04-20,BKN,CHI,1.49,2.66,103.3,97.7,0.4792,0.3665,...,0.75,0.512195,-2,0.5,0.4,103.291334,104.509349,H,106,89
2,2012,2013-04-20,DEN,GSW,1.26,3.98,109.7,100.3,0.4932,0.3244,...,0.25,0.463415,-2,0.6,0.6,109.127075,104.646215,H,97,95
3,2012,2013-04-20,LAC,MEM,1.41,2.97,102.7,96.7,0.4901,0.3789,...,0.25,0.585366,-2,0.8,0.8,103.376255,94.728956,H,112,91
4,2012,2013-04-21,MIA,MIL,1.04,11.92,98.5,91.7,0.4869,0.4180,...,0.25,0.414634,-1,0.3,0.2,101.229212,102.570879,H,110,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,2022,2023-06-01,DEN,MIA,1.28,3.86,117.5,109.4,0.4954,0.3803,...,0.10,0.450980,-1,0.6,0.6,112.661927,109.783116,H,104,93
914,2022,2023-06-04,DEN,MIA,1.3,3.69,115.4,108.0,0.4985,0.3667,...,0.10,0.442308,-1,0.5,0.6,110.956170,110.102945,A,108,111
915,2022,2023-06-07,MIA,DEN,2.38,1.62,106.4,104.1,0.4653,0.4091,...,0.80,0.479167,-1,0.7,0.6,121.162081,114.817795,A,94,109
916,2022,2023-06-09,MIA,DEN,2.22,1.7,106.2,105.8,0.4621,0.4146,...,0.80,0.489796,-1,0.8,0.8,121.902907,113.241040,A,95,108
