In [27]:
from functools import reduce
from pathlib import Path

import numpy as np
import pandas as pd
from nba_api.stats.endpoints import leaguegamelog
from nba_api.stats.static import teams
# Second alias for the same module: a later cell rebinds the name `teams` to the
# list returned by get_teams(), so `nba_teams` keeps the module reachable when
# cells are re-run.
from nba_api.stats.static import teams as nba_teams

import helper_functions as hf

In [28]:
# Fetch the team and player game logs for the configured seasons and normalise
# them into two frames: `season_games` (one row per team per game) and
# `season_games_plyrs` (one row per player per game).
pd.options.mode.chained_assignment = None  # default='warn'

# Use the module alias here: the name `teams` is rebound to the list of team
# dicts, so calling `teams.get_teams()` would fail the second time this cell runs.
teams = nba_teams.get_teams()

# Latest per-team PER value, keyed by team id; filled in by the feature loop.
teams_per = dict()

for team in teams:
    team_id = team['id']
    teams_per[team_id] = 0

seasons_teams = []
seasons_players = []
first_season = 2020
last_season = 2021  # exclusive upper bound of the range() below
# Compared against int(SEASON_ID) in the feature loop.
# NOTE(review): presumably the API prefixes regular-season ids with "2" — confirm.
first_season_id = 20000 + first_season

print("Getting NBA Seasons Information...")
for i in range(first_season,last_season):
    season_i_teams = leaguegamelog.LeagueGameLog(season = str(i)).get_data_frames()[0]
    season_i_players = leaguegamelog.LeagueGameLog(season = str(i), player_or_team_abbreviation = 'P').get_data_frames()[0]
    seasons_teams.append(season_i_teams)
    seasons_players.append(season_i_players)
    print("{}/{}".format(i,last_season))


dfs = []

print("Cleaning the data...")

# Stack the seasons in fetch order. The frames share one schema, so this is a
# concatenation, not a join: an outer pd.merge on every shared column is slower
# and sorts the join keys, while the feature loop depends on the original row
# order (adjacent rows per game, chronological "last N games" slices).
season_games = pd.concat(seasons_teams, ignore_index=True)
season_games_plyrs = pd.concat(seasons_players, ignore_index=True)

# Drop games with missing shooting percentages. Both team rows of such a game
# are removed and the index is rebuilt, because the feature loop pairs rows two
# by two; dropping a single row would orphan its partner and leave index gaps.
pct_columns = ['FG_PCT','FT_PCT','FG3_PCT']
incomplete_game_ids = season_games.loc[season_games[pct_columns].isna().any(axis=1), 'GAME_ID'].unique()
season_games = season_games.loc[~season_games['GAME_ID'].isin(incomplete_game_ids)].reset_index(drop=True)

# Normalise key dtypes so ids compare numerically and dates chronologically.
season_games_plyrs['GAME_ID'] = pd.to_numeric(season_games_plyrs['GAME_ID'])
season_games['GAME_ID'] = pd.to_numeric(season_games['GAME_ID'])
season_games['GAME_DATE'] = pd.to_datetime(season_games['GAME_DATE'])
season_games_plyrs['GAME_DATE'] = pd.to_datetime(season_games_plyrs['GAME_DATE'])

print('size', len(season_games.index))

Getting NBA Seasons Information...
2020/2021
Cleaning the data...
size 2160


In [29]:
print("Initializing ELOs...")

# Every team starts from the same baseline rating of 1500, keyed by team id.
elo_dic = {franchise['id']: 1500 for franchise in teams}

# Row accumulators for the three datasets built by the feature loop.
matches_organized, matches_organized_lstm, matches_organized_regression = [], [], []

# Empty string means "no season processed yet"; the feature loop compares
# against it before resetting ELOs at a season change.
season_id = ''

print('Getting historical odds...')
odds = hf.load_bets_csv()

Initializing ELOs...
Getting historical odds...


In [30]:
# Walk every game in order and build three row lists:
#   * matches_organized            - pre-game features for both teams + winner flag
#   * matches_organized_regression - pre-game features for both teams + final scores
#   * matches_organized_lstm       - one row per team per game (in-game stats + context)
# ELO ratings are read before a game's rows are emitted and only updated
# afterwards, so the features never contain the result of the game itself.
print("Creating CSV file of all games...")

# Loop-invariant game count, used only for the progress line.
total_games = len(season_games.index) // 2

# Each game contributes two rows (one per team) sharing a GAME_ID. Grouping on
# GAME_ID (sort=False keeps the frame's row order) pairs them explicitly instead
# of assuming a gap-free index with both partners on adjacent rows.
for i, (_game_key, g) in enumerate(season_games.groupby('GAME_ID', sort=False)):
    print("{}/{}".format(i, total_games))

    # "Team A" is whichever team's row comes first within the game.
    row_a = g.iloc[0]

    # A missing W/L flag is treated as "no result available": stop here.
    if row_a['WL'] is None:
        break

    # Both team rows are required; skip a game whose partner row is missing
    # rather than failing on the second-row lookup.
    if len(g.index) != 2:
        print("Skipping game {}: expected 2 team rows, found {}.".format(row_a['GAME_ID'], len(g.index)))
        continue

    row_b = g.iloc[1]

    # Season rollover: let the helper adjust the ratings before the new season.
    if season_id != '' and season_id != row_a['SEASON_ID']:
        hf.reset_season_elo(season_id, g, elo_dic)

    season_id = row_a['SEASON_ID']
    game_id = row_a['GAME_ID']
    game_date = row_a['GAME_DATE']

    team_a_id = row_a['TEAM_ID']
    team_b_id = row_b['TEAM_ID']

    team_a_abbv = row_a['TEAM_ABBREVIATION']
    team_b_abbv = row_b['TEAM_ABBREVIATION']

    winner = 'A' if row_a['WL'] == 'W' else 'B'

    # An '@' in team A's MATCHUP is taken to mean team A played away. Every
    # paired output below is ordered with the non-'@' team first.
    team_a_is_away = '@' in row_a['MATCHUP']

    if team_a_is_away:
        team_b_odds, team_a_odds = hf.get_teams_odds(team_b_id, team_a_id, game_date, odds)
    else:
        team_a_odds, team_b_odds = hf.get_teams_odds(team_a_id, team_b_id, game_date, odds)

    # Games strictly before this one (any season), then narrowed to this season.
    team_a_previous_games = season_games.loc[(season_games['TEAM_ID'] == team_a_id) & (season_games['GAME_DATE'] < game_date)]
    team_b_previous_games = season_games.loc[(season_games['TEAM_ID'] == team_b_id) & (season_games['GAME_DATE'] < game_date)]
    team_a_season_games = team_a_previous_games.loc[team_a_previous_games['SEASON_ID'] == season_id]
    team_b_season_games = team_b_previous_games.loc[team_b_previous_games['SEASON_ID'] == season_id]

    # Getting teams last 10 games
    team_a_previous_10_games = team_a_season_games.iloc[-10:,:]
    team_b_previous_10_games = team_b_season_games.iloc[-10:,:]

    # Getting Home/Away information: each team's last 10 games played in the
    # same situation ('@' or not) as in the current game.
    if team_a_is_away:
        team_a_last_ha_games = team_a_season_games[team_a_season_games['MATCHUP'].str.contains('@')].iloc[-10:,:]
        team_b_last_ha_games = team_b_season_games[~team_b_season_games['MATCHUP'].str.contains('@')].iloc[-10:,:]
    else:
        team_a_last_ha_games = team_a_season_games[~team_a_season_games['MATCHUP'].str.contains('@')].iloc[-10:,:]
        team_b_last_ha_games = team_b_season_games[team_b_season_games['MATCHUP'].str.contains('@')].iloc[-10:,:]

    # Getting Previous A x B Matchups (from team A's rows, any season)
    last_matchups = team_a_previous_games[team_a_previous_games['MATCHUP'].str.contains(team_a_abbv) & 
                                          team_a_previous_games['MATCHUP'].str.contains(team_b_abbv)].iloc[-10:,:]

    # Sanity check against leakage: the most recent "previous" game must never
    # be the current game. The message is Portuguese for "they are equal".
    if len(team_a_previous_games.index) > 0:
        if team_a_previous_games.iloc[-1]['GAME_ID'] == game_id:
            print('São iguais', i*2)
            break

    # Pre-game ratings; the ELO update happens only after the rows are emitted.
    team_a_pts = row_a['PTS']
    team_b_pts = row_b['PTS']
    elo_a = elo_dic[team_a_id]
    elo_b = elo_dic[team_b_id]

    # Minimum history required before a game is emitted as a training row.
    has_enough_history = (
        int(season_id) >= first_season_id
        and len(team_a_previous_10_games.index) >= 5
        and len(team_b_previous_10_games.index) >= 5
        and len(team_a_last_ha_games.index) >= 2
        and len(team_b_last_ha_games.index) >= 2
        and len(last_matchups.index) > 0
        and team_a_odds is not None
        and team_b_odds is not None
    )

    if not has_enough_history:
        print("Not enough games.")
        # The game still counts for the ratings even though it yields no row.
        hf.update_elo(winner, elo_a, elo_b, elo_dic, team_a_id, team_b_id, team_a_pts, team_b_pts)
        continue

    # Getting player information
    # NOTE(review): presumably a mean player-efficiency rating per team — confirm in helper_functions.
    teams_per[team_a_id] = hf.get_team_per_mean(team_a_id, game_id, game_date, season_id, season_games_plyrs)
    teams_per[team_b_id] = hf.get_team_per_mean(team_b_id, game_id, game_date, season_id, season_games_plyrs)

    # Season Win Percentage
    team_a_season_pct = hf.get_wl_pct(team_a_season_games)[0]
    team_b_season_pct = hf.get_wl_pct(team_b_season_games)[0]

    # Calculating Current Streak
    team_a_streak = hf.current_streak(team_a_season_games)
    team_b_streak = hf.current_streak(team_b_season_games)

    # last_matchups holds team A's rows, so the pair is unpacked as (A, B).
    team_a_last_matchups_percentage, team_b_last_matchups_percentage = hf.get_wl_pct(last_matchups)

    team_a_ha_percentage = hf.get_wl_pct(team_a_last_ha_games)[0]
    team_b_ha_percentage = hf.get_wl_pct(team_b_last_ha_games)[0]

    # Points Conceded
    team_a_previous_games_pts_conceded = hf.team_points_conceded(team_a_previous_10_games, season_games)
    team_b_previous_games_pts_conceded = hf.team_points_conceded(team_b_previous_10_games, season_games)

    # HA Points Conceded
    team_a_ha_previous_games_pts_conceded = hf.team_points_conceded(team_a_last_ha_games, season_games)
    team_b_ha_previous_games_pts_conceded = hf.team_points_conceded(team_b_last_ha_games, season_games)

    # Defining list of stats for each team
    stats_team_a = hf.get_team_stats (team_a_previous_10_games, team_a_previous_games_pts_conceded, team_a_season_pct, team_a_ha_percentage, team_a_streak, team_a_last_matchups_percentage, elo_a, teams_per[team_a_id], team_a_odds)
    stats_team_b = hf.get_team_stats (team_b_previous_10_games, team_b_previous_games_pts_conceded, team_b_season_pct, team_b_ha_percentage, team_b_streak, team_b_last_matchups_percentage, elo_b, teams_per[team_b_id], team_b_odds)

    stats_team_a_regression = hf.get_team_stats_regression (team_a_previous_10_games, team_a_previous_games_pts_conceded, team_a_season_games, elo_a, teams_per[team_a_id], team_a_last_ha_games, team_a_ha_previous_games_pts_conceded)
    stats_team_b_regression = hf.get_team_stats_regression (team_b_previous_10_games, team_b_previous_games_pts_conceded, team_b_season_games, elo_b, teams_per[team_b_id], team_b_last_ha_games, team_b_ha_previous_games_pts_conceded)

    # Emit with the non-'@' team first; the winner flag is 1 when that
    # first-listed team won.
    if team_a_is_away:
        matches_organized.append([season_id, game_date, team_b_abbv, team_a_abbv] + stats_team_b + stats_team_a + [1 if winner == 'B' else 0])
        matches_organized_regression.append([season_id, game_date, team_b_abbv, team_a_abbv] + stats_team_b_regression + stats_team_a_regression + [team_b_pts, team_a_pts])
    else:
        matches_organized.append([season_id, game_date, team_a_abbv, team_b_abbv] + stats_team_a + stats_team_b + [1 if winner == 'A' else 0])
        matches_organized_regression.append([season_id, game_date, team_a_abbv, team_b_abbv] + stats_team_a_regression + stats_team_b_regression + [team_a_pts, team_b_pts])

    # LSTM dataset: one row per team, using that team's own box-score values
    # from this game plus the pre-game context computed above.
    matches_organized_lstm.append([team_a_abbv, team_a_id, game_date, team_a_pts, team_b_pts, row_a['FG_PCT'], row_a['FG3_PCT'],
                    row_a['FT_PCT'], row_a['REB'], row_a['TOV'],
                    row_a['BLK'], team_a_season_pct, team_a_ha_percentage, elo_a, elo_b, team_a_streak,
                    teams_per[team_a_id], team_a_odds, team_b_odds, 1 if winner == 'A' else 0])

    matches_organized_lstm.append([team_b_abbv, team_b_id, game_date, team_b_pts, team_a_pts, row_b['FG_PCT'], row_b['FG3_PCT'],
                    row_b['FT_PCT'], row_b['REB'], row_b['TOV'],
                    row_b['BLK'], team_b_season_pct, team_b_ha_percentage, elo_b, elo_a, team_b_streak,
                    teams_per[team_b_id], team_b_odds, team_a_odds, 1 if winner == 'B' else 0])

    # Ratings are updated last so they only ever reflect games already played.
    hf.update_elo(winner, elo_a, elo_b, elo_dic, team_a_id, team_b_id, team_a_pts, team_b_pts)

Creating CSV file of all games...
0/1080
Not enough games.
1/1080
Not enough games.
2/1080
Not enough games.
3/1080
Not enough games.
4/1080
Not enough games.
5/1080
Not enough games.
6/1080
Not enough games.
7/1080
Not enough games.
8/1080
Not enough games.
9/1080
Not enough games.
10/1080
Not enough games.
11/1080
Not enough games.
12/1080
Not enough games.
13/1080
Not enough games.
14/1080
Not enough games.
15/1080
Not enough games.
16/1080
Not enough games.
17/1080
Not enough games.
18/1080
Not enough games.
19/1080
Not enough games.
20/1080
Not enough games.
21/1080
Not enough games.
22/1080
Not enough games.
23/1080
Not enough games.
24/1080
Not enough games.
25/1080
Not enough games.
26/1080
Not enough games.
27/1080
Not enough games.
28/1080
Not enough games.
29/1080
Not enough games.
30/1080
Not enough games.
31/1080
Not enough games.
32/1080
Not enough games.
33/1080
Not enough games.
34/1080
Not enough games.
35/1080
Not enough games.
36/1080
Not enough games.
37/1080
Not en

373/1080
374/1080
375/1080
376/1080
Not enough games.
377/1080
378/1080
Not enough games.
379/1080
Not enough games.
380/1080
381/1080
382/1080
Not enough games.
383/1080
Not enough games.
384/1080
385/1080
Not enough games.
386/1080
Not enough games.
387/1080
Not enough games.
388/1080
Not enough games.
389/1080
Not enough games.
390/1080
391/1080
Not enough games.
392/1080
Not enough games.
393/1080
Not enough games.
394/1080
395/1080
Not enough games.
396/1080
397/1080
Not enough games.
398/1080
Not enough games.
399/1080
Not enough games.
400/1080
401/1080
402/1080
Not enough games.
403/1080
404/1080
Not enough games.
405/1080
406/1080
Not enough games.
407/1080
Not enough games.
408/1080
409/1080
410/1080
411/1080
412/1080
Not enough games.
413/1080
Not enough games.
414/1080
Not enough games.
415/1080
Not enough games.
416/1080
417/1080
418/1080
Not enough games.
419/1080
Not enough games.
420/1080
Not enough games.
421/1080
Not enough games.
422/1080
Not enough games.
423/1080
4

950/1080
951/1080
952/1080
953/1080
954/1080
955/1080
956/1080
957/1080
958/1080
959/1080
960/1080
961/1080
962/1080
963/1080
964/1080
965/1080
966/1080
967/1080
968/1080
969/1080
970/1080
Not enough games.
971/1080
Not enough games.
972/1080
973/1080
974/1080
975/1080
976/1080
977/1080
Not enough games.
978/1080
979/1080
980/1080
981/1080
982/1080
983/1080
984/1080
985/1080
986/1080
987/1080
988/1080
989/1080
990/1080
991/1080
992/1080
993/1080
994/1080
995/1080
996/1080
997/1080
998/1080
999/1080
1000/1080
1001/1080
Not enough games.
1002/1080
1003/1080
1004/1080
1005/1080
1006/1080
1007/1080
1008/1080
1009/1080
1010/1080
1011/1080
1012/1080
1013/1080
1014/1080
1015/1080
1016/1080
1017/1080
1018/1080
1019/1080
1020/1080
1021/1080
1022/1080
1023/1080
1024/1080
1025/1080
1026/1080
1027/1080
1028/1080
1029/1080
1030/1080
1031/1080
1032/1080
1033/1080
1034/1080
1035/1080
1036/1080
1037/1080
1038/1080
1039/1080
1040/1080
1041/1080
1042/1080
1043/1080
1044/1080
1045/1080
1046/1080
1047/108

In [31]:
# Turn the accumulated rows into labelled DataFrames and write one CSV per dataset.
# The column lists must match, position by position, the rows appended in the
# feature loop (and the lists returned by the hf.get_team_stats* helpers).
final_df = pd.DataFrame(matches_organized, columns=['SEASON_ID', 'GAME_DATE', 'TEAM_A', 'TEAM_B',
                                                    'PTS_A', 'PTS_CON_A', 'FG_PCT_A', 'FG3_PCT_A', 'FT_PCT_A', 'REB_A', 'TOV_A', 'BLK_A', 'SEASON_A_PCT', 'H/A_A', 'ELO_A', 'STREAK_A', 'MATCHUP_A', 'PER_A', 'ODDS_A',
                                                    'PTS_B', 'PTS_CON_B', 'FG_PCT_B', 'FG3_PCT_B', 'FT_PCT_B', 'REB_B', 'TOV_B', 'BLK_B', 'SEASON_B_PCT', 'H/A_B', 'ELO_B', 'STREAK_B', 'MATCHUP_B', 'PER_B', 'ODDS_B',
                                                    'WINNER'])
final_df_lstm = pd.DataFrame(matches_organized_lstm, columns=['TEAM_ABBV', 'TEAM_ID', 'DATE',
                                                    'PTS_A', 'PTS_CON_A', 'FG_PCT_A', 'FG3_PCT_A', 'FT_PCT_A', 'REB_A', 'TOV_A', 'BLK_A', 
                                                    'SEASON_A_PCT', 'H/A_A', 'ELO_A', 'ELO_OPP', 'STREAK_A', 'PER_A', 'ODDS_A', 'ODDS_OPP',
                                                    'WINNER'])
final_df_regression = pd.DataFrame(matches_organized_regression, columns=['SEASON_ID', 'GAME_DATE', 'TEAM_A', 'TEAM_B',
                                                    'PTS_A', 'PTS_CON_A', 'FT_PCT_A', 'FG_PCT_A', 'FG3_PCT_A', 'ELO_A', 'PER_A', 'HA_PTS_A', 'HA_PTS_CON_A', 'SEASON_PTS_A',
                                                    'PTS_B', 'PTS_CON_B', 'FT_PCT_B', 'FG_PCT_B', 'FG3_PCT_B', 'ELO_B', 'PER_B', 'HA_PTS_B', 'HA_PTS_CON_B', 'SEASON_PTS_B',
                                                    'SCORE_A', 'SCORE_B'])

# File name covers the first and last season actually fetched (last_season is exclusive).
season_span = '{}-{}.csv'.format(first_season, last_season-1)
data_dir = Path('../data')

# Each dataset gets its own file. Previously the regression and classification
# frames were both written to ../data/<span>.csv, so the regression file was
# always overwritten. The classification and LSTM paths are unchanged.
classification_path = data_dir / season_span
regression_path = data_dir / 'regression' / season_span
lstm_path = data_dir / 'LSTM' / season_span

# to_csv does not create missing directories.
for out_path in (classification_path, regression_path, lstm_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

final_df_regression.to_csv(regression_path)
final_df.to_csv(classification_path)
final_df_lstm.to_csv(lstm_path)