In [1]:
from sqlalchemy import create_engine
import json
import psycopg2
import pandas as pd

##################### Credentialing and support setup
path_support = r'D:\Environments\mathletics\Support\\'  # Base folder for config, logs and saved images
list_seasons = ["2021-22","2022-23","2023-24","2024-25"]

config_path = path_support+"config.json"
with open(config_path,'r') as js:
    contents_json = js.read()               # Raw JSON text of the config file
config_dict = json.loads(contents_json)     # Parsed into a dictionary once the file is closed

##################### Connection Details
database, host, user = 'mathletics', 'localhost', 'postgres'
pgsql_pw = config_dict["postgresql_pw"]
con_string = f"postgresql://{user}:{pgsql_pw}@{host}/{database}"
engine = create_engine(con_string)          # SQLAlchemy engine, passed to pandas as con=engine
con = psycopg2.connect(con_string)          # Separate raw psycopg2 connection built from the same string

In [2]:
########### Function Block
   

def write_to_log(file_name,text):
    """Append one line of text to a log file in the support directory.

    Args:
        file_name: Log name without extension; ".txt" is appended and the
            result is joined onto the module-level ``path_support``.
        text: Value to write; it is formatted into a string and followed
            by a newline.
    """
    log_path = f"{path_support}{file_name}.txt"

    # "a+" creates the file on first use and appends on every later call
    with open(log_path,"a+") as log_file:
        log_file.write(f"{text}\n")

In [3]:
# Examine pts scored: for each team, compare home vs. away points scored and allowed
def _avg_and_std(points):
    """Return ``(mean, std)`` of a points Series, each rounded to a whole number.

    ``std`` uses pandas' default (sample, ddof=1) standard deviation.
    """
    return int(round(points.mean(),0)), int(round(points.std()))


df_teams = pd.read_sql_query("select * from teams",con=engine).set_index("team_id")  # For assigning names to id's

for season in list_seasons[-1:]:            # Only the most recent season in list_seasons
    print(season)
    table_var = season.replace("-","")      # Season formatted for table query
    df_events = pd.read_sql_query(f"select * from events_{table_var}",con=engine).set_index("game_id")
    list_events_teams = df_events["home_id"].unique()   # For iterating

    for team in list_events_teams:
        name = df_teams["team_name"].loc[team]

        df_home = df_events[ df_events["home_id"]==team ]   # Games this team hosted
        df_away = df_events[ df_events["away_id"]==team ]   # Games this team played on the road

        # "pts" = scored by this team, "all" = allowed (scored by the opponent)
        home_avg_pts, home_std_pts = _avg_and_std(df_home["home_pts"])
        home_avg_all, home_std_all = _avg_and_std(df_home["away_pts"])
        away_avg_pts, away_std_pts = _avg_and_std(df_away["away_pts"])
        away_avg_all, away_std_all = _avg_and_std(df_away["home_pts"])

        # Each line: home average, home-minus-away average, home std, home-minus-away std
        print(name,"Scored:\t",home_avg_pts,home_avg_pts-away_avg_pts,home_std_pts,home_std_pts-away_std_pts)
        print(name,"Allowed:\t",home_avg_all,home_avg_all-away_avg_all,home_std_all,home_std_all-away_std_all)
        print()
    


2024-25


ProgrammingError: (psycopg2.errors.UndefinedTable) relation "events_202425" does not exist
LINE 1: select * from events_202425
                      ^

[SQL: select * from events_202425]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [4]:
# Start with df by team sorted by game_time
# Create home/away column
# Points scored/allowed columns
# Create game_num column
# Union all team df's, will make charting by game easier
# Kept so the pandas option stays as other cells expect it; this cell itself
# no longer needs the warning silenced because it works on explicit copies.
pd.options.mode.chained_assignment = None  # default='warn'


def _team_side_rows(df_team_all, team, side, team_names):
    """Return one team's games on one side, re-labelled from that team's perspective.

    Args:
        df_team_all: Every game involving ``team`` (already carrying ``game_num``).
        team: Team id to select.
        side: "home" or "away" - which side ``team`` played on.
        team_names: Series mapping team id -> team name.

    Returns:
        A new DataFrame where the team's own id/points columns are renamed to
        ``team``/``pts``, the other side's to ``opponent``/``allowed``, ids are
        replaced by names, and ``flag`` is "Home" or "Away".
    """
    other = "away" if side == "home" else "home"
    # rename() returns a new frame, so the assignments below never touch a slice of df_team_all
    rows = df_team_all[ df_team_all[f"{side}_id"]==team ].rename(
        columns={f"{side}_pts":"pts",
                 f"{side}_id":"team",
                 f"{other}_id":"opponent",
                 f"{other}_pts":"allowed"})
    rows["flag"] = side.capitalize()
    rows["team"] = team_names.loc[team]
    # Label-based lookup for the whole column at once; raises KeyError on an unknown id
    rows["opponent"] = team_names.loc[rows["opponent"].to_numpy()].to_numpy()
    return rows


df_teams = pd.read_sql_query("select * from teams",con=engine).set_index("team_id")  # For assigning names to id's

for season in list_seasons[-1:]:
    table_var = season.replace("-","")      # Season formatted for table query
    df_events = (
        pd.read_sql_query(f"select * from events_{table_var}",con=engine)
        .set_index("game_id")
        .sort_values("game_time_utc")
    )
    list_events_teams = df_events["home_id"].unique()   # For iterating
    list_of_dfs = []

    for team in list_events_teams:
        # All team records, copied so adding game_num does not write into a slice of df_events
        df_team_all = df_events[ (df_events["home_id"]==team) | (df_events["away_id"]==team) ].copy()
        # Running count of rows in game_time order -> 1, 2, 3, ... per team
        df_team_all["game_num"] = df_team_all["game_time_utc"].expanding().count().astype(int)

        df_home = _team_side_rows(df_team_all, team, "home", df_teams["team_name"])
        df_away = _team_side_rows(df_team_all, team, "away", df_teams["team_name"])
        df_h_a = pd.concat([df_home,df_away])
        list_of_dfs.append(df_h_a)

# list_of_dfs is reset for every season, so this holds the last season processed
df_by_league = pd.concat(list_of_dfs)
print(df_by_league[ df_by_league["game_type"]=="Regular Season" ])

        
         

ProgrammingError: (psycopg2.errors.UndefinedTable) relation "events_202425" does not exist
LINE 1: select * from events_202425
                      ^

[SQL: select * from events_202425]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [17]:
# Window size (number of game_num rows) for every .rolling(...) statistic in this
# notebook; it also appears in the "Rolling {rolling}" image folder name.
rolling = 5

def pop_std(x):
    """Population standard deviation of ``x`` (divides by N, i.e. ddof=0).

    Passed to ``.agg`` below, where the function name becomes the
    resulting column label.
    """
    population_ddof = 0
    return x.std(ddof=population_ddof)

# League-wide points per game number: mean and population std across all team rows
df_league_grouped = df_by_league.groupby("game_num")['pts'].agg(['mean',pop_std])

# Every derived column below is computed from the per-game league mean
per_game_mean = df_league_grouped["mean"]
df_league_grouped = df_league_grouped.assign(
    cum_mean=per_game_mean.expanding().mean(),
    cum_std=per_game_mean.expanding().std(ddof=0),
    rolling_avg=per_game_mean.rolling(rolling).mean(),
    rolling_std=per_game_mean.rolling(rolling).std(ddof=0),
)
df_league_grouped.tail(10)

Unnamed: 0_level_0,mean,pop_std,cum_mean,cum_std,rolling_avg,rolling_std
game_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
82,113.733333,15.684245,114.211382,2.819273,113.546667,1.823135
83,106.05,11.182464,114.113052,2.940302,111.51,2.995359
84,102.5,10.001389,113.974802,3.182586,109.89,4.734784
85,105.6875,12.066062,113.877304,3.287579,108.4275,4.677291
86,108.875,10.862061,113.819138,3.312111,107.369167,3.769539
87,104.333333,12.28911,113.710105,3.444758,105.489167,2.101847
88,105.333333,10.017762,113.614915,3.538339,105.345833,2.083067
89,100.666667,13.904436,113.469429,3.77383,104.979167,2.638918
90,116.25,12.949421,113.500324,3.764107,107.091667,5.273255
91,118.0,0.0,113.549771,3.772646,108.916667,6.902093


In [18]:
import matplotlib.pyplot as plt
# Apply the "seaborn-v0_8" style sheet to the figures created by the functions below
plt.style.use("seaborn-v0_8")

def save_images(df,team):
    """Plot a team's points scored/allowed per game and save the chart as a PNG.

    Args:
        df: Team frame with ``pts``, ``allowed``, ``cum_avg`` and ``cum_std``
            columns; the last row supplies the values shown in the title.
        team: Team name, used in the title and as the file name.

    Reads the module-level ``df_league_grouped`` (league mean +/- population
    std band) and ``path_support`` (output goes to its ``Images`` subfolder).
    Prints a confirmation and returns None.
    """
    league_mean = df_league_grouped["mean"]
    league_spread = df_league_grouped["pop_std"]
    max_std = int(round(league_spread.max(),0))
    min_std = int(round(league_spread.min(),0))
    latest_avg = df.cum_avg.iloc[-1]
    latest_std = df.cum_std.iloc[-1]
    chart_title = f"""{team} - Cum_AVG = {latest_avg}; Cum_STD = {latest_std}     League Max STD: {max_std}, League Min STD: {min_std}"""

    # Green = points scored, red = points allowed
    df[["pts","allowed"]].plot(figsize=(12,8),
                                title=chart_title,
                                fontsize=12,
                                color=['g','r'])

    # Shaded band: league mean plus/minus one population std for each game number
    plt.fill_between(df_league_grouped.index,
                    league_mean - league_spread,
                    league_mean + league_spread,
                    alpha=.2)

    plt.savefig(path_support+f"Images\\{team}.png",bbox_inches="tight")
    plt.close()     # To avoid warning about potential memory
    print(f"{team} image saved")
    return None


def _save_band_chart(df, team, prefix, league_col, league_source, out_dir):
    """Shared implementation behind save_images_2 and save_images_3.

    Plots the team's ``<prefix>_avg`` (green) and ``<prefix>_avg_allowed`` (red)
    against a league line (blue), shades each team line by plus/minus its
    matching ``<prefix>_std`` / ``<prefix>_std_allowed`` column, and saves the
    figure to ``<path_support><out_dir>\\<team>.png``.

    Args:
        df: Team frame holding the four ``<prefix>_*`` columns. A column named
            ``league_col`` is added to it in place (same as the original code).
        team: Team name, used in the title and as the file name.
        prefix: "cum" or "rolling" - selects which statistic columns are read.
        league_col: Name of the league column written onto ``df``.
        league_source: Column of the module-level ``df_league_grouped`` to copy.
        out_dir: Subfolder of ``path_support`` that receives the PNG.
    """
    avg_col, std_col = f"{prefix}_avg", f"{prefix}_std"
    avg_allowed_col, std_allowed_col = f"{prefix}_avg_allowed", f"{prefix}_std_allowed"

    # Assignment aligns df_league_grouped onto df's index
    df[league_col] = df_league_grouped[league_source]

    # Title figures come from the last row of each series, rounded to whole numbers
    curr_avg = int(round(df[avg_col].iloc[-1],0))
    curr_std = int(round(df[std_col].iloc[-1],0))
    curr_allowed = int(round(df[avg_allowed_col].iloc[-1],0))
    curr_std_allowed = int(round(df[std_allowed_col].iloc[-1],0))
    league_avg = int(round(df_league_grouped[league_source].iloc[-1],0))

    ax = df[[avg_col,avg_allowed_col,league_col]].plot(figsize=(12,8),
        title=f"""{team}\nCurrent Avg. = {curr_avg}; Current STD = {curr_std}\nCurrent Avg. Allowed: {curr_allowed}; Current STD Allowed: {curr_std_allowed}\nLeague Current Avg: {league_avg}""",
                                fontsize=12,
                                color=['g','r','b'])

    # Draw on the Axes returned by .plot() rather than pyplot's implicit "current" axes
    ax.fill_between(df.index,
                    df[avg_col] - df[std_col],
                    df[avg_col] + df[std_col],
                    color='g',
                    alpha=.2)
    ax.fill_between(df.index,
                    df[avg_allowed_col] - df[std_allowed_col],
                    df[avg_allowed_col] + df[std_allowed_col],
                    color='r',
                    alpha=.2)
    # Same y-range for every team (league-wide min/max single-game points) so charts are comparable
    ax.set_ylim(df_by_league.pts.min(),
                df_by_league.pts.max())

    ax.figure.savefig(path_support+f"{out_dir}\\{team}.png",bbox_inches="tight")
    plt.close(ax.figure)     # To avoid warning about potential memory
    print(f"{team} image saved")


def save_images_2(df,team):
    """Save a chart of the team's cumulative scoring averages vs. the league.

    Reads ``cum_avg``, ``cum_std``, ``cum_avg_allowed`` and ``cum_std_allowed``
    from ``df``, adds a ``league_mean`` column to ``df`` (from
    ``df_league_grouped["cum_mean"]``) and writes ``Images\\<team>.png`` under
    ``path_support``. Prints a confirmation and returns None.
    """
    return _save_band_chart(df, team,
                            prefix="cum",
                            league_col="league_mean",
                            league_source="cum_mean",
                            out_dir="Images")


def save_images_3(df,team):
    """Save a chart of the team's rolling scoring averages vs. the league.

    Reads ``rolling_avg``, ``rolling_std``, ``rolling_avg_allowed`` and
    ``rolling_std_allowed`` from ``df``, adds a ``league_rolling`` column to
    ``df`` (from ``df_league_grouped["rolling_avg"]``) and writes
    ``Rolling <rolling>\\<team>.png`` under ``path_support``, where ``rolling``
    is the module-level window size. Prints a confirmation and returns None.
    """
    return _save_band_chart(df, team,
                            prefix="rolling",
                            league_col="league_rolling",
                            league_source="rolling_avg",
                            out_dir=f"Rolling {rolling}")

In [19]:
# Loop by team: build cumulative and rolling scoring stats, then save one chart per team
for team in df_by_league.team.unique():
    df_team_pts = (
        df_by_league[ df_by_league["team"]==team ]
        .sort_values("game_num")
        .reset_index()              # Removes game_id as index but keeps as column
        .set_index("game_num")
    )

    # Cumulative (expanding) stats; ddof=0 gives the population std (pandas defaults to ddof=1)
    for suffix, source in (("","pts"),("_allowed","allowed")):
        running = df_team_pts[source].expanding()
        df_team_pts[f"cum_avg{suffix}"] = running.mean()
        df_team_pts[f"cum_std{suffix}"] = running.std(ddof=0)

    # Same pair of stats over a trailing window of `rolling` rows
    for suffix, source in (("","pts"),("_allowed","allowed")):
        window = df_team_pts[source].rolling(rolling)
        df_team_pts[f"rolling_avg{suffix}"] = window.mean()
        df_team_pts[f"rolling_std{suffix}"] = window.std(ddof=0)

    save_images_3(df_team_pts,team)


Denver Nuggets image saved
Golden State Warriors image saved
New York Knicks image saved
Charlotte Hornets image saved
Indiana Pacers image saved
Orlando Magic image saved
Toronto Raptors image saved
Brooklyn Nets image saved
Miami Heat image saved
Memphis Grizzlies image saved
Chicago Bulls image saved
Utah Jazz image saved
San Antonio Spurs image saved
LA Clippers image saved
Milwaukee Bucks image saved
Los Angeles Lakers image saved
Boston Celtics image saved
Cleveland Cavaliers image saved
Atlanta Hawks image saved
Dallas Mavericks image saved
Portland Trail Blazers image saved
Sacramento Kings image saved
Washington Wizards image saved
New Orleans Pelicans image saved
Detroit Pistons image saved
Minnesota Timberwolves image saved
Phoenix Suns image saved
Oklahoma City Thunder image saved
Houston Rockets image saved
Philadelphia 76ers image saved


In [20]:
# Plan: clustered/grouped bar chart with labels
# Reference: https://matplotlib.org/stable/gallery/lines_bars_and_markers/barchart.html
# Compare the mean/std of each team's points scored/allowed against the league
    # Ditto for head-to-head matchups, if available
# Create an individual df_by_league with pts and allowed by ... (TODO: this note was left unfinished; the grouping key is not stated)
# df_events is already loaded with game_id as its index; all game-level data lives there
df_events.head()

Unnamed: 0_level_0,game_type,home_id,home_pts,away_id,away_pts,periods,officials,game_time_utc,duration
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
22300061,Regular Season,1610612743,119,1610612747,107,4,Kevin Cutler_Scott Twardoski_James Williams,2023-10-24 23:30:00,138
22300062,Regular Season,1610612744,104,1610612756,108,4,Josh Tiven_Karl Lane_John Butler,2023-10-25 02:00:00,153
22300065,Regular Season,1610612752,104,1610612738,108,4,Ben Taylor_Dedric Taylor_Jacyn Goble,2023-10-25 23:00:00,160
22300063,Regular Season,1610612766,116,1610612737,110,4,Ed Malloy_Lauren Holtkamp_Aaron Smith,2023-10-25 23:00:00,142
22300064,Regular Season,1610612754,143,1610612764,120,4,Sean Wright_Nick Buchert_Dannica Mosher,2023-10-25 23:00:00,128


In [None]:

# cum_std range for each team...when did min and max occur?

In [None]:
# Create chart that shows where team output compares to rest of league
# Lines: median, mean
# Shade: min and max...gradient by frequency?
# For rest of league data, df without current team...reset index...group by game_num...mean and std


In [58]:
# Calculating cumulative means and std's
import numpy as np
# Sanity check on the integers 1..10: expanding mean and population std (ddof=0)
df_test = pd.DataFrame({"vals": np.arange(1,11)})
running = df_test["vals"].expanding()
df_test["cum_avg"] = running.mean()
df_test["cum_std"] = running.std(ddof=0)   # pandas defaults to ddof=1 (sample std)
df_test


Unnamed: 0,vals,cum_avg,cum_std
0,1,1.0,0.0
1,2,1.5,0.5
2,3,2.0,0.816497
3,4,2.5,1.118034
4,5,3.0,1.414214
5,6,3.5,1.707825
6,7,4.0,2.0
7,8,4.5,2.291288
8,9,5.0,2.581989
9,10,5.5,2.872281


In [70]:
# For playoff teams, do outputs increase and allowed decrease? Is the "playoff effect" real?
# Read in events data
# NOTE(review): depends on list_dfs from the team-id verification cell, where the
# "events" frame is appended first; that cell must have been run before this one.
df_events = list_dfs[0]

# The original `if` had no body (a SyntaxError). Completed to match the recorded
# output, which shows game_id as the index - confirm this was the intent.
if "game_id" in df_events.columns:
    df_events = df_events.set_index("game_id")

df_events.head()


Unnamed: 0_level_0,game_type,home_id,home_pts,away_id,away_pts,periods,officials,game_time_utc,duration
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
22301135,Regular Season,1610612766,118,1610612760,121,4,Marc Davis_Andy Nagy_Robert Hussey,2024-04-07 22:00:00,133
22301131,Regular Season,1610612742,147,1610612745,136,5,Curtis Blair_Eric Dalen_CJ Washington,2024-04-07 19:30:00,169
22301133,Regular Season,1610612754,117,1610612748,115,4,Scott Foster_Marat Kogut_Danielle Scott,2024-04-07 21:00:00,148
22301136,Regular Season,1610612753,113,1610612741,98,4,Matt Boland_Kevin Scott_Dedric Taylor,2024-04-07 22:00:00,132
22301138,Regular Season,1610612756,105,1610612740,113,4,Brian Forte_Josh Tiven_Jason Goldenberg,2024-04-07 22:00:00,134


In [68]:
# Query unique team id's from each table, verify: 30 total, all id's match
# Verify each id occurs 82 times in each table for 'Regular Season' type
    # Verify 41 and 41 for events table
# With team id as index, create df with 3 columns: total wins from each table
# NOTE(review): query_tables is not defined in the visible cells; it must already exist in the kernel.
table_names = ['events','boxscore_players','lineups']

for season in list_seasons:
    table_var = season.replace("-","")      # Season formatted for table query
    list_dfs = []                           # Reset per season; later cells see the last season's frames
    team_ids_by_table = {}
    for table in table_names:
        #write_to_log(f"{season} {table}")
        df = query_tables(table,table_var)
        list_dfs.append(df)
        if table == "events":
            team_col = "home_id"
            #write_to_log(f"\t{season} {table} Total Teams: {df[team_col].nunique()}")
            print(f"{season} {table} Total Teams: {df[team_col].nunique()}")
        else:
            team_col = "team_id"
        # Stored as a Series so tables with different team counts pad with NaN and
        # fail the check below, instead of raising a length-mismatch ValueError
        team_ids_by_table[table] = pd.Series(sorted(df[team_col].unique()))

    df_compare = pd.DataFrame(team_ids_by_table)
    # A row passes only when all three tables hold the same id at that sorted position
    df_compare["check"] = (df_compare["events"] == df_compare["boxscore_players"]) & (df_compare["lineups"] == df_compare["boxscore_players"])
    print("Total Teams:",len(df_compare))
    print("Incorrect Team ID's:",int((~df_compare["check"]).sum()))
    



	2021-22 events Total Teams: 30
	2021-22 events Total Teams: 30
Total Teams: 30
Incorrect Team ID's: 0
	2022-23 events Total Teams: 30
	2022-23 events Total Teams: 30
Total Teams: 30
Incorrect Team ID's: 0
	2023-24 events Total Teams: 30
	2023-24 events Total Teams: 30
Total Teams: 30
Incorrect Team ID's: 0


In [62]:
# Stat Builder
# Read in players/lineups data
# Iterate through event game_id's
# Reduce data to game_id, team_id, period, <stat>

df_raw = list_dfs[1]                        # Second frame in list_dfs ('boxscore_players' in table_names order)
periods = df_raw["period"].max()            # Highest period number present = number of passes below
df_pts = pd.DataFrame()                     # Placeholder for the eventual union of results
list_teams = df_raw["team_id"].unique()
stat_columns = ["game_id","team_id","period","pts"]

for team in list_teams[:3]:             # First three teams only while this is being built
    is_team = df_raw["team_id"]==team
    df = df_raw.loc[is_team, stat_columns]          # Player-level lines for this team
    df = df.groupby(["game_id","period"]).sum()     # Summed to one row per (game, period)

    for period in range(1,periods+1):               # Create columns and team sums
        print(df.index)

        # Work in progress (not yet active):
        #df_temp = df[ df["period"]==period ][["team_id","pts"]]
        #df_temp.rename(columns={"pts":f"qtr_{period}"},inplace=True)
        #df_temp.set_index("team_id",inplace=True)
        




MultiIndex([('0022300004', 1),
            ('0022300004', 2),
            ('0022300004', 3),
            ('0022300004', 4),
            ('0022300033', 1),
            ('0022300033', 2),
            ('0022300033', 3),
            ('0022300033', 4),
            ('0022300046', 1),
            ('0022300046', 2),
            ...
            ('0022301223', 3),
            ('0022301223', 4),
            ('0052300111', 1),
            ('0052300111', 2),
            ('0052300111', 3),
            ('0052300111', 4),
            ('0052300201', 1),
            ('0052300201', 2),
            ('0052300201', 3),
            ('0052300201', 4)],
           names=['game_id', 'period'], length=348)
MultiIndex([('0022300004', 1),
            ('0022300004', 2),
            ('0022300004', 3),
            ('0022300004', 4),
            ('0022300033', 1),
            ('0022300033', 2),
            ('0022300033', 3),
            ('0022300033', 4),
            ('0022300046', 1),
            ('0022300046', 2),
 

In [6]:
import mathletics_utilities as mu
# Per the recorded output below, this prints a wins/losses/win_% table for each
# conference of the given season. record_builder itself lives in the project module
# and is not visible here - TODO confirm its return value and any side effects.
mu.record_builder("2024-25")

2024-25 Eastern
                     wins  losses  win_%
team_name                               
Cleveland Cavaliers    29       4   0.88
Boston Celtics         25       9   0.74
New York Knicks        24      12   0.67
Orlando Magic          21      15   0.58
Milwaukee Bucks        18      16   0.53
Atlanta Hawks          18      18   0.50
Detroit Pistons        17      17   0.50
Miami Heat             16      16   0.50
Indiana Pacers         17      18   0.49
Philadelphia 76ers     14      18   0.44
Chicago Bulls          15      19   0.44
Brooklyn Nets          13      21   0.38
Toronto Raptors         8      26   0.24
Charlotte Hornets       7      25   0.22
Washington Wizards      6      25   0.19 

2024-25 Western
                        wins  losses  win_%
team_name                                  
Oklahoma City Thunder     29       6   0.83
Houston Rockets           22      12   0.65
Memphis Grizzlies         22      13   0.63
Los Angeles Lakers        20      13   0.61
LA Cl