In [128]:
import pandas as pd
import numpy as np
import datetime

In [138]:
# Read in 2024 schedule
df = pd.read_csv("data/2024_schedule.csv")
df

Unnamed: 0,date,time,home,away
0,2024-03-16,13:00:00,KCC,POR
1,2024-03-16,16:00:00,LOU,ORL
2,2024-03-16,19:00:00,NC,HOU
3,2024-03-16,19:30:00,UTA,CHI
4,2024-03-16,22:00:00,LA,BFC
...,...,...,...,...
177,2024-11-02,17:00:00,ORL,RGN
178,2024-11-02,19:30:00,NC,WAS
179,2024-11-02,21:30:00,HOU,BFC
180,2024-11-03,15:00:00,CHI,KCC


In [139]:
# Convert times to local time
TZ_ADJUST = {
    "KCC": -1,
    "LOU": 0,
    "NC": 0,
    "UTA": -2,
    "LA": -3,
    "RGN": -3,
    "ORL": 0,
    "CHI": -1,
    "WAS": 0,
    "HOU": -1,
    "SD": -3,
    "POR": -3,
    "BFC": -3,
    "NJNY": 0,
}
# Add the TZ adjustment and then subtract 1 to get the local standard time.
# The whole season is in daylight time.
df["local_std_time"] = df.apply(
    lambda row: row["time"] + pd.Timedelta(hours=TZ_ADJUST[row["home"]] - 1), axis=1
)
df["local_std_time"] = df["local_std_time"].astype(str).str.split("0 days ").str[-1]
df["day"] = pd.to_datetime(df["date"]).dt.strftime("%m-%d")
df

Unnamed: 0,date,time,home,away,local_std_time,day
0,2024-03-16,13:00:00,KCC,POR,11:00:00,03-16
1,2024-03-16,16:00:00,LOU,ORL,15:00:00,03-16
2,2024-03-16,19:00:00,NC,HOU,18:00:00,03-16
3,2024-03-16,19:30:00,UTA,CHI,16:30:00,03-16
4,2024-03-16,22:00:00,LA,BFC,18:00:00,03-16
...,...,...,...,...,...,...
177,2024-11-02,17:00:00,ORL,RGN,16:00:00,11-02
178,2024-11-02,19:30:00,NC,WAS,18:30:00,11-02
179,2024-11-02,21:30:00,HOU,BFC,19:30:00,11-02
180,2024-11-03,15:00:00,CHI,KCC,13:00:00,11-03


In [148]:
# Map from home team to station ID
FILES = {
    "KCC": 72446603929,
    "LOU": 72423093821,
    "NC": 72306013722,
    "UTA": 72572024127,
    "LA": 72295023174,
    "RGN": 72793524234,
    "ORL": 72205012815,
    "CHI": 72530094846,
    "WAS": 72405013743,
    "HOU": 72244012918,
    "SD": 72290023188,
    "POR": 72698024229,
    "BFC": 72494523293,
    "NJNY": 72502014734,
}

In [149]:
# Read in data for each station
# https://www.ncei.noaa.gov/metadata/geoportal/rest/metadata/item/gov.noaa.ncdc:C00684/html
weather = {}
for team in FILES:
    if FILES[team] > 0:
        df_s = pd.read_csv("data/" + str(FILES[team]) + ".csv", low_memory=False)
        df_s = df_s[["STATION", "DATE", "HourlyDryBulbTemperature"]]
        df_s = df_s.rename(
            columns={
                "STATION": "station",
                "DATE": "datetime",
                "HourlyDryBulbTemperature": "temp",
            }
        )
        df_s = df_s.dropna()
        df_s["datetime"] = pd.to_datetime(df_s["datetime"])
        df_s["year"] = df_s["datetime"].dt.year
        df_s["day"] = df_s["datetime"].dt.strftime("%m-%d")

        df_s["time"] = df_s["datetime"].dt.time
        df_s["time"] = df_s["time"].apply(
            lambda x: datetime.timedelta(
                hours=x.hour,
                minutes=x.minute,
                seconds=x.second,
                microseconds=x.microsecond,
            )
        )
        weather[team] = df_s

In [162]:
# Function to find nearest temperature measurement
def find_nearest_temp(team, year, day, time):
    if team not in weather:
        return None
    # Get team's data
    data = weather[team]
    # Filter to the correct year
    data = data[data["year"] == year]
    # Filter to the correct date
    data = data[data["day"] == day]
    time = datetime.datetime.strptime(time, "%H:%M:%S").time()
    time = datetime.timedelta(
        hours=time.hour,
        minutes=time.minute,
        seconds=time.second,
        microseconds=time.microsecond,
    )
    data["diff"] = np.abs((data["time"] - time).dt.total_seconds())
    if len(data.index) == 0:
        print("No data for: " + team + " " + str(year) + " " + day)
        return None
    idx = data[["diff"]].idxmin(axis=0)
    return int(data.loc[idx["diff"]]["temp"].replace("s", ""))


# find_nearest_temp("RGN", 2023, "03-17", "15:00:00")

In [163]:
# Function to calculate the average temp over the last 5 years
def find_average_temp(team, day, time):
    try:
        temps = [find_nearest_temp(team, year, day, time) for year in range(2019, 2024)]
        temps = [t for t in temps if t != None]
        if len(temps) == 0:
            return None
        return sum(temps) / len(temps)
    except Exception as e:
        print("Error ", team, day, time)
        print(e)
        return None
        


# find_average_temp("RGN", "03-17", "15:00:00")

In [164]:
# Calculate average temp over last 5 years for each game
df["avg_temp"] = df.apply(
    lambda row: find_average_temp(row["home"], row["day"], row["local_std_time"]), axis=1
)
df

Unnamed: 0,date,time,home,away,local_std_time,day,avg_temp
0,2024-03-16,13:00:00,KCC,POR,11:00:00,03-16,52.4
1,2024-03-16,16:00:00,LOU,ORL,15:00:00,03-16,61.2
2,2024-03-16,19:00:00,NC,HOU,18:00:00,03-16,54.4
3,2024-03-16,19:30:00,UTA,CHI,16:30:00,03-16,52.2
4,2024-03-16,22:00:00,LA,BFC,18:00:00,03-16,59.0
...,...,...,...,...,...,...,...
177,2024-11-02,17:00:00,ORL,RGN,16:00:00,11-02,80.2
178,2024-11-02,19:30:00,NC,WAS,18:30:00,11-02,49.4
179,2024-11-02,21:30:00,HOU,BFC,19:30:00,11-02,64.6
180,2024-11-03,15:00:00,CHI,KCC,13:00:00,11-03,58.2


In [165]:
# Write data to file
df.to_csv("data/matches_with_temp.csv", index=False)

In [166]:
# Summary stats: low temp
df.loc[df["avg_temp"].idxmin()]

date              2024-03-22
time                21:30:00
home                     UTA
away                      NC
local_std_time      18:30:00
day                    03-22
avg_temp                43.8
Name: 7, dtype: object

In [167]:
# Summary stats: high temp
df.loc[df["avg_temp"].idxmax()]

date              2024-08-23
time                20:00:00
home                     HOU
away                     ORL
local_std_time      18:00:00
day                    08-23
avg_temp                89.8
Name: 112, dtype: object