# In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss

# Load the game schedule plus one team-stats export per season.
games = pd.read_csv("NHL_Schedule.csv")
teams_2122, teams_2223, teams_2324, teams_2425 = (
    pd.read_csv(path)
    for path in (
        "teams-2021:22.csv",
        "teams-2022:23.csv",
        "teams-2023:24.csv",
        "teams-2024:25.csv",
    )
)

# Parsed version of the raw "Date" column; all later date logic uses "date".
games = games.assign(date=pd.to_datetime(games["Date"]))

# The only situation wanted from the team files is "all". The .copy() makes
# each result an independent frame, so the column assignments below write to
# it directly rather than to a slice of the loaded data.
teams_2122 = teams_2122[teams_2122["situation"] == "all"].copy()
teams_2223 = teams_2223[teams_2223["situation"] == "all"].copy()
teams_2324 = teams_2324[teams_2324["situation"] == "all"].copy()
teams_2425 = teams_2425[teams_2425["situation"] == "all"].copy()

# Replace Arizona with Utah in the seasons before 2024-25.
# Only "team" is rewritten: it is the one identifier column kept by the
# column selection further down, so touching "name" / "team.1" would change
# nothing in the final data and would raise KeyError on an export that lacks
# those columns. The 2024-25 frame is not relabelled here.
for season_frame in (teams_2122, teams_2223, teams_2324):
    season_frame["team"] = season_frame["team"].replace("ARI", "UTA")

# Turn the numeric season value into the "YYYY-YY" label used as a merge key
# with the game rows.
for season_frame, first_year, season_label in (
    (teams_2122, 2021, "2021-22"),
    (teams_2223, 2022, "2022-23"),
    (teams_2324, 2023, "2023-24"),
    (teams_2425, 2024, "2024-25"),
):
    season_frame["season"] = season_frame["season"].replace(
        first_year, season_label
    )


# Columns carried forward from the per-season exports: the two merge keys
# followed by the expected-goals metrics.
cols = [
    "team",
    "season",
    "xGoalsPercentage",
    "xGoalsFor",
    "lowDangerxGoalsFor",
    "mediumDangerxGoalsFor",
    "highDangerxGoalsFor",
    "xGoalsAgainst",
    "lowDangerxGoalsAgainst",
    "mediumDangerxGoalsAgainst",
    "highDangerxGoalsAgainst",
]

teams_2122, teams_2223, teams_2324, teams_2425 = (
    season_frame[cols]
    for season_frame in (teams_2122, teams_2223, teams_2324, teams_2425)
)

# Stack all four seasons into one table and expose the team key as "Team"
# so it lines up with the game rows.
team_stats = pd.concat(
    [teams_2324, teams_2425, teams_2223, teams_2122],
    ignore_index=True,
).rename(columns={"team": "Team"})

# Keep only the columns the model pipeline needs. The explicit .copy()
# (same pattern as the per-season team frames) makes this an independent
# frame, so adding "home_win" below is an unambiguous write and cannot be
# flagged as chained assignment.
games = games[[
    "date",
    "AwayTeam",
    "HomeTeam",
    "AwayScore",
    "HomeScore"
]].copy()

# 1 when the home side outscored the away side, else 0.
# NOTE(review): a row with a missing score compares as False and therefore
# gets 0 here — confirm whether the schedule contains unscored games.
games["home_win"] = (games["HomeScore"] > games["AwayScore"]).astype(int)

def _win_flag(team_score, opp_score):
    """Return 1.0 for a win, 0.0 otherwise, and NaN where a score is missing.

    A plain ``team_score > opp_score`` evaluates to False when either score
    is NaN, which would record the game as a loss for both sides. Keeping
    the result missing lets a later ``dropna`` on "Win" remove such rows.
    """
    has_result = team_score.notna() & opp_score.notna()
    return (team_score > opp_score).astype(float).where(has_result)


# One row per game from the home side's point of view ...
home = games.assign(
    Team=games["HomeTeam"],
    Opponent=games["AwayTeam"],
    IsHome=1,
    TeamScore=games["HomeScore"],
    OppScore=games["AwayScore"],
    Win=lambda frame: _win_flag(frame["TeamScore"], frame["OppScore"]),
)

# ... and one from the away side's point of view.
away = games.assign(
    Team=games["AwayTeam"],
    Opponent=games["HomeTeam"],
    IsHome=0,
    TeamScore=games["AwayScore"],
    OppScore=games["HomeScore"],
    Win=lambda frame: _win_flag(frame["TeamScore"], frame["OppScore"]),
)

# Long format: every game appears twice, once per participating team.
team_games = pd.concat([home, away], ignore_index=True)

# Utah replaced Arizona in the 2024-25 season; ARI is relabelled UTA so the
# relocated franchise is treated as one continuous team.
team_games[["Team", "Opponent"]] = team_games[["Team", "Opponent"]].replace(
    "ARI", "UTA"
)

# Keep the per-team columns and order each team's games chronologically.
team_games = team_games[
    ["date", "Team", "Opponent", "IsHome", "TeamScore", "OppScore", "Win"]
].sort_values(["Team", "date"])

# Drop everything before 2021-10-01. The .copy() makes the filtered result an
# independent frame, because new columns are assigned to it afterwards.
team_games = team_games[team_games["date"] >= "2021-10-01"].copy()

# Label every game with its season ("YYYY-YY"): games from October onwards
# belong to the season that starts in that calendar year, games in earlier
# months to the season that started the year before.
game_dates = team_games["date"]
start_year = pd.Series(
    np.where(
        game_dates.dt.month >= 10,
        game_dates.dt.year,
        game_dates.dt.year - 1,
    ),
    index=team_games.index,
)

# The label is assembled with pandas string operations: adding a Python str
# to a NumPy string array has no `add` loop before NumPy 2.0 and raises
# there. zfill keeps the two-digit suffix ("2004-05", not "2004-5").
end_year_suffix = ((start_year + 1) % 100).astype(str).str.zfill(2)
team_games["season"] = start_year.astype(str) + "-" + end_year_suffix


# Attach the team's own season-level stats to each game row.
# validate="many_to_one" makes pandas raise if team_stats ever holds more
# than one row per (Team, season); without it such a duplicate would
# silently duplicate game rows.
# NOTE(review): the stats are keyed by season only. If they are full-season
# aggregates, features for early-season games include information from later
# games (look-ahead) — confirm this is acceptable for the evaluation.
team_games = team_games.merge(
    team_stats,
    how="left",
    on=["Team", "season"],
    validate="many_to_one",
)

# Same table seen from the opponent's side: "Team" becomes the "Opponent"
# key, "season" stays a key, and every stat column gets an "Opp_" prefix.
# Deriving the mapping from the columns keeps it in step with team_stats.
opp_stats = team_stats.rename(columns={
    column: "Opponent" if column == "Team" else f"Opp_{column}"
    for column in team_stats.columns
    if column != "season"
})

# Attach the opponent's season-level stats, with the same cardinality guard.
team_games = team_games.merge(
    opp_stats,
    how="left",
    on=["Opponent", "season"],
    validate="many_to_one",
)

# Rolling form must be computed over each team's games in date order.
team_games = team_games.sort_values(["Team", "date"])


def _prior_win_rate(results):
    """Mean of the 10 results before each game (current game excluded).

    shift(1) moves every result down one row so the window never contains
    the game being predicted; the value is NaN until 10 prior results exist.
    """
    return results.shift(1).rolling(10).mean()


team_games["Win_10"] = team_games.groupby("Team")["Win"].transform(
    _prior_win_rate
)

# Opponent's rolling win rate, taken from the "other row" of the same game.
# Each row is re-expressed from the opponent's point of view: Team and
# Opponent trade places, the home flag is inverted, and the row's own Win_10
# is carried along as Opp_Win_10.
opp_roll = pd.DataFrame({
    "date": team_games["date"],
    "Opponent": team_games["Team"],
    "Team": team_games["Opponent"],
    "IsHome": 1 - team_games["IsHome"],
    "Opp_Win_10": team_games["Win_10"],
})

# Join on the full game identity so each row picks up its opponent's Win_10.
team_games = team_games.merge(
    opp_roll,
    how="left",
    on=["date", "Team", "Opponent", "IsHome"],
)



# Model inputs, grouped by origin; the concatenation order below is the
# column order handed to the classifier.
context_features = ["IsHome", "Win_10", "Opp_Win_10"]

team_stat_features = [
    "xGoalsPercentage",
    "xGoalsFor",
    "lowDangerxGoalsFor",
    "mediumDangerxGoalsFor",
    "highDangerxGoalsFor",
    "xGoalsAgainst",
    "lowDangerxGoalsAgainst",
    "mediumDangerxGoalsAgainst",
    "highDangerxGoalsAgainst",
]

opp_stat_features = [
    "Opp_xGoalsPercentage",
    "Opp_xGoalsFor",
    "Opp_lowDangerxGoalsFor",
    "Opp_mediumDangerxGoalsFor",
    "Opp_highDangerxGoalsFor",
    "Opp_xGoalsAgainst",
    "Opp_lowDangerxGoalsAgainst",
    "Opp_mediumDangerxGoalsAgainst",
    "Opp_highDangerxGoalsAgainst",
]

features = context_features + team_stat_features + opp_stat_features

# Only rows with every feature and the target present can be modelled.
model_df = team_games.dropna(subset=[*features, "Win"]).copy()

# Chronological split: games before the cutoff train, the rest evaluate.
split_date = "2024-07-01"
train = model_df[model_df["date"] < split_date]
test = model_df[model_df["date"] >= split_date]


# Fit the random forest on the training games; fit() returns the estimator,
# so construction and training are chained.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
).fit(train[features], train["Win"])

# Second probability column from predict_proba, stored on a fresh copy of
# the test rows as "p_win".
proba = rf.predict_proba(test[features])[:, 1]
test = test.assign(p_win=proba)


# The ten test rows with the highest predicted win probability.
# A bare expression in the middle of a script is evaluated and then
# discarded, so the table is bound to a name and printed explicitly.
top_predictions = (
    test[["date", "Team", "Opponent", "IsHome", "Win", "p_win"]]
    .sort_values("p_win", ascending=False)
    .head(10)
)
print(top_predictions)


# accuracy_score and log_loss come from the import block at the top of the
# file rather than from a mid-file import.

# Hard class prediction: call it a win when p_win is at least 0.5.
pred = (test["p_win"] >= 0.5).astype(int)

# Accuracy scores the thresholded predictions; log loss scores the raw
# probabilities.
print("Accuracy:", accuracy_score(test["Win"], pred))
print("Log loss:", log_loss(test["Win"], test["p_win"]))

