In [1]:
import os
import pandas as pd
import sklearn as sk
from sklearn.linear_model import LogisticRegressionCV
from database import engine

In [2]:
# Per-team, per-season metrics pulled from the `stats` table. The season list
# in the WHERE clause runs from 2011 to 2022 and leaves out 2020.
query = """
select
    team_season, srs, ortg, drtg, true_shooting_percentage, average_age, margin_of_victory
from
    stats
where
    season in ('2011','2012','2013','2014','2015','2016','2017','2018','2019','2021','2022');
    """

# `engine` is the connectable imported from the project's `database` module.
team_stats = pd.read_sql_query(sql=query, con=engine)

In [3]:
# Lookup table of teams; later cells read its "Team" and "Abbreviation" columns.
abbreviations_csv = os.path.join("..", "data", "clean", "team_abbreviations.csv")
team_abbreviations = pd.read_csv(abbreviations_csv)

In [4]:
# Load the game log, flag games the home side won (home_score > away_score),
# and keep only rows whose year is after 2010 and is not 2020.
game_results_csv = os.path.join("..", "data", "clean", "game_results.csv")
game_results = (
    pd.read_csv(game_results_csv)
    .assign(home_win=lambda games: games["home_score"] > games["away_score"])
    .loc[lambda games: games["year"].gt(2010) & games["year"].ne(2020)]
)

In [5]:
# Attach each side's team abbreviation to every game and build the
# "<abbreviation>-<year>" team IDs (e.g. "PHX-2021") used to look up team stats.

# Index the lookup table by team name. The guard keeps this cell re-runnable:
# an unconditional set_index("Team") raises KeyError on a second run, because
# by then "Team" is the index rather than a column.
if team_abbreviations.index.name != "Team":
    team_abbreviations = team_abbreviations.set_index("Team")
abbreviation_by_team = team_abbreviations["Abbreviation"]

# Map team names directly to abbreviations instead of round-tripping the
# frame through set_index/reset_index. Names missing from the lookup table
# become NaN. The index is reset so rows are numbered 0..n-1 after the
# year filter applied when the games were loaded.
game_results = game_results.reset_index(drop=True).assign(
    home_abbreviation=lambda games: games["home_team"].map(abbreviation_by_team),
    away_abbreviation=lambda games: games["away_team"].map(abbreviation_by_team),
)

season_label = game_results["year"].astype(str)
game_results["home_teamID"] = game_results["home_abbreviation"] + "-" + season_label
game_results["away_teamID"] = game_results["away_abbreviation"] + "-" + season_label

In [6]:
# Join season-level team stats onto every game (once for the home side, once
# for the away side), derive the average-age gap, and trim the frame down to
# the ID, target and feature columns.

# Index team_stats by its team-season key so each stat column can serve as a
# lookup Series. Guarded so the cell can be re-run: an unconditional
# set_index("team_season") raises KeyError once that column is already the index.
if team_stats.index.name != "team_season":
    team_stats = team_stats.set_index("team_season")

stat_columns = [
    "srs",
    "ortg",
    "drtg",
    "true_shooting_percentage",
    "average_age",
    "margin_of_victory",
]


def side_stats(games, side):
    """Look up every stat in ``stat_columns`` for one side of each game.

    Parameters
    ----------
    games : pandas.DataFrame
        Frame holding a ``<side>_teamID`` column.
    side : str
        Column prefix, ``"home"`` or ``"away"``.

    Returns
    -------
    dict
        ``"<side>_<stat>"`` mapped to a Series aligned with ``games``; team IDs
        that are absent from ``team_stats`` yield NaN.
    """
    team_ids = games[side + "_teamID"]
    return {side + "_" + stat: team_ids.map(team_stats[stat]) for stat in stat_columns}


# Reset first so the looked-up Series and the frame share the same row index.
game_results = game_results.reset_index(drop=True)
game_results = game_results.assign(
    **side_stats(game_results, "home"),
    **side_stats(game_results, "away"),
)
game_results = game_results.assign(avg_age_diff=lambda x: x.home_average_age - x.away_average_age)

# errors="ignore" lets the cell be re-run after these columns are already gone;
# on a first run every listed column is present and is dropped as before.
game_results = game_results.drop(
    columns=[
        "away_team",
        "home_team",
        "year",
        "round",
        "game",
        "home_score",
        "away_score",
        "home_abbreviation",
        "away_abbreviation",
        "home_average_age",
        "away_average_age",
    ],
    errors="ignore",
)

# Keep the two team IDs in front, matching the column layout of the table
# displayed below this cell.
id_columns = ["away_teamID", "home_teamID"]
game_results = game_results[id_columns + [c for c in game_results.columns if c not in id_columns]]
game_results

Unnamed: 0,away_teamID,home_teamID,home_win,home_srs,home_ortg,home_drtg,home_true_shooting_percentage,home_margin_of_victory,away_srs,away_ortg,away_drtg,away_true_shooting_percentage,away_margin_of_victory,avg_age_diff
0,MIL-2021,PHX-2021,False,5.67,117.2,111.3,0.597,5.82,5.57,117.2,111.4,0.593,5.89,-1.5
1,MIL-2021,PHX-2021,False,5.67,117.2,111.3,0.597,5.82,5.57,117.2,111.4,0.593,5.89,-1.5
2,PHX-2021,MIL-2021,False,5.57,117.2,111.4,0.593,5.89,5.67,117.2,111.3,0.597,5.82,1.5
3,PHX-2021,MIL-2021,False,5.57,117.2,111.4,0.593,5.89,5.67,117.2,111.3,0.597,5.82,1.5
4,MIL-2021,PHX-2021,True,5.67,117.2,111.3,0.597,5.82,5.57,117.2,111.4,0.593,5.89,-1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,DEN-2011,OKC-2011,True,3.81,111.2,107.2,0.561,3.79,4.81,112.3,107.3,0.574,4.76,-3.6
830,DEN-2011,OKC-2011,True,3.81,111.2,107.2,0.561,3.79,4.81,112.3,107.3,0.574,4.76,-3.6
831,OKC-2011,DEN-2011,False,4.81,112.3,107.3,0.574,4.76,3.81,111.2,107.2,0.561,3.79,3.6
832,OKC-2011,DEN-2011,True,4.81,112.3,107.3,0.574,4.76,3.81,111.2,107.2,0.561,3.79,3.6


In [19]:
# Fit a cross-validated logistic regression that predicts whether the home
# team wins from both teams' season-level stats.

# Features are selected by name rather than by position, so a change in the
# frame's column layout cannot silently feed the wrong columns to the model.
feature_columns = [
    "home_srs",
    "home_ortg",
    "home_drtg",
    "home_true_shooting_percentage",
    "home_margin_of_victory",
    "away_srs",
    "away_ortg",
    "away_drtg",
    "away_true_shooting_percentage",
    "away_margin_of_victory",
    "avg_age_diff",
]
features = game_results[feature_columns]
target = game_results["home_win"].astype(int)

# random_state pins the data shuffling done by the "sag" solver so repeated
# runs give the same fit.
# NOTE(review): scikit-learn documents that "sag" only converges quickly when
# features share a similar scale; the ratings here are around 110 while true
# shooting percentage is around 0.6 -- consider standardising the features.
clf = LogisticRegressionCV(
    cv=10,
    solver="sag",
    max_iter=10000,
    random_state=0,
).fit(features, target)