<a href="https://colab.research.google.com/github/brandonowens24/Potential_NBA_Model/blob/main/NBA_From_Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load In DataFrames:
* officials = The officials officiating specific games.
* box_stats = Box score statistics for specific games.
* inactive = Players inactive for specific games.
* team_stats = Team statistics for the year.
* off_stats = Officials statistics for their careers.
* player_stats = Players statistics for the year.

In [None]:
import pandas as pd
from datetime import datetime
import csv
import time
from tqdm import tqdm

### Prepare Officials Dataframe


In [None]:
officials = pd.read_csv('officials.csv')

In [None]:
officials["NAME"] = officials.first_name + ' ' + officials.last_name
officials = officials.drop(["jersey_num", "first_name", "last_name"], axis=1).sort_values("game_id").reset_index(drop=True)

In [None]:
# Who officiated each game? (crews of 3)
officials.head()

Unnamed: 0,game_id,official_id,NAME
0,10500001,1193,Michael Smith
1,10500001,1155,JB DeRosa
2,10500001,2713,Lorenzo Bronson
3,10500002,2529,Tony Brown
4,10500002,1153,Joe Crawford


### Prepare Simple Box Statistics Dataframe


In [None]:
box_stats = pd.read_csv('game.csv', parse_dates=['game_date'])

In [None]:
box_stats = box_stats.loc[(box_stats.game_date.dt.year > 2002) & ((box_stats.season_type == "Regular_Season") | (box_stats.season_type == "Playoffs")), ]
box_stats = box_stats.drop(["video_available_away", "video_available_home", "team_name_home"], axis=1).sort_values("game_id").reset_index(drop=True)

In [None]:
# Get Binary Values
box_stats[["wl_home", "wl_away", "season_type"]] = box_stats.loc[:, ["wl_home", "wl_away", "season_type"]].replace({"W": 1, "L": 0, "Playoffs": 1, "Regular Season": 0})

In [None]:
# Create Year Column
box_stats['year'] = box_stats['game_date'].dt.year.astype(int)

In [None]:
# Normal Game Stats
box_stats.head()

Unnamed: 0,season_id,team_id_home,team_abbreviation_home,game_id,game_date,matchup_home,wl_home,min,fgm_home,fga_home,...,reb_away,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,season_type,year
0,42002,1610612765,DET,40200101,2003-04-20,DET vs. ORL,0,240,24.0,76.0,...,44.0,18.0,3.0,6.0,11.0,35.0,99.0,5,1,2003
1,42002,1610612765,DET,40200102,2003-04-23,DET vs. ORL,1,240,31.0,73.0,...,30.0,15.0,7.0,1.0,15.0,29.0,77.0,-12,1,2003
2,42002,1610612753,ORL,40200103,2003-04-25,ORL vs. DET,1,240,33.0,69.0,...,40.0,19.0,6.0,5.0,16.0,26.0,80.0,-9,1,2003
3,42002,1610612753,ORL,40200104,2003-04-27,ORL vs. DET,1,240,32.0,64.0,...,43.0,16.0,11.0,3.0,16.0,28.0,92.0,-8,1,2003
4,42002,1610612765,DET,40200105,2003-04-30,DET vs. ORL,1,240,35.0,80.0,...,43.0,10.0,5.0,4.0,18.0,31.0,67.0,-31,1,2003


### Load In Inactive Players Dataframe


In [None]:
inactive = pd.read_csv("inactive_players.csv")

In [None]:
inactive["NAME"] = inactive.first_name + ' ' + inactive.last_name
inactive["prev_season"] = inactive['game_id'].map(box_stats.set_index('game_id')['year'] - 1)
inactive = inactive.drop(["player_id", "jersey_num", "team_city", "team_name", "first_name", "last_name"], axis=1).sort_values("game_id")
inactive = inactive.loc[inactive.game_id.isin(inactive.game_id), ].reset_index(drop=True)

In [None]:
inactive.head()

Unnamed: 0,game_id,team_id,team_abbreviation,NAME,prev_season
0,10500008,1610612739,CLE,LeBron James,
1,10500008,1610612738,BOS,Tony Allen,
2,10500008,1610612738,BOS,Al Jefferson,
3,10500008,1610612739,CLE,Anderson Varejao,
4,10500008,1610612739,CLE,Ira Newble,


### Load In Referee Statistics


In [None]:
off_stats = pd.read_csv("refereestats.csv")

In [None]:
off_stats.head()

Unnamed: 0,NAME,ref_G,ref_FGA,ref_FTA,ref_PF,ref_PTS,ref_FGA_pgrel,ref_FTA_pgrel,ref_PF_pgrel,ref_PTS_pgrel,...,ref_away_win_loss,ref_away_FGA,ref_away_FTA,ref_away_PF,ref_away_PTS,ref_win_loss_hvrel,ref_FGA_hvrel,ref_FTA_hvrel,ref_PF_hvrel,ref_PTS_hvrel
0,Ray Acosta,374,175.329679,46.354278,40.128342,223.222995,-1.223262,0.828877,-0.061765,-0.832086,...,0.425206,87.189305,22.967914,20.348396,110.566577,0.012414,0.993583,-0.059091,-0.289305,-0.034492
1,Brandon Adair,290,177.658965,43.567586,38.201724,222.049655,0.806552,-1.981724,-1.913793,-2.783448,...,0.468893,88.904483,21.813793,19.002414,110.835862,-0.071038,-0.136207,-0.571034,0.463103,-1.618966
2,Brent Barnaky,818,171.22555,44.861247,39.400122,212.32555,-0.159413,-1.003423,-0.879707,-0.244988,...,0.436383,85.837653,21.932396,19.962958,105.176406,-0.03124,-0.387408,0.256968,-0.00489,-0.505134
3,Curtis Blair,896,170.271987,45.69096,40.220536,210.513281,-0.104911,-0.522656,-0.227679,-0.693638,...,0.444225,85.201228,22.291295,20.355022,104.408259,-0.051198,-0.109933,0.290179,0.048326,-0.80904
4,Matt Boland,1192,168.080453,47.682886,41.969966,205.683054,0.591359,0.110738,0.536493,-0.783473,...,0.428637,83.975587,23.614933,21.150084,101.792198,-0.033086,0.126594,-0.507634,0.328859,-0.639849


### Load In Player Statistics

In [None]:
player_stats = pd.read_csv("player_yearly_stats.csv")

In [None]:
player_stats.head()

Unnamed: 0,player_name,season_year,GP,PPG,MPG,FGM,FGA,FG_percentage,threeP,threePA,...,DRB_percentage,AST_percentage,STL_percentage,BLK_percentage,USG,PPR,ORtg,DRtg,PER,WS
0,Lorenzen Wright,2002,43,12.0,29.1,5.2,11.3,0.459,0.0,0.0,...,25.5,6.6,1.3,1.2,22.2,-3.4,96.8,104.5,14.8,1.4
1,Metta World Peace,2002,55,13.2,29.8,4.9,11.6,0.423,1.0,3.1,...,14.3,14.3,4.6,1.8,23.3,-2.0,97.4,97.8,15.9,3.3
2,Loren Woods,2002,60,1.8,8.6,0.6,1.6,0.344,0.0,0.0,...,16.8,5.7,1.7,4.9,13.8,-4.1,91.7,100.5,10.0,0.6
3,Kevin Willis,2002,52,6.1,16.7,2.4,5.5,0.44,0.0,0.0,...,26.1,2.9,1.6,1.8,19.4,-3.8,103.7,103.9,16.2,1.8
4,Corliss Williamson,2002,78,13.6,21.8,5.3,10.3,0.51,0.0,0.1,...,14.0,11.5,1.5,1.1,29.8,-4.4,106.8,104.3,20.1,5.0


### Load In Team Statistics

In [None]:
team_stats = pd.read_csv('team_averages.csv')

In [None]:
team_stats.fillna(0, inplace=True)
team_stats['sep_id'] = team_stats['sep_id'].astype(str).apply(lambda x: x[:-2])
team_stats.head()

Unnamed: 0,sep_id,season_year,W,L,MOV,SOS,SRS,ORtg,DRtg,Pace,...,FTp,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1610612759,2016,67,15,10.63,-0.36,10.28,110.3,99.0,93.8,...,0.803,9.4,34.5,43.9,24.5,8.3,5.9,13.1,17.5,103.5
1,1610612744,2016,73,9,10.76,-0.38,10.38,114.5,103.8,99.3,...,0.763,10.0,36.2,46.2,28.9,8.4,6.1,15.2,20.7,114.9
2,1610612760,2016,55,27,7.28,-0.19,7.09,113.1,105.6,96.7,...,0.782,13.1,35.6,48.6,23.0,7.4,5.9,15.9,20.6,110.2
3,1610612739,2016,57,25,6.0,-0.55,5.45,110.9,104.5,93.3,...,0.748,10.6,33.9,44.5,22.7,6.7,3.9,13.6,20.3,104.3
4,1610612761,2016,56,26,4.5,-0.42,4.08,110.0,105.2,92.9,...,0.777,10.2,33.2,43.4,18.7,7.8,5.5,13.1,19.6,102.7


## Merge and Feature Engineer Relevant DataFrames

Creates an accumulation of all of these dataframes for future feature analysis

### Previous Year


In [None]:
# Game Stats = Box Stats + Advanced Stats
box_stats['season_year'] = pd.DatetimeIndex(box_stats['game_date']).year
box_stats['prev_season'] = box_stats['season_year'] - 1

In [None]:
box_stats.head()

Unnamed: 0,season_id,team_id_home,team_abbreviation_home,game_id,game_date,matchup_home,wl_home,min,fgm_home,fga_home,...,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,season_type,year,season_year,prev_season
0,42002,1610612765,DET,40200101,2003-04-20,DET vs. ORL,0,240,24.0,76.0,...,3.0,6.0,11.0,35.0,99.0,5,1,2003,2003,2002
1,42002,1610612765,DET,40200102,2003-04-23,DET vs. ORL,1,240,31.0,73.0,...,7.0,1.0,15.0,29.0,77.0,-12,1,2003,2003,2002
2,42002,1610612753,ORL,40200103,2003-04-25,ORL vs. DET,1,240,33.0,69.0,...,6.0,5.0,16.0,26.0,80.0,-9,1,2003,2003,2002
3,42002,1610612753,ORL,40200104,2003-04-27,ORL vs. DET,1,240,32.0,64.0,...,11.0,3.0,16.0,28.0,92.0,-8,1,2003,2003,2002
4,42002,1610612765,DET,40200105,2003-04-30,DET vs. ORL,1,240,35.0,80.0,...,5.0,4.0,18.0,31.0,67.0,-31,1,2003,2003,2002


Created a previous year column to be able to match team statistics and player statistics from the previous year to a current game. This assumes that players and teams generall perform similarly YoY.

### Combine Referee Identity and Referee Stats on Game

In [None]:
# Refs = Referee Identifiers + Referee Stats
refs = pd.merge(officials, off_stats, on="NAME", how="left").dropna()
refs = refs.groupby('game_id').mean().reset_index()

  refs = refs.groupby('game_id').mean().reset_index()


In [None]:
refs.head()

Unnamed: 0,game_id,official_id,ref_G,ref_FGA,ref_FTA,ref_PF,ref_PTS,ref_FGA_pgrel,ref_FTA_pgrel,ref_PF_pgrel,...,ref_away_win_loss,ref_away_FGA,ref_away_FTA,ref_away_PF,ref_away_PTS,ref_win_loss_hvrel,ref_FGA_hvrel,ref_FTA_hvrel,ref_PF_hvrel,ref_PTS_hvrel
0,10500001,1193.0,1797.0,165.016639,49.777351,43.401391,202.465721,-0.55409,1.19616,0.989482,...,0.404076,82.332721,24.292321,22.049917,99.708459,0.006966,0.188425,0.240122,-0.099221,0.143127
1,10500002,1625.0,1553.333333,165.669649,48.321966,42.458512,200.799483,0.295625,-0.825892,-0.236416,...,0.405142,82.750471,23.712104,21.528958,98.913147,-0.00659,0.023199,-0.176892,0.092712,-0.11955
2,10500003,1443.666667,1280.0,163.936885,49.794179,43.226179,198.077441,0.410108,0.034027,-0.155564,...,0.394621,81.881965,24.434087,21.872502,97.380753,0.008896,-0.045084,-0.115308,0.190422,0.136504
3,10500005,1498.666667,1012.666667,164.923396,49.955796,43.036418,199.978056,0.481791,0.131592,-0.209996,...,0.387841,82.396183,24.414869,21.848146,98.230515,0.020522,-0.006811,0.004255,0.069622,0.297984
4,10500007,2022.5,997.0,161.334663,52.524861,45.309974,199.693431,-1.127482,2.125915,1.567371,...,0.395743,80.60348,25.641614,23.097302,98.205797,0.016259,-0.046694,0.180301,-0.138691,0.282445


Combining Officials with their career statistics to be able to see how they impact games based on their previous histories.

### Combine Injured Players with Injured Statistcs

In [None]:
injured = pd.merge(inactive, player_stats, left_on=["NAME", "prev_season"], right_on=["player_name", 'season_year'])

In [None]:
# Some values got passed as objects
injured.replace('-', 0, inplace=True)
object_columns = ["ORB_percentage", "DRB_percentage", "AST_percentage", "STL_percentage", "BLK_percentage", "USG", "PPR"]
injured[object_columns] = injured[object_columns].astype(float)





injured = injured.groupby(['game_id', 'team_abbreviation']).agg(hurt_GP=('GP', 'sum'), hurt_PPG=('PPG', 'sum'), hurt_MPG=('MPG', 'sum'), hurt_FGM=('FGM', 'sum'),
                                             hurt_FGA=('FGA', 'sum'), hurt_FGp=('FG_percentage', 'mean'), hurt_threeP=('threeP', 'sum'), hurt_threePA=('threePA', 'sum'),
                                             hurt_three_percentage=('three_percentage', 'mean'), hurt_FT=('FT', 'sum'), hurt_FTA=('FTA', 'sum'),
                                             hurt_FT_percentage=('FT_percentage', 'mean'), hurt_ORB=('ORB', 'sum'), hurt_DRB=('DRB', 'sum'),
                                             hurt_RPG=('RPG', 'sum'), hurt_AST=('AST', 'sum'), hurt_STL=('STL', 'sum'), hurt_BLK=('BLK', 'sum'),
                                             hurt_TOV=('TOV', 'sum'), hurt_PF=('PF', 'sum'), hurt_eFG=('eFG', 'mean'), hurt_ORBp=('ORB_percentage', 'mean'),
                                             hurt_DRBp=('DRB_percentage', 'mean'), hurt_ASTp=('AST_percentage', 'mean'), hurt_STLp=('STL_percentage', 'mean'),
                                             hurt_BLKp=('BLK_percentage', 'mean'), hurt_USG=('USG', 'sum'), hurt_PPR=('PPR', 'mean'),
                                             hurt_ORtg=('ORtg', 'mean'), hurt_DRtg=('DRtg', 'mean'), hurt_PER=('PER', 'mean'), hurt_WS=('WS', 'sum')).reset_index().fillna(0)

In [None]:
injured.head()

Unnamed: 0,game_id,team_abbreviation,hurt_GP,hurt_PPG,hurt_MPG,hurt_FGM,hurt_FGA,hurt_FGp,hurt_threeP,hurt_threePA,...,hurt_DRBp,hurt_ASTp,hurt_STLp,hurt_BLKp,hurt_USG,hurt_PPR,hurt_ORtg,hurt_DRtg,hurt_PER,hurt_WS
0,40400233,SEA,77,12.0,30.1,4.5,10.5,0.425,1.8,4.9,...,15.1,10.4,1.8,1.4,19.5,-0.6,106.6,107.9,15.2,4.2
1,40400234,SEA,157,29.8,66.7,11.2,25.9,0.43,3.6,9.7,...,15.2,10.5,1.8,1.4,42.9,-0.6,107.05,107.95,16.35,10.6
2,40400236,SEA,157,29.8,66.7,11.2,25.9,0.43,3.6,9.7,...,15.2,10.5,1.8,1.4,42.9,-0.6,107.05,107.95,16.35,10.6
3,40600101,DET,39,8.9,20.3,3.2,5.1,0.612,0.7,1.0,...,7.6,11.2,1.25,1.85,35.7,-3.4,112.65,108.55,14.75,0.4
4,40600101,ORL,80,8.7,27.1,3.1,7.2,0.4185,2.0,4.8,...,9.25,8.7,1.15,0.4,30.8,-0.55,111.9,112.55,10.7,1.5


Utilizing that previous year metric, I take the previous year statistics for all of the players on a team that are hurt for a specific game and then create averaged statistics for them to get a gauge on the impact that the missing players have on their team's ability to win.

### Start Combining All Combined DataFrames to Get Picture of Game

In [None]:
tmp = pd.merge(box_stats, refs, on="game_id")

Matching specific games with the referees that officiated the game.

In [None]:
# Have to Match Team Statistics and Injured Players with Home and Away Teams
tmp1_home = tmp.merge(injured, how='left', left_on=['game_id', 'team_abbreviation_home'], right_on=['game_id', 'team_abbreviation'])
tmp1_away = tmp.merge(injured, how='left', left_on=['game_id', 'team_abbreviation_away'], right_on=['game_id', 'team_abbreviation'])

Created specific dataframes for the home team and away team for easier grouping. This will allow me to manipulate column labels later on. I also merged this with the injury statistics for who is going to be sitting.

In [None]:
# Change to String Objects
string_cols = ["season_id","team_id_home", "team_abbreviation_home", "game_id", "game_date", "matchup_home",
               "team_id_away","team_abbreviation_away",
               "team_name_away"]
tmp1_home[string_cols] = tmp1_home[string_cols].astype(str)
tmp1_away[string_cols] = tmp1_away[string_cols].astype(str)


In [None]:
tmp1_home.fillna(0, inplace=True)
tmp1_away.fillna(0, inplace=True)

Fill in NA values so that when I compile later, there won't be any errors

In [None]:
tmp2_home = tmp1_home.merge(team_stats, how="left", left_on=["team_id_home", "prev_season"], right_on=["sep_id", "season_year"])
tmp2_away = tmp1_away.merge(team_stats, how="left", left_on=["team_id_away", "prev_season"], right_on=["sep_id", "season_year"])

Tack on those previous year team statistics onto our temporarily merged data frame.

In [None]:
for col in tmp2_home.columns:
  if (col not in string_cols) & (col not in box_stats.columns) & (col not in refs.columns):
    tmp2_home.rename(columns={col: f"{col}_home"}, inplace=True)

for col in tmp2_away.columns:
  if (col not in string_cols) & (col not in box_stats.columns) & (col not in refs.columns):
    tmp2_away.rename(columns={col: f"{col}_away"}, inplace=True)

Prepared the two separate dataframes for merging by distinguishing which statistics were for each team. The home team's statistics for the team and hurt players will have `"_home"` and the away team's will have `"_away"`.

In [None]:
final = pd.merge(tmp2_home, tmp2_away, on=["game_id"])
final.sort_values(by="game_id")

Unnamed: 0,season_id_x,team_id_home_x,team_abbreviation_home_x,game_id,game_date_x,matchup_home_x,wl_home_x,min_x,fgm_home_x,fga_home_x,...,FTp_away,ORB_away,DRB_away,TRB_away,AST_away,STL_away,BLK_away,TOV_away,PF_away,PTS_away
0,42003,1610612754,IND,40300101,2004-04-17,IND vs. BOS,1,240,39.0,84.0,...,0.742,10.4,30.1,40.5,19.2,8.8,3.7,14.0,21.4,92.7
1,42003,1610612754,IND,40300102,2004-04-20,IND vs. BOS,1,240,41.0,92.0,...,0.742,10.4,30.1,40.5,19.2,8.8,3.7,14.0,21.4,92.7
2,42003,1610612738,BOS,40300103,2004-04-23,BOS vs. IND,0,240,32.0,76.0,...,0.766,12.2,32.0,44.2,23.3,8.5,5.4,14.8,22.1,96.8
3,42003,1610612738,BOS,40300104,2004-04-25,BOS vs. IND,0,240,27.0,67.0,...,0.766,12.2,32.0,44.2,23.3,8.5,5.4,14.8,22.1,96.8
4,42003,1610612751,NJN,40300112,2004-04-20,NJN vs. NYK,1,240,38.0,71.0,...,0.815,10.3,29.0,39.2,22.0,7.1,3.1,14.0,23.0,95.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,42022,1610612743,DEN,42200401,2023-06-01,DEN vs. MIA,1,240,40.0,79.0,...,0.808,9.8,33.9,43.7,25.5,7.4,3.2,14.6,20.5,110.0
1497,42022,1610612743,DEN,42200402,2023-06-04,DEN vs. MIA,0,240,39.0,75.0,...,0.808,9.8,33.9,43.7,25.5,7.4,3.2,14.6,20.5,110.0
1498,42022,1610612748,MIA,42200403,2023-06-07,MIA vs. DEN,0,240,34.0,92.0,...,0.795,9.2,34.9,44.1,27.8,7.2,3.7,14.5,20.0,112.7
1499,42022,1610612748,MIA,42200404,2023-06-09,MIA vs. DEN,0,240,35.0,78.0,...,0.795,9.2,34.9,44.1,27.8,7.2,3.7,14.5,20.0,112.7


The Final Accumulated Dataset for our analysis. I will have to go through it and determine which columns will be relevant features. For ex: some integers like `season_id` will have no bearing on the outcome of my analysis.

In [None]:
final.columns = final.columns.str.replace('_x', '').str.replace('_y', '')

Remove all suffixes that were added on from the final merge.

## Analysis and Model


In [None]:
final.head()

Unnamed: 0,season_id,team_id_home,team_abbreviation_home,game_id,game_date,matchup_home,wl_home,min,fgm_home,fga_home,...,FTp_away,ORB_away,DRB_away,TRB_away,AST_away,STL_away,BLK_away,TOV_away,PF_away,PTS_away
0,42003,1610612754,IND,40300101,2004-04-17,IND vs. BOS,1,240,39.0,84.0,...,0.742,10.4,30.1,40.5,19.2,8.8,3.7,14.0,21.4,92.7
1,42003,1610612754,IND,40300102,2004-04-20,IND vs. BOS,1,240,41.0,92.0,...,0.742,10.4,30.1,40.5,19.2,8.8,3.7,14.0,21.4,92.7
2,42003,1610612738,BOS,40300103,2004-04-23,BOS vs. IND,0,240,32.0,76.0,...,0.766,12.2,32.0,44.2,23.3,8.5,5.4,14.8,22.1,96.8
3,42003,1610612738,BOS,40300104,2004-04-25,BOS vs. IND,0,240,27.0,67.0,...,0.766,12.2,32.0,44.2,23.3,8.5,5.4,14.8,22.1,96.8
4,42003,1610612751,NJN,40300112,2004-04-20,NJN vs. NYK,1,240,38.0,71.0,...,0.815,10.3,29.0,39.2,22.0,7.1,3.1,14.0,23.0,95.9


In [None]:
#final.info(verbose=True)
final.fillna(0, inplace=True)
feat_cols = final.columns[56:61].tolist() + final.columns[81:113].tolist() + final.columns[117:133].tolist() + final.columns[135:156].tolist() + final.columns[272:288].tolist() + final.columns[289:].tolist()
features = final[feat_cols]
target = final.iloc[:, 6]

### Null Model
Assuming the home team always wins.

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features.values, target.values, random_state=1, test_size=0.10)

Split Dataset into X and Y with both training set and testing set (90% in train).

In [None]:
import numpy as np

null_accuracy = np.sum(target)/target.size

print(null_accuracy)

0.6282478347768155


Assuming that the home team wins every matchup, the model is 62.8% accurate. This is what we are striving to beat.

### Neural Network

In [None]:
import torch
from torch import nn

In [None]:
class my_model(torch.nn.Module):
  def __init__(self, hidden_size):
    super(my_model, self).__init__()
    self.linear1 = nn.Linear(117, hidden_size)
    self.dropout1 = nn.Dropout(0.5)
    self.linear2 = nn.Linear(hidden_size, hidden_size)
    self.dropout2 = nn.Dropout(0.25)
    self.linear3 = nn.Linear(hidden_size, hidden_size)
    self.dropout3 = nn.Dropout(0.25)
    self.linear4 = nn.Linear(hidden_size, hidden_size)
    self.linear5 = nn.Linear(hidden_size, hidden_size)
    self.linear6 = nn.Linear(hidden_size, hidden_size)
    self.linear7 = nn.Linear(hidden_size, 1)

  def forward(self, x):
      x = self.linear1(x)
      x = self.dropout1(x)
      x = nn.functional.relu(x)
      x = self.linear2(x)
      x = self.dropout2(x)
      x = nn.functional.tanh(x)
      x = self.dropout3(x)
      x = self.linear3(x)
      x = nn.functional.relu(x)
      x = self.linear4(x)
      x = nn.functional.tanh(x)
      x = self.linear5(x)
      x = nn.functional.relu(x)
      x = self.linear6(x)
      x = nn.functional.tanh(x)
      x = self.linear7(x)

      return(x)

Creating the structure of my neural network. It is five layers deep and uses RELU and tanh functions.

In [None]:
dataset = torch.utils.data.TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True,
                                         pin_memory=True, num_workers=2)

Creating batches and a shuffled dataset to increase some variance in the training data.

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('using',device)

using cpu


In [None]:
from sklearn.metrics import accuracy_score

hidden_size = 50
epochs = 10

model = my_model(hidden_size)

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, weight_decay=0.0000005)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)


for epoch in tqdm(range(epochs)):
  for x,y in dataloader:
    x = x.to(torch.float32)

    x = x.to(device)
    y = y.unsqueeze(1).to(device)


    optimizer.zero_grad()

    output = model.forward(x)
    loss = loss_fn(output, y.float())
    loss.backward()
    optimizer.step()

  scheduler.step()

  model.eval()
  with torch.no_grad():
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    test_output = model.forward(X_test_tensor)
    pred_y = (test_output > 0.5).float().cpu().numpy()
    test_acc = accuracy_score(y_test, pred_y)
    print(f"Epoch {epoch + 1}: Test Accuracy: {test_acc: .4f}")


 10%|█         | 1/10 [00:00<00:05,  1.80it/s]

Epoch 1: Test Accuracy:  0.3576


 20%|██        | 2/10 [00:01<00:04,  1.87it/s]

Epoch 2: Test Accuracy:  0.3576


 30%|███       | 3/10 [00:01<00:03,  2.21it/s]

Epoch 3: Test Accuracy:  0.6424


 40%|████      | 4/10 [00:01<00:02,  2.17it/s]

Epoch 4: Test Accuracy:  0.3576


 50%|█████     | 5/10 [00:02<00:02,  2.23it/s]

Epoch 5: Test Accuracy:  0.6424


 60%|██████    | 6/10 [00:02<00:01,  2.30it/s]

Epoch 6: Test Accuracy:  0.3576


 70%|███████   | 7/10 [00:03<00:01,  2.23it/s]

Epoch 7: Test Accuracy:  0.6424


 80%|████████  | 8/10 [00:03<00:00,  2.13it/s]

Epoch 8: Test Accuracy:  0.3576


 90%|█████████ | 9/10 [00:04<00:00,  2.11it/s]

Epoch 9: Test Accuracy:  0.6424


100%|██████████| 10/10 [00:04<00:00,  2.19it/s]

Epoch 10: Test Accuracy:  0.6424





The neural network outperforms the null model (albeit by ~2%).

The model undoubtedly has shortcomings. It is trained on only abou 1300 input vectors and is created via a foundation of home and away stats set in specific columns.

More data to train on, better final data structure, and enhanced model architecture should be explored.