<a href="https://colab.research.google.com/github/ccstevie/nhl_model/blob/main/model2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
from datetime import date
from io import BytesIO

import pandas as pd
import requests

In [56]:
# Natural Stat Trick game-level team table: seasons 2021-22 through 2023-24, 5v5 situation
# (these are the values of the query parameters in the URL below).
url = "https://www.naturalstattrick.com/games.php?fromseason=20212022&thruseason=20232024&stype=2&sit=5v5&loc=B&team=All&rate=n"
# A timeout keeps the cell from hanging indefinitely if the server stops responding.
req = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of letting an error page flow into the parsing step.
req.raise_for_status()
req.status_code

200

In [57]:
# Parse the response that was already downloaded into `req` instead of fetching the URL a
# second time; passing the raw bytes lets the HTML parser detect the page encoding itself.
# "-" cells are read as missing values.
df = pd.read_html(BytesIO(req.content), header=0, na_values=["-"])[0]
# Drop the columns that are not used and then every row that still has a missing value.
# NOTE(review): dropna() removes a row if ANY remaining column is missing — confirm that
# this amount of row loss is intended.
df = df.drop(columns=["Unnamed: 2", "TOI", "Attendance", "LDGF%", "MDGF%"]).dropna()
df

Unnamed: 0,Game,Team,CF,CA,CF%,FF,FA,FF%,SF,SA,...,LDSF,LDSA,LDSF%,LDGF,LDGA,LDSH%,LDSV%,SH%,SV%,PDO
0,"2021-10-12 - Penguins 6, Lightning 2",Pittsburgh Penguins,54,47,53.47,44,33,57.14,32,23,...,9,12,42.86,1,0,11.11,100.00,9.38,100.00,1.094
1,"2021-10-12 - Penguins 6, Lightning 2",Tampa Bay Lightning,47,54,46.53,33,44,42.86,23,32,...,12,9,57.14,0,1,0.00,88.89,0.00,90.63,0.906
2,"2021-10-12 - Kraken 3, Golden Knights 4",Seattle Kraken,51,43,54.26,39,33,54.17,26,21,...,14,11,56.00,1,0,7.14,100.00,11.54,80.95,0.925
3,"2021-10-12 - Kraken 3, Golden Knights 4",Vegas Golden Knights,43,51,45.74,33,39,45.83,21,26,...,11,14,44.00,0,1,0.00,92.86,19.05,88.46,1.075
4,"2021-10-13 - Canadiens 1, Maple Leafs 2",Montreal Canadiens,42,54,43.75,31,43,41.89,25,24,...,7,6,53.85,0,0,0.00,100.00,4.00,95.83,0.998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7007,"2024-02-20 - Canucks 1, Avalanche 3",Vancouver Canucks,43,34,55.84,28,24,53.85,22,16,...,11,7,61.11,0,1,0.00,85.71,4.55,87.50,0.920
7008,"2024-02-20 - Predators 5, Golden Knights 3",Nashville Predators,64,28,69.57,47,21,69.12,32,16,...,17,6,73.91,1,1,5.88,83.33,12.50,87.50,1.000
7009,"2024-02-20 - Predators 5, Golden Knights 3",Vegas Golden Knights,28,64,30.43,21,47,30.88,16,32,...,6,17,26.09,1,1,16.67,94.12,12.50,87.50,1.000
7010,"2024-02-20 - Blue Jackets 1, Kings 5",Columbus Blue Jackets,49,51,49.00,33,43,43.42,25,33,...,12,12,50.00,1,1,8.33,91.67,4.00,90.91,0.949


In [58]:
# Keep only the leading 10 characters of the game label, i.e. the YYYY-MM-DD date.
df["Game"] = df["Game"].str.slice(stop=10)
# Target: goals for minus goals against. pop() removes GF and GA from the frame while
# handing back their values, so the raw goal counts do not remain as columns.
# NOTE: the table was requested with sit=5v5, so this is the 5v5 goal differential and can
# differ from the final score shown in the original game label.
df["Result"] = df.pop("GF") - df.pop("GA")
df["Result"]

0       3
1      -3
2      -1
3       1
4       0
       ..
7007   -1
7008    2
7009   -2
7010   -2
7011    2
Name: Result, Length: 6244, dtype: int64

In [59]:
# List the column labels currently in `df`.
df.columns

Index(['Game', 'Team', 'CF', 'CA', 'CF%', 'FF', 'FA', 'FF%', 'SF', 'SA', 'SF%',
       'GF%', 'xGF', 'xGA', 'xGF%', 'SCF', 'SCA', 'SCF%', 'HDCF', 'HDCA',
       'HDCF%', 'HDSF', 'HDSA', 'HDSF%', 'HDGF', 'HDGA', 'HDGF%', 'HDSH%',
       'HDSV%', 'MDCF', 'MDCA', 'MDCF%', 'MDSF', 'MDSA', 'MDSF%', 'MDGF',
       'MDGA', 'MDSH%', 'MDSV%', 'LDCF', 'LDCA', 'LDCF%', 'LDSF', 'LDSA',
       'LDSF%', 'LDGF', 'LDGA', 'LDSH%', 'LDSV%', 'SH%', 'SV%', 'PDO',
       'Result'],
      dtype='object')

In [60]:
# Approach learned from this machine-learning tutorial: https://www.youtube.com/watch?v=egTylm6C2is
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler

# Ridge-regularised linear classifier used as the estimator during feature selection.
# NOTE(review): the target `Result` is an integer goal differential (GF - GA), so a
# classifier treats each distinct differential as its own class — confirm this is intended.
rr = RidgeClassifier(alpha=1)
# Cross-validation splitter for time-ordered data, configured with 3 splits.
split = TimeSeriesSplit(n_splits=3)
# Forward sequential selection of 10 features, scored with the splitter above.
sfs = SequentialFeatureSelector(
    estimator=rr,
    n_features_to_select=10,
    direction="forward",
    cv=split,
)

In [61]:
# Identifier columns and the target are excluded from the candidate predictors.
remove = ["Game", "Team", "Result"]
# Boolean mask over the column labels: keep every column that is not listed in `remove`.
select = df.columns[[column not in remove for column in df.columns]]

In [62]:
# Min-max scale the candidate predictor columns; Game, Team and Result are not in `select`
# and are left as they are.
# NOTE(review): the scaler is fit on all rows at once, before the time-series
# cross-validation — confirm that using later games' ranges is acceptable here.
scaler = MinMaxScaler()
scaler.fit(df[select])
df[select] = scaler.transform(df[select])

In [63]:
# Inspect the frame after scaling the columns in `select`.
df

Unnamed: 0,Game,Team,CF,CA,CF%,FF,FA,FF%,SF,SA,...,LDSA,LDSF%,LDGF,LDGA,LDSH%,LDSV%,SH%,SV%,PDO,Result
0,2021-10-12,Pittsburgh Penguins,0.514286,0.438356,0.565869,0.574074,0.392857,0.628510,0.562500,0.375000,...,0.366667,0.417609,0.25,0.00,0.166642,1.000000,0.23450,1.00000,0.638519,3
1,2021-10-12,Tampa Bay Lightning,0.414286,0.534247,0.434131,0.370370,0.589286,0.371490,0.375000,0.562500,...,0.266667,0.582391,0.00,0.25,0.000000,0.833358,0.00000,0.76575,0.360000,-3
2,2021-10-12,Seattle Kraken,0.471429,0.383562,0.580866,0.481481,0.392857,0.575054,0.437500,0.333333,...,0.333333,0.569236,0.25,0.00,0.107095,1.000000,0.28850,0.52375,0.388148,-1
3,2021-10-12,Vegas Golden Knights,0.357143,0.493151,0.419134,0.370370,0.500000,0.424946,0.333333,0.437500,...,0.433333,0.430764,0.00,0.25,0.000000,0.892905,0.47625,0.71150,0.610370,1
4,2021-10-13,Montreal Canadiens,0.342857,0.534247,0.381359,0.333333,0.571429,0.354032,0.416667,0.395833,...,0.166667,0.544426,0.00,0.00,0.000000,1.000000,0.10000,0.89575,0.496296,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7007,2024-02-20,Vancouver Canucks,0.357143,0.260274,0.610858,0.277778,0.232143,0.569294,0.354167,0.229167,...,0.200000,0.628202,0.00,0.25,0.000000,0.785661,0.11375,0.68750,0.380741,-1
7008,2024-02-20,Nashville Predators,0.657143,0.178082,0.871488,0.629630,0.178571,0.844132,0.562500,0.229167,...,0.166667,0.775906,0.25,0.25,0.088196,0.749963,0.31250,0.68750,0.499259,2
7009,2024-02-20,Vegas Golden Knights,0.142857,0.671233,0.128512,0.148148,0.642857,0.155868,0.229167,0.562500,...,0.533333,0.224094,0.25,0.25,0.250037,0.911804,0.31250,0.68750,0.499259,-2
7010,2024-02-20,Columbus Blue Jackets,0.442857,0.493151,0.481017,0.370370,0.571429,0.381569,0.416667,0.583333,...,0.366667,0.500000,0.25,0.25,0.124944,0.875056,0.10000,0.77275,0.423704,-2


In [64]:
# Run the sequential feature selection: candidate predictors are the `select` columns,
# the target is the `Result` column.
sfs.fit(X=df[select], y=df["Result"])

In [65]:
# get_support() gives a boolean mask over `select`; keep the names of the chosen columns.
predictors = select[sfs.get_support()].tolist()

In [66]:
# Show the column names chosen by the sequential feature selector.
predictors

['SF%', 'GF%', 'xGF', 'HDGF', 'HDSH%', 'LDCA', 'LDSA', 'LDSV%', 'SH%', 'SV%']

I think these features are off because we normalized the feature values to lie between 0 and 1, whereas the result (the goal differential) can be any integer.