In [10]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

In [11]:
df = pd.read_csv(
    "/home/artemiy/rink_master/rink_master_data/rink_master_47816_wteams.csv",
    index_col=False,
)
df.head()

Unnamed: 0,gameID,gameDate,Team,Opponent,homeRoad,gamesPlayed,Win,Loss,Tie,OTLoss,points,pointPct,regulationWins,regulationAndOtWins,shootoutWins,goalsFor,goalsAgainst,goalsForPerGame,goalsAgainstPerGame,powerPlayPct,penaltyKillPct,powerPlayNetPct,penaltyKillNetPct,shotsForPerGame,shotsAgainstPerGame,faceoffWinPct
0,2003020001,2003-10-08,New Jersey Devils,Boston Bruins,1,1,0,0,1,0,1,0.5,0,0,0,3,3,3.0,3.0,0.75,0.75,0.75,0.75,35.0,31.0,0.43
1,2003020003,2003-10-08,Minnesota Wild,Chicago Blackhawks,1,1,0,1,0,0,0,0.0,0,0,0,0,1,0.0,1.0,0.0,1.0,0.0,1.0,33.0,22.0,0.57
2,2003020002,2003-10-08,Dallas Stars,Anaheim Ducks,0,1,1,0,0,0,2,1.0,1,1,0,4,1,4.0,1.0,0.33,0.75,0.33,0.75,22.0,21.0,0.53
3,2003020001,2003-10-08,Boston Bruins,New Jersey Devils,0,1,0,0,1,0,1,0.5,0,0,0,3,3,3.0,3.0,0.25,0.25,0.25,0.25,31.0,35.0,0.57
4,2003020003,2003-10-08,Chicago Blackhawks,Minnesota Wild,0,1,1,0,0,0,2,1.0,1,1,0,1,0,1.0,0.0,0.0,1.0,0.0,1.0,22.0,33.0,0.43


In [12]:
pd.set_option("display.max_columns", None)

In [13]:
df["gameDate"] = pd.to_datetime(df["gameDate"])

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47816 entries, 0 to 47815
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   gameID               47816 non-null  int64         
 1   gameDate             47816 non-null  datetime64[ns]
 2   Team                 47816 non-null  object        
 3   Opponent             47816 non-null  object        
 4   homeRoad             47816 non-null  int64         
 5   gamesPlayed          47816 non-null  int64         
 6   Win                  47816 non-null  int64         
 7   Loss                 47816 non-null  int64         
 8   Tie                  47816 non-null  int64         
 9   OTLoss               47816 non-null  int64         
 10  points               47816 non-null  int64         
 11  pointPct             47816 non-null  float64       
 12  regulationWins       47816 non-null  int64         
 13  regulationAndOtWins  47816 non-

In [15]:
encoder = OneHotEncoder()
encoded_teams = encoder.fit_transform(df[["Team", "Opponent"]]).toarray()
feature_names = encoder.get_feature_names_out(["Team", "Opponent"])
encoded_teams_df = pd.DataFrame(encoded_teams, columns=feature_names)

In [16]:
df = pd.concat([df, encoded_teams_df], axis=1)
df.drop(["Team", "Opponent"], axis=1, inplace=True)

In [17]:
# Обработка результатов игры
df["Result"] = df.apply(
    lambda row: "Win" if row["Win"] == 1 else ("Tie" if row["Tie"] == 1 else "Loss"),
    axis=1,
)
df["Result"] = df["Result"].map({"Win": 1, "Loss": 0, "Tie": 0})

In [18]:
train = df[df["gameDate"] < "2023-10-10"]
test = df[df["gameDate"] >= "2023-10-10"]

In [19]:
print(train["Result"].value_counts())
print(test["Result"].value_counts())

Result
0    22766
1    22426
Name: count, dtype: int64
Result
0    1312
1    1312
Name: count, dtype: int64


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(max_iter=1000, random_state=1)
lr.fit(train.drop(columns=["Result", "gameDate"]), train["Result"])
preds_lr = lr.predict(test.drop(columns=["Result", "gameDate"]))
accuracy_lr = accuracy_score(test["Result"], preds_lr)
print(f"Точность предсказаний (Logistic Regression): {accuracy_lr * 100:.2f}%")

Точность предсказаний (Logistic Regression): 50.00%


In [21]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=1, min_samples_split=10)
rf.fit(train.drop(columns=["Result", "gameDate"]), train["Result"])
preds_rf = rf.predict(test.drop(columns=["Result", "gameDate"]))
accuracy_rf = accuracy_score(test["Result"], preds_rf)
print(f"Точность предсказаний (Random Forest): {accuracy_rf * 100:.2f}%")

Точность предсказаний (Random Forest): 100.00%


In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
}

grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=1), param_grid, cv=5, scoring="accuracy"
)
grid_rf.fit(train.drop(columns=["Result", "gameDate"]), train["Result"])
print(f"Лучшие параметры: {grid_rf.best_params_}")
preds_grid_rf = grid_rf.predict(test.drop(columns=["Result", "gameDate"]))
accuracy_grid_rf = accuracy_score(test["Result"], preds_grid_rf)
print(f"Точность предсказаний (Grid Search RF): {accuracy_grid_rf * 100:.2f}%")

Лучшие параметры: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Точность предсказаний (Grid Search RF): 100.00%
