# PREDICTING NEXT YEAR'S TABLE

## CLEANING THE DATA

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [4]:
# Read the standings CSV, discard the columns that are not used downstream,
# and keep only the rows that have no missing values.
# (Alternative to dropping rows: fill gaps with column means via df.fillna(df.mean()).)
df = (
    pd.read_csv('nhlpractice.csv')
    .drop(columns=['GP', 'TOI/GP', 'ROW'])
    .dropna()
)

# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,TeamA,Team,W,L,OTL,Points,Point %
0,BB,Boston Bruins,65,12,5,135,0.823
1,CH,Carolina Hurricanes,52,21,9,113,0.689
2,NJD,New Jersey Devils,52,22,8,112,0.683
3,VGK,Vegas Golden Knights,51,22,9,111,0.677
4,TML,Toronto Maple Leafs,50,21,11,111,0.677


## MAKING PREDICTION

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score

In [13]:
# Number of games used to turn raw W / L / OTL counts into per-game rates.
# The 'GP' column is dropped in the cleaning cell, so the schedule length is pinned here.
GAMES_PER_SEASON = 82

# Create new features
df['WinRate'] = df['W'] / GAMES_PER_SEASON
df['LossRate'] = df['L'] / GAMES_PER_SEASON
df['OTLRate'] = df['OTL'] / GAMES_PER_SEASON

# Resolve the points-percentage column by name while ignoring whitespace: the
# cleaned frame displays it as 'Point %', so a hard-coded 'Point%' lookup would
# raise KeyError on a fresh run. Matching this way accepts either spelling.
point_pct_col = next(
    (col for col in df.columns if ''.join(str(col).split()) == 'Point%'),
    None,
)
if point_pct_col is None:
    raise KeyError(f"No points-percentage column found in {list(df.columns)}")

# Define features and target variable
# NOTE(review): in the displayed rows Points equals 2*W + OTL (e.g. 65*2 + 5 = 135),
# so these same-season features appear to determine the target exactly. A near-zero
# error downstream would reflect that identity, not forecasting skill -- confirm, and
# consider using prior-season statistics as features for a genuine prediction.
X = df[['WinRate', 'LossRate', 'OTLRate', point_pct_col]]  # You can add other features here
y = df['Points']

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [24]:
# Create a linear regression estimator and fit it on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [25]:
# Score the held-out rows with the fitted model
y_pred = model.predict(X=X_test)

In [30]:
# Compare held-out predictions against the true points:
# mean absolute error (in points) and coefficient of determination.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

Mean Absolute Error: 2.334640417497472e-14
R-squared: 1.0


In [31]:
# Run the fitted model over every team and store the result alongside the data.
# NOTE(review): X is built from this season's W / L / OTL figures, so these values
# look like a reconstruction of the current table rather than a forecast of next
# year's -- confirm before presenting them as predictions.
df['Predicted_Points'] = model.predict(X)

# Build the ranking: team code plus predicted points, highest first
ranking_cols = ['TeamA', 'Predicted_Points']
df_sorted = df[ranking_cols].sort_values(by='Predicted_Points', ascending=False)
print(df_sorted)

   TeamA  Predicted_Points
0     BB             135.0
1     CH             113.0
2    NJD             112.0
4    TML             111.0
3    VGK             111.0
6     EO             109.0
5     CA             109.0
7     DS             108.0
8    NYR             107.0
9    LAK             104.0
10   MIW             103.0
11   SEK             100.0
12   TBL              98.0
13   WIJ              95.0
15   CAF              93.0
14   NYI              93.0
17   FLP              92.0
16   NAP              92.0
18    PP              91.0
19    BS              91.0
20    OS              86.0
21    VC              83.0
22   SLB              81.0
23    WC              80.0
24   DRW              80.0
25    PF              75.0
26   AZC              70.0
27    MC              68.0
28   SJS              60.0
29   CBJ              59.0
30   CBL              59.0
31    AD              58.0
