# Random Forest Regressor Model
### NBA Playoff Predictor

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import copy

In [2]:
pd.set_option('display.max_columns', None)
ALL_SEASONS = "seasons_data.csv"
CUR_SEASON = "2023_data.csv"

### Read all data

In [3]:
# read in our csv's of data
all_df = pd.read_csv(ALL_SEASONS)
df_23 = pd.read_csv(CUR_SEASON)

# concat the data
df = pd.concat([all_df, df_23], ignore_index=True)

# drop minutes played and attendance per game
all_df.drop(['MP', 'Attend./G'], axis=1, inplace=True)
df_23.drop(['MP', 'Attend./G'], axis=1, inplace=True)

## Make model using all attributes

In [4]:
# split the data using all features
features = all_df[all_df.columns[1:-2]]
target = all_df[['Playoff']]

X = features  # get the input features
y = target  # get the target

X_train, X_test, y_train, y_test = train_test_split(X,  # the input features
                                                    y,  # the label
                                                    test_size=0.3,  # set aside 30% of the data as the test set
                                                    random_state=7  # reproduce the results
                                                     )

In [5]:
# make random forest regressor model with all features
rf = RandomForestRegressor(random_state=7)
rf.fit(X_train, y_train)

# predict the labels for the test set
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Evaluate the Predictions
print('The mse of the model is: {}'.format(mse))

  rf.fit(X_train, y_train)


The mse of the model is: 0.028745459401709406


### Determine feature importance

In [6]:
# Determine feature importance, identify features
importances = pd.DataFrame({'stat': features.columns,
                            'value': rf.feature_importances_})
importances.sort_values('value', ascending=False, inplace=True)
importances.head()

Unnamed: 0,stat,value
33,win_perc,0.665695
16,SOS,0.03237
20,NRtg,0.02723
15,MOV,0.022611
14,PL,0.02184


## Make model using top attributes from past model

In [7]:
# split the data with top 10 features
features = all_df[importances.loc[:11, 'stat']]
target = all_df[['Playoff']]

X = features  # get the input features
y = target  # get the target

X_train, X_test, y_train, y_test = train_test_split(X,  # the input features
                                                    y,  # the label
                                                    test_size=0.3,  # set aside 30% of the data as the test set
                                                    random_state=7  # reproduce the results
                                                     )

In [8]:
# make random forest regressor model with top 10 features
rf = RandomForestRegressor(random_state=7)
rf.fit(X_train, y_train)

# predict the labels for the test set
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Evaluate the Predictions
print('The mse of the model is: {}'.format(mse))

  rf.fit(X_train, y_train)


The mse of the model is: 0.029344604700854697


### Results
This model performs slightly worse than the model using all the attributes (higher MSE).

## Use better model to predict outcome of current teams

In [9]:
# split the data using all features
features = all_df[all_df.columns[1:-2]]
target = all_df[['Playoff']]

X = features  # get the input features
y = target  # get the target

X_train, X_test, y_train, y_test = train_test_split(X,  # the input features
                                                    y,  # the label
                                                    test_size=0.3,  # set aside 30% of the data as the test set
                                                    random_state=7  # reproduce the results
                                                     )

In [10]:
# make random forest regressor model with all features
rf = RandomForestRegressor(random_state=7)
rf.fit(X_train, y_train)

# predict the labels for the 2023 teams
df_23_new = copy.deepcopy(df_23)
teams = df_23.loc[:, 'Team']

df_23_new.drop(['Team', 'Year'], axis=1, inplace=True)
y_pred = rf.predict(df_23_new)

  rf.fit(X_train, y_train)


In [11]:
# match outcome with team, place in descending order
outcome = pd.DataFrame({'Team': teams,
                        'Playoff': y_pred})
outcome.sort_values('Playoff', ascending=False, inplace=True, ignore_index=True)
outcome

Unnamed: 0,Team,Playoff
0,Milwaukee Bucks,0.735
1,Denver Nuggets,0.69
2,Boston Celtics,0.675
3,Cleveland Cavaliers,0.615
4,Philadelphia 76ers,0.605
5,Memphis Grizzlies,0.56
6,Sacramento Kings,0.5325
7,Los Angeles Clippers,0.365
8,Brooklyn Nets,0.3575
9,New York Knicks,0.3375
