In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
/kaggle/input/nba-player-season-statistics-with-mvp-win-share/NBA_Dataset.csv

In [None]:
# Load the NBA player-season statistics CSV from the Kaggle input directory into `data`.
data = pd.read_csv('/kaggle/input/nba-player-season-statistics-with-mvp-win-share/NBA_Dataset.csv')

In [None]:
# Replace missing values with 0.
# `fillna` returns a new DataFrame and leaves the original untouched, so the result has
# to be assigned back; a bare `data.fillna(0)` only displays the filled copy.
data = data.fillna(0)
data.head()

In [None]:
# Add an `mvp` column: within each season, the row with the highest award share
# (MVP vote percentage) is flagged True, every other row False.
#
# The flag is set by row label instead of merging on (season, award_share). A merge on
# the vote share also flags any other row of that season with the same share, and it
# fails when the cell is re-run because `mvp` then exists on both sides of the join.
data = data.reset_index(drop=True)  # fresh 0..n-1 index, as the merge used to produce
mvp_rows = data.groupby('season')['award_share'].idxmax()
data['mvp'] = pd.array(data.index.isin(mvp_rows), dtype='boolean')

# One row per season: the flagged winner (name kept for any later use).
mvplist = data.loc[mvp_rows].copy()

In [None]:
# Narrow the dataset with a few simple guidelines, so that it keeps the MVP winners
# together with the other notable players of each season:
#   - `gs` greater than 20
#     NOTE(review): the previous wording was "played more than 20 games", but the filter
#     is on `gs` (presumably games started) - confirm which one is intended.
#   - more than 10 minutes per game
#   - more than 5 points per game
eligible = (data["gs"] > 20) & (data['mp_per_g'] > 10) & (data['pts_per_g'] > 5)
data = data[eligible]

# Display the rows flagged as MVP that remain after filtering.
data[data['mvp'] == True]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import random 

# Build the modelling frame: one-hot encode the `mvp` flag, then drop the unnecessary
# columns (the raw flag, the redundant `Is_MVP_False` dummy, season, position and team),
# which leaves `Is_MVP_True` as the binary MVP indicator.
mvp_dummies = pd.get_dummies(data['mvp'], prefix='Is_MVP')
mvpfactors = (
    pd.concat([data.copy(), mvp_dummies], axis=1)
    .drop(columns=['mvp', 'season', 'Is_MVP_False', 'pos', 'team_id'])
)

In [None]:
# Modeling different stats and their relationship to winning MVP.
# - First, Value Over Replacement (VORP) against Award Shares.
#
# The scatter draws every row of `data`, so the title reports the range of seasons that
# is actually plotted instead of a hard-coded "2022 Season".
first_season, last_season = data['season'].min(), data['season'].max()

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data['vorp'], data['award_share'])
ax.set_title(f'MVP Award Shares vs. Value Over Replacement ({first_season}-{last_season} Seasons)')
ax.set_xlabel('Vorp')
ax.set_ylabel('Award Shares')
plt.show()

In [None]:
# Radar chart of one player's stats - in this case the MVP, Nikola Jokić.
player_name = 'Nikola Jokić'
stats = ['fg_pct', 'trb_pct', 'ast_pct', 'stl_pct', 'blk_pct', 'tov_pct', 'usg_pct']

# `mvpfactors` no longer carries the season or the MVP flag, so taking its first matching
# row plots whichever of the player's rows happens to come first. Select from `data`
# instead: the player's most recent MVP season, or the most recent season if the player
# has no MVP row.
player_rows = data[data['player'] == player_name]
if player_rows.empty:
    raise ValueError(f"No rows found for player {player_name!r}")
mvp_seasons = player_rows[player_rows['mvp'] == True]
profile_rows = player_rows if mvp_seasons.empty else mvp_seasons
mvp_data = profile_rows.sort_values('season').iloc[-1]
values = [mvp_data[stat] for stat in stats]

theta = np.linspace(0, 2 * np.pi, len(stats), endpoint=False)
# A line plot on polar axes does not join its last point back to the first one;
# repeat the first point so the outline is a closed polygon like the filled area.
outline_theta = np.append(theta, theta[0])
outline_values = values + values[:1]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': 'polar'})
ax.plot(outline_theta, outline_values, 'o-', linewidth=2)
ax.fill(theta, values, alpha=0.2)
ax.set_thetagrids(theta * 180 / np.pi, stats)
ax.set_title(f"{player_name}'s Statistical Profile")
plt.show()

In [None]:
# Requires `mvpfactors` from the earlier cell.
# Drop the vote share and the player name, then correlate every remaining column pair.
mvpfactors = mvpfactors.drop(columns=['award_share', 'player'])
corr_matrix = mvpfactors.corr()

# The 20 columns with the largest absolute correlation to the MVP indicator
# (the indicator's own self-correlation is part of this ranking).
mvp_corr_strength = corr_matrix['Is_MVP_True'].abs()
top_factors = mvp_corr_strength.sort_values(ascending=False).head(20).index

# Correlation matrix restricted to those columns, in ranked order.
sorted_corr_matrix = corr_matrix.loc[top_factors, top_factors]

# Annotated seaborn heatmap of the restricted matrix.
heatmap_options = dict(
    vmin=-1, vmax=1,
    cmap="ocean",
    center=0,
    annot=True,
    fmt=".2f",
    annot_kws={"fontsize": 8},
    linewidths=0.5,
    linecolor="white",
    cbar=True,
    cbar_kws={"orientation": "vertical"},
    square=True,
    xticklabels=True,
    yticklabels=True,
)
fig, ax = plt.subplots(figsize=(12, 12))  # large canvas so the annotations stay legible
sns.heatmap(sorted_corr_matrix, ax=ax, **heatmap_options)
plt.title("Correlation Matrix of Top 20 Factors Related to MVP Award")
plt.show()

In [None]:
# Model predictors: the top-correlated columns, minus the MVP indicator itself.
top_columns = sorted_corr_matrix.index[:21]
is_feature = top_columns != 'Is_MVP_True'
predictors = top_columns[is_feature]
predictors

In [None]:
from sklearn.linear_model import Ridge

# Re-attach player, award-share and season columns (under new names); they are needed
# to split the data and to evaluate the predictions.
mvpfactors = mvpfactors.assign(
    Player=data['player'],
    Award_Shares=data['award_share'],
    Season=data['season'],
)

# Hold out 2022: fit on every earlier season, then predict the 2022 MVP voting.
train = mvpfactors[mvpfactors["Season"] < 2022]
test = mvpfactors[mvpfactors["Season"] == 2022]

# Ridge regression of award share on the selected predictors.
reg = Ridge(alpha=0.1)
reg.fit(train[predictors], train["Award_Shares"])
predictions = pd.DataFrame(
    reg.predict(test[predictors]),
    columns=["predictions"],
    index=test.index,
)
combination = pd.concat([test[["Player", "Award_Shares"]], predictions], axis=1)

# Rk: position when ordered by the actual 2022 award share.
combination = combination.sort_values("Award_Shares", ascending=False)
combination["Rk"] = list(range(1, len(combination) + 1))

# Predicted_Rk: position when ordered by the predicted award share.
combination = combination.sort_values("predictions", ascending=False)
combination["Predicted_Rk"] = list(range(1, len(combination) + 1))

# Display in actual-voting order.
combination = combination.sort_values("Award_Shares", ascending=False)
combination.head(40)

In [None]:
# Mean squared error between actual and predicted award shares; lower is better.
from sklearn.metrics import mean_squared_error

mean_squared_error(
    y_true=combination["Award_Shares"],
    y_pred=combination["predictions"],
)

In [None]:
# Average precision of the predicted ranking against the actual top finishers.
# - The closer the score is to one, the more accurate the ranking is.
def find_ap(combination, top_n=5):
    """Return the average precision of the predicted ranking.

    The ``top_n`` rows with the highest ``Award_Shares`` are the players to find.
    The rows are then walked in descending ``predictions`` order; each time one of
    those players is met, the precision so far (players found / rows seen) is
    recorded. The result is the mean of the recorded precisions.

    Parameters
    ----------
    combination : pandas.DataFrame
        Needs the columns ``Player``, ``Award_Shares`` and ``predictions``.
    top_n : int, default 5
        How many actual top finishers the ranking is scored against.

    Returns
    -------
    float
        Mean recorded precision, or ``0.0`` when no top finisher is found
        (for example for an empty frame).
    """
    # Players to look for, held in a set so each membership test is a hash lookup.
    actual_top = set(
        combination.sort_values("Award_Shares", ascending=False).head(top_n)["Player"]
    )
    predicted_players = combination.sort_values("predictions", ascending=False)["Player"]

    precisions = []
    for seen, player in enumerate(predicted_players, start=1):
        if player in actual_top:
            # len(precisions) + 1 is the number of top finishers found so far.
            precisions.append((len(precisions) + 1) / seen)

    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)
    
# Average precision for the `combination` frame currently in scope.
find_ap(combination)

In [None]:
# Walk-forward check of the model: the first five entries of `years` (1991-1995) are
# skipped, then for every later season the model is refit on all rows from earlier
# seasons and scored on that season's predictions.
years = list(range(1991, 2022))
aps, all_predictions = [], []
for year in years[5:]:
    train = mvpfactors[mvpfactors["Season"] < year]
    test = mvpfactors[mvpfactors["Season"] == year]
    reg.fit(train[predictors], train["Award_Shares"])
    predictions = pd.DataFrame(
        reg.predict(test[predictors]),
        columns=["predictions"],
        index=test.index,
    )
    combination = pd.concat([test[["Player", "Award_Shares"]], predictions], axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

# Mean average precision over the evaluated seasons.
sum(aps) / len(aps)

In [None]:
# Compare the actual MVP voting rank with the predicted rank.
def add_ranks(predictions):
    """Return a copy of ``predictions`` with rank columns, ordered by actual voting.

    Adds ``Predicted_Rk`` (1-based position by descending ``predictions``),
    ``Rk`` (1-based position by descending ``Award_Shares``) and
    ``Diff`` (``Rk - Predicted_Rk``). The input frame is not modified.
    """
    ranked = (
        predictions
        .sort_values("predictions", ascending=False)
        .assign(Predicted_Rk=lambda frame: list(range(1, frame.shape[0] + 1)))
        .sort_values("Award_Shares", ascending=False)
        .assign(Rk=lambda frame: list(range(1, frame.shape[0] + 1)))
    )
    ranked["Diff"] = ranked["Rk"] - ranked["Predicted_Rk"]
    return ranked
# Show the rank comparison for the second entry of `all_predictions`.
add_ranks(all_predictions[1])

In [None]:
# Backtest a model over a list of seasons.
def backtest(mvpfactors, model, years, predictors):
    """Refit ``model`` for each season in ``years`` and score its ranking.

    For every ``year`` the model is fitted on the rows whose ``Season`` is earlier
    than ``year`` and used to predict ``Award_Shares`` for the rows of that season.
    The predictions are ranked with ``add_ranks`` and scored with ``find_ap``.

    Returns a tuple ``(mean_ap, aps, all_predictions)``: the mean of the per-season
    scores, the list of per-season scores, and the concatenated ranked predictions.
    """
    season_scores = []
    season_frames = []
    for year in years:
        seasons = mvpfactors["Season"]
        train = mvpfactors[seasons < year]
        test = mvpfactors[seasons == year]

        model.fit(train[predictors], train["Award_Shares"])
        predicted = pd.DataFrame(
            model.predict(test[predictors]),
            columns=["predictions"],
            index=test.index,
        )
        ranked = add_ranks(
            pd.concat([test[["Player", "Award_Shares"]], predicted], axis=1)
        )

        season_frames.append(ranked)
        season_scores.append(find_ap(ranked))

    mean_score = sum(season_scores) / len(season_scores)
    return mean_score, season_scores, pd.concat(season_frames)


In [None]:
# Backtest the Ridge model over years[5:] and display the mean average precision.
mean_ap, aps, all_predictions = backtest(
    mvpfactors=mvpfactors,
    model=reg,
    years=years[5:],
    predictors=predictors,
)
mean_ap

In [None]:
# Largest gaps between predicted and actual rank among the top 5 MVP finishers.
# `Rk` starts at 1, so the top five are the rows with Rk <= 5; `Rk < 5` left out the
# fifth-place finisher. Sorting `Diff` in ascending order lists the most negative
# values first.
all_predictions[all_predictions["Rk"] <= 5].sort_values("Diff").head(10)

In [None]:
# Ridge coefficients (column 0) next to their predictor names (column 1),
# ordered from the largest coefficient to the smallest.
pd.concat(
    [pd.Series(reg.coef_), pd.Series(predictors)],
    axis=1,
).sort_values(by=0, ascending=False)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the same split as the Ridge model: fit on the seasons before 2022,
# predict the 2022 award shares.
rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

rftrain = mvpfactors[mvpfactors['Season'] < 2022]
rftest = mvpfactors[mvpfactors['Season'] == 2022]

rf.fit(rftrain[predictors], rftrain["Award_Shares"])
rfpredictions = pd.DataFrame(
    rf.predict(rftest[predictors]),
    columns=["rfpredictions"],
    index=rftest.index,
)
rfcombination = pd.concat([rftest[["Player", "Award_Shares"]], rfpredictions], axis=1)

# Actual_Rank: position when ordered by the actual award share.
rfcombination = rfcombination.sort_values("Award_Shares", ascending=False)
rfcombination["Actual_Rank"] = list(range(1, len(rfcombination) + 1))

# Predicted_Rank: position when ordered by the predicted award share.
rfcombination = rfcombination.sort_values("rfpredictions", ascending=False)
rfcombination["Predicted_Rank"] = list(range(1, len(rfcombination) + 1))

# Display in actual-voting order.
rfcombination = rfcombination.sort_values("Award_Shares", ascending=False)
rfcombination.head(40)

In [None]:
# Mean squared error for the random forest regression.
mean_squared_error(
    y_true=rfcombination["Award_Shares"],
    y_pred=rfcombination["rfpredictions"],
)