In [1]:
import pandas as pd


# Load Data



In [119]:
team_data = pd.read_csv('Team Summaries.csv')
player_data = pd.read_csv('Player Totals.csv')
award_data = pd.read_csv('Player Award Shares.csv')

award_data = award_data[award_data['award'] == 'nba mvp']



# I want to merge the data to include wins, losses, net_rating, offense/defense of rating as I feel it is important

In [120]:
# Remove whitespace if any
team_data['abbreviation'] = team_data['abbreviation'].str.strip()
player_data['tm'] = player_data['tm'].str.strip()


# Clean player names: strip spaces, normalize casing
award_data['player'] = award_data['player'].str.strip()
player_data['player'] = player_data['player'].str.strip()


In [121]:
team_stats = team_data[['season', 'abbreviation', 'w', 'l', 'o_rtg', 'd_rtg', 'n_rtg']]

# Merge the data 

In [122]:
merged_data = pd.merge(
    player_data,
    team_stats,
    how='left',
    left_on=['season', 'tm'], # merge on team for player stats
    right_on=['season', 'abbreviation'] # merge on team abbreviation for team stats
)

# Faced problem of duplicate player names in the award data
# By keeping TOT rows and removing duplicates, we can ensure that we have the correct player
traded = merged_data[merged_data['tm'] == 'TOT'][['player_id', 'season']]

# Filter out all rows for those player-season combos that are NOT 'TOT'
filtered_df = merged_data.merge(
    traded,
    on=['player_id', 'season'],
    how='left',
    indicator=True
)

# - Only 'TOT' rows for players who have a TOT entry
# - All rows for players without a TOT entry
final_df = filtered_df[
    (filtered_df['tm'] == 'TOT') |  # keep the TOT row
    (filtered_df['_merge'] == 'left_only')  # players with no TOT at all
].drop(columns=['_merge'])

final = pd.merge(
    final_df,
    award_data[['season', 'player', 'share']],  # Only bring the share column
    on=['season', 'player'],
    how='left'  # Keep all players
)

final.to_csv('Final.csv', index= False)

# Handle missing data


In [123]:
pd.isnull(final).sum()

seas_id             4
season              0
player_id           4
player              0
birth_year      24118
pos                 0
age                18
experience          0
lg                  0
tm                  0
g                   0
gs               6978
mp                792
fg                  0
fga                 0
fg_percent         95
x3p              5199
x3pa             5199
x3p_percent      8251
x2p                 0
x2pa                0
x2p_percent       149
e_fg_percent       95
ft                  0
fta                 0
ft_percent        816
orb              3788
drb              3788
trb               657
ast                 0
stl              4512
blk              4511
tov              4635
pf                  0
pts                 0
abbreviation     2962
w                2966
l                2966
o_rtg            3510
d_rtg            3510
n_rtg            3510
share           25349
dtype: int64

In [124]:
final[pd.isnull(final["x3p_percent"])][["player", "x3pa"]]

Unnamed: 0,player,x3pa
8,Adama Sanogo,0.0
9,Adem Bona,0.0
16,Alex Reese,0.0
32,Ariel Hukporti,0.0
40,Ben Simmons,0.0
...,...,...
26390,Walt Miller,
26391,Warren Fenley,
26392,Wilbert Kautz,
26393,Woody Grimshaw,


In [125]:
final = final.fillna(0)
pd.isnull(final).sum()

seas_id         0
season          0
player_id       0
player          0
birth_year      0
pos             0
age             0
experience      0
lg              0
tm              0
g               0
gs              0
mp              0
fg              0
fga             0
fg_percent      0
x3p             0
x3pa            0
x3p_percent     0
x2p             0
x2pa            0
x2p_percent     0
e_fg_percent    0
ft              0
fta             0
ft_percent      0
orb             0
drb             0
trb             0
ast             0
stl             0
blk             0
tov             0
pf              0
pts             0
abbreviation    0
w               0
l               0
o_rtg           0
d_rtg           0
n_rtg           0
share           0
dtype: int64

##### Although filling everything with 0 is not ideal for the purpose of this model it is fine. The specific stat that benefits is 3 point % as many are missing values because players did not attempt a 3

<span style ="font-size: 24px; font-family:Verdana"> Choosing the columns to keep </span>

In [126]:
final.columns

Index(['seas_id', 'season', 'player_id', 'player', 'birth_year', 'pos', 'age',
       'experience', 'lg', 'tm', 'g', 'gs', 'mp', 'fg', 'fga', 'fg_percent',
       'x3p', 'x3pa', 'x3p_percent', 'x2p', 'x2pa', 'x2p_percent',
       'e_fg_percent', 'ft', 'fta', 'ft_percent', 'orb', 'drb', 'trb', 'ast',
       'stl', 'blk', 'tov', 'pf', 'pts', 'abbreviation', 'w', 'l', 'o_rtg',
       'd_rtg', 'n_rtg', 'share'],
      dtype='object')

In [127]:
predictors = ['age',
       'experience', 'g', 'gs', 'mp', 'fg', 'fga', 'fg_percent',
       'x3p', 'x3pa', 'x3p_percent', 'x2p', 'x2pa', 'x2p_percent',
       'e_fg_percent', 'ft', 'fta', 'ft_percent', 'orb', 'drb', 'trb', 'ast',
       'stl', 'blk', 'tov', 'pf', 'pts', 'w', 'l', 'o_rtg',
       'd_rtg', 'n_rtg']

<span style ="font-size: 24px; font-family:Verdana"> Training and Test data </span>

***

In [128]:
train_data = final[final['season'] < 2020]
test_data = final[final['season'] == 2021]

In [129]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1) #The alpha parameter is the regularization strength. A higher value means more regularization.

In [130]:
reg.fit(train_data[predictors], train_data["share"])

In [131]:
predictions = reg.predict(test_data[predictors])
predictions = pd.DataFrame(predictions, columns=["predicted_share"], index=test_data.index) 
predictions

Unnamed: 0,predicted_share
2256,0.020563
2257,-0.000971
2258,-0.007589
2259,0.009086
2260,-0.001441
...,...
2791,0.000816
2792,0.002127
2793,0.081361
2794,0.002957


In [132]:
combination = pd.concat([test_data[["player", "share"]], predictions], axis=1)

In [133]:
combination

Unnamed: 0,player,share,predicted_share
2256,Aaron Gordon,0.0,0.020563
2257,Aaron Holiday,0.0,-0.000971
2258,Aaron Nesmith,0.0,-0.007589
2259,Abdel Nader,0.0,0.009086
2260,Adam Mokoka,0.0,-0.001441
...,...,...,...
2791,Yogi Ferrell,0.0,0.000816
2792,Yuta Watanabe,0.0,0.002127
2793,Zach LaVine,0.0,0.081361
2794,Zeke Nnaji,0.0,0.002957


In [134]:
combination.sort_values(by="share", ascending=False).head(20)

Unnamed: 0,player,share,predicted_share
2655,Nikola Jokić,0.961,0.163042
2516,Joel Embiid,0.58,0.122991
2726,Stephen Curry,0.449,0.138977
2440,Giannis Antetokounmpo,0.345,0.176619
2333,Chris Paul,0.138,0.082006
2595,Luka Dončić,0.042,0.153657
2351,Damian Lillard,0.038,0.130474
2537,Julius Randle,0.02,0.100385
2384,Derrick Rose,0.01,0.031014
2706,Rudy Gobert,0.008,0.120627


In [135]:
from sklearn.metrics import mean_squared_error

mean_squared_error(test_data["share"], combination["predicted_share"])

0.0025347920763874056

In [136]:
combination["share"].value_counts()

share
0.000    525
0.001      3
0.003      1
0.138      1
0.038      1
0.010      1
0.345      1
0.580      1
0.020      1
0.042      1
0.961      1
0.008      1
0.005      1
0.449      1
Name: count, dtype: int64

In [137]:
combination = combination.sort_values(by="share", ascending=False)
combination["Rank"] = list(range(1, combination.shape[0] + 1))

In [138]:
combination.head(20)

Unnamed: 0,player,share,predicted_share,Rank
2655,Nikola Jokić,0.961,0.163042,1
2516,Joel Embiid,0.58,0.122991,2
2726,Stephen Curry,0.449,0.138977,3
2440,Giannis Antetokounmpo,0.345,0.176619,4
2333,Chris Paul,0.138,0.082006,5
2595,Luka Dončić,0.042,0.153657,6
2351,Damian Lillard,0.038,0.130474,7
2537,Julius Randle,0.02,0.100385,8
2384,Derrick Rose,0.01,0.031014,9
2706,Rudy Gobert,0.008,0.120627,10


In [139]:
combination = combination.sort_values(by="predicted_share", ascending=False)
combination["Predicted_rank"] = list(range(1, combination.shape[0] + 1)) 

combination.head(10)

Unnamed: 0,player,share,predicted_share,Rank,Predicted_rank
2440,Giannis Antetokounmpo,0.345,0.176619,4,1
2655,Nikola Jokić,0.961,0.163042,1,2
2595,Luka Dončić,0.042,0.153657,6,3
2726,Stephen Curry,0.449,0.138977,3,4
2351,Damian Lillard,0.038,0.130474,7,5
2708,Russell Westbrook,0.005,0.127979,11,6
2516,Joel Embiid,0.58,0.122991,2,7
2706,Rudy Gobert,0.008,0.120627,10,8
2756,Trae Young,0.0,0.108671,149,9
2795,Zion Williamson,0.0,0.106998,540,10


In [140]:
combination.sort_values(by="share", ascending=False).head(10)

Unnamed: 0,player,share,predicted_share,Rank,Predicted_rank
2655,Nikola Jokić,0.961,0.163042,1,2
2516,Joel Embiid,0.58,0.122991,2,7
2726,Stephen Curry,0.449,0.138977,3,4
2440,Giannis Antetokounmpo,0.345,0.176619,4,1
2333,Chris Paul,0.138,0.082006,5,21
2595,Luka Dončić,0.042,0.153657,6,3
2351,Damian Lillard,0.038,0.130474,7,5
2537,Julius Randle,0.02,0.100385,8,13
2384,Derrick Rose,0.01,0.031014,9,90
2706,Rudy Gobert,0.008,0.120627,10,8


In [141]:
def get_top_players(combination):
    """
    Get the top players based on actual share and predicted share.
    """
    actual = combination.sort_values(by="share", ascending=False).head(5)
    predicted = combination.sort_values(by="predicted_share", ascending=False)
    ps = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row["player"] in actual["player"].values:
            found += 1
            ps.append(found/seen)
        seen += 1

    return sum(ps)/len(ps)

In [142]:
get_top_players(combination)

0.7119047619047618

In [143]:
valid_shares = final[final['share'].notnull() & (final['share'] > 0)]
min_season = valid_shares['season'].min()
print(f"Minimum season with valid mvp share: {min_season}")

years = list(range(min_season, 2025))




Minimum season with valid mvp share: 1956


In [144]:
aps = []
all_pred = []
for year in years[5:]:
    train = final[final['season'] < year]
    test = final[final['season'] == year]

    reg.fit(train[predictors], train["share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=["predicted_share"], index=test.index) 
    combination = pd.concat([test[["player", "share"]], predictions], axis=1)
    all_pred.append(combination)
    aps.append(get_top_players(combination))




In [145]:
sum(aps)/len(aps)

0.7340178835224459

In [146]:
def add_ranks(combination):
    """
    Add ranks to the predictions DataFrame.
    """
    combination = combination.sort_values(by="share", ascending=False)
    combination["Rank"] = list(range(1, combination.shape[0] + 1))
    combination = combination.sort_values(by="predicted_share", ascending=False)
    combination["Predicted_rank"] = list(range(1, combination.shape[0] + 1))
    combination["Difference"] = combination["Rank"] - combination["Predicted_rank"]
    return combination

In [147]:
ranking = add_ranks(all_pred[1])
ranking[ranking["Rank"]< 6].sort_values(by="Difference", ascending=False)

Unnamed: 0,player,share,predicted_share,Rank,Predicted_rank,Difference
24571,Wilt Chamberlain,0.358,0.387997,2,1,1
24471,Bill Russell,0.699,0.196909,1,2,-1
24538,Oscar Robertson,0.318,0.130589,3,5,-2
24501,Elgin Baylor,0.193,0.121643,4,6,-2
24522,Jerry West,0.141,0.108006,5,7,-2


In [148]:
def backtest(stats, model, year, predictors):
    """
    Backtest the model.
    """
    aps = []
    all_pred = []
    for year in years[5:]:
        train = stats[stats['season'] < year]
        test = stats[stats['season'] == year]

        model.fit(train[predictors], train["share"])
        predictions = reg.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=["predicted_share"], index=test.index) 
        combination = pd.concat([test[["player", "share"]], predictions], axis=1)
        all_pred.append(combination)
        aps.append(get_top_players(combination))
    return sum(aps)/len(aps), aps, pd.concat(all_pred)

In [149]:
mean_ap, aps, all_pred = backtest(final, reg, years[5:], predictors)
mean_ap

0.7340178835224459

In [150]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(by=0, ascending=False)

Unnamed: 0,0,1
0,0.000438,age
2,0.000233,g
23,0.00018,blk
30,0.00016,d_rtg
27,0.000143,w
22,0.00014,stl
16,0.000117,fta
21,9.7e-05,ast
26,9.6e-05,pts
20,9.6e-05,trb


In [151]:
final['Ntm'] = final['tm'].astype('category').cat.codes
final['Npos'] = final['pos'].astype('category').cat.codes

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

mean_ap, aps, all_pred = backtest(final, rf, years[64:], predictors)



In [None]:
mean_ap