In [1]:
%pip install html5lib lxml beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os #interacts with the operating systems ( such as creating folders/checking files)
import pandas as pd #used to read and clean the data 
import shutil #copying/moving files or directories

from bs4 import BeautifulSoup as bs #extract info such as player stats from a table.
import requests #requests usd to get HTML pages
import warnings; warnings.filterwarnings("ignore") #suppress python warnings, may remove this though
import time #helps overloading 
# unhide all rows and columns #below adjusts how the data is displayed.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)



In [3]:
url = "https://www.basketball-reference.com/awards/mvp.html#mvp_NBA" #this is the web page we want to scrape
data = requests.get(url) #sends a GET request to the server and gets the pages html, and the content from the html page is stored now in data 
soup = bs(data.text, "html.parser") #converts the text into smth that is searchable now


table = soup.find('table', id='mvp_NBA') 


#mvp_df = pd.read_html(str(table), header=1)[0]
if table is None:
    mvp_df = pd.read_html(url, attrs={'id': 'mvp_NBA'}, header=1, flavor="lxml")[0]
else:
    mvp_df = pd.read_html(str(table), header=1, flavor="lxml")[0]

mvp_df.columns = [col.strip() for col in mvp_df.columns] 

#mvp_df = mvp_df.dropna(subset=[mvp_df.columns[0]]) , test code
mvp_df = mvp_df[mvp_df['Season'].astype(str).str[:4].astype(int) >= 2015]

#mvp_df.columns = ['Seasons', 'Players'], test code

mvp_df = mvp_df.reset_index(drop=True)
mvp_df.index = mvp_df.index + 1 #now instead of starting at 0 it will start at 1

print (mvp_df)



     Season   Lg                   Player Voting  Age   Tm   G    MP   PTS   TRB   AST  STL  BLK    FG%    3P%    FT%    WS  WS/48
1   2024-25  NBA  Shai Gilgeous-Alexander    (V)   26  OKC  76  34.2  32.7   5.0   6.4  1.7  1.0  0.519  0.375  0.898  16.7  0.309
2   2023-24  NBA            Nikola JokiÄ    (V)   28  DEN  79  34.6  26.4  12.4   9.0  1.4  0.9  0.583  0.359  0.817  17.0  0.299
3   2022-23  NBA              Joel Embiid    (V)   28  PHI  66  34.6  33.1  10.2   4.2  1.0  1.7  0.548  0.330  0.857  12.3  0.259
4   2021-22  NBA            Nikola JokiÄ    (V)   26  DEN  74  33.5  27.1  13.8   7.9  1.5  0.9  0.583  0.337  0.810  15.2  0.296
5   2020-21  NBA            Nikola JokiÄ    (V)   25  DEN  72  34.6  26.4  10.8   8.3  1.3  0.7  0.566  0.388  0.868  15.6  0.301
6   2019-20  NBA    Giannis Antetokounmpo    (V)   25  MIL  63  30.4  29.5  13.6   5.6  1.0  1.0  0.553  0.304  0.633  11.1  0.279
7   2018-19  NBA    Giannis Antetokounmpo    (V)   24  MIL  72  32.8  27.7  12.5   

In [4]:
#print (mvp_df.info())
print (mvp_df.head())

    Season   Lg                   Player Voting  Age   Tm   G    MP   PTS   TRB  AST  STL  BLK    FG%    3P%    FT%    WS  WS/48
1  2024-25  NBA  Shai Gilgeous-Alexander    (V)   26  OKC  76  34.2  32.7   5.0  6.4  1.7  1.0  0.519  0.375  0.898  16.7  0.309
2  2023-24  NBA            Nikola JokiÄ    (V)   28  DEN  79  34.6  26.4  12.4  9.0  1.4  0.9  0.583  0.359  0.817  17.0  0.299
3  2022-23  NBA              Joel Embiid    (V)   28  PHI  66  34.6  33.1  10.2  4.2  1.0  1.7  0.548  0.330  0.857  12.3  0.259
4  2021-22  NBA            Nikola JokiÄ    (V)   26  DEN  74  33.5  27.1  13.8  7.9  1.5  0.9  0.583  0.337  0.810  15.2  0.296
5  2020-21  NBA            Nikola JokiÄ    (V)   25  DEN  72  34.6  26.4  10.8  8.3  1.3  0.7  0.566  0.388  0.868  15.6  0.301


In [5]:
avg_age = mvp_df['Age'].mean() #this calculates the average age of the mvps since 2015
print ("Average age of the MVP since the 2015 season -> ", avg_age) #this prints it out


Average age of the MVP since the 2015 season ->  26.5


In [6]:
team_data = []

for year in range(2015,2026):
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}_standings.html"
    data = requests.get(url)
    soup = bs(data.text, "html.parser")
    tables = soup.find_all("table")
    for table in tables:
        df = pd.read_html(str(table))[0]
        df["Year"] = year
        team_data.append(df)

    #^that above collected the team data from 2015 to current, including standings.

teams_df = pd.concat(team_data,ignore_index=True)

#cleaning and merging the east/west standings into one table down below~

stat_cols_all = ["W", "L", "W/L%", "GB", "PS/G", "PA/G", "SRS"]
stat_cols = [x for x in stat_cols_all if x in teams_df.columns] # it loops over the columns for whatever of the columns are found

base_cols = ["Year"] + stat_cols #makes the base columns = to years and the statistics found

print (stat_cols, base_cols)

cols =["Eastern Conference"] + base_cols
east = teams_df[cols].copy()
east = east.dropna( subset=["Eastern Conference"])
var_div = east["Eastern Conference"].astype(str).str.contains("Division", na=False)
east = east[~var_div]
east = east.rename(columns={"Eastern Conference": "Teams"})
east["Conference"]= "East"
print(east.head(),east.shape)

cols = ["Western Conference"] + base_cols
west = teams_df[cols].copy()
west = west.dropna(subset=["Western Conference"])
var_div = west["Western Conference"].astype(str).str.contains("Division", na=False)
west = west[~var_div]
west = west.rename(columns={"Western Conference": "Teams"})
west["Conference"]= "West"
print(west.head(),west.shape)

teams_tidy = pd.concat([east, west], ignore_index= True)
print (teams_tidy.shape)

east = east.rename(columns={'Teams':'Team'}) #renaming the teams in the columns from Teams to Team
west = west.rename(columns={'Teams':'Team'})

def clean_team_name(t):
    if pd.isna(t):
        return t #means if its empty return it back but if not then ->
    
    return str(t).replace('x','').replace('*','').replace('z','').replace('y','').strip()     #this cleans the Teams and removes the uneccesary stuff such as : stars, extra letters etc.


east['Team'] = east['Team'].apply(clean_team_name) #we call the clean function and this cleans whatever we apply it top 
west['Team'] = west['Team'].apply(clean_team_name)

df = pd.concat([east,west], ignore_index=True)

for col in ["W","L","GB","PS/G","PA/G","SRS"]:
    if col in df.columns:
        df[col]= pd.to_numeric(df[col], errors = "coerce")

df["W/L%"] = pd.to_numeric(df["W/L%"].astype(str).str.replace("-", ""), errors="coerce")

df["ConferenceRank"]=(
    df.groupby(["Year","Conference"])["W/L%"]
    .rank(method="min", ascending=False)
)

print (df.shape)
print (df.head())
print (df.dtypes)
print (df["Team"].nunique())

#df = pd.concat([east,west], ignore_index=True)


['W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS'] ['Year', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']
                Teams  Year   W   L  W/L%    GB   PS/G   PA/G    SRS Conference
1    Toronto Raptors*  2015  49  33  .598     —  104.0  100.9   2.45       East
2     Boston Celtics*  2015  40  42  .488   9.0  101.4  101.2  -0.40       East
3      Brooklyn Nets*  2015  38  44  .463  11.0   98.0  100.9  -3.13       East
4  Philadelphia 76ers  2015  18  64  .220  31.0   92.0  101.0  -9.04       East
5     New York Knicks  2015  17  65  .207  32.0   91.9  101.2  -9.50       East (315, 10)
                      Teams  Year   W   L  W/L%    GB   PS/G   PA/G    SRS Conference
19  Portland Trail Blazers*  2015  51  31  .622     —  102.8   98.6   4.41       West
20    Oklahoma City Thunder  2015  45  37  .549   6.0  104.0  101.8   2.47       West
21                Utah Jazz  2015  38  44  .463  13.0   95.1   94.9   0.71       West
22           Denver Nuggets  2015  30  52  .366  21.0  101.5  

In [7]:
#merge the MVP data and the standings

mvp_df["Year"] = (mvp_df["Season"].astype(str).str[:4].astype(int) + 1)

team_map = {"DEN": "Denver Nuggets", 
    "PHI": "Philadelphia 76ers",  
    "MIL": "Milwaukee Bucks", 
    "HOU": "Houston Rockets",
    "OKC": "Oklahoma City Thunder",
    "GSW": "Golden State Warriors",
    "CLE": "Cleveland Cavaliers",
    "LAL": "Los Angeles Lakers",
    "DAL": "Dallas Mavericks",
    "BOS": "Boston Celtics",
    "MIA": "Miami Heat",
    "PHX": "Phoenix Suns",
    "NYK": "New York Knicks",
    "SAS": "San Antonio Spurs",
    "TOR": "Toronto Raptors",
    "BKN": "Brooklyn Nets",
    "ORL": "Orlando Magic",
    "WIZ": "Washington Wizards",
    "MAVS": "Dallas Mavericks",
    "MEM": "Memphis Grizzlies",
    "UTH": "Utahd Jazz",
    "ATL": "Atlanta Hawks",
    "NOP": "New Orleans Pelicans",
    "IND": "Indiana Pacers",
    "DET": "Detroit Pistons",
    "LAC": "Los Angeles Clippers",
    "POR": "Portland Trailblazers",
    "CHA": "Charlotte Hornets",
    "CHI": "Chicago Bulls",
    "SAC": "Sacramento Kings"}

mvp_df ["Team"] = (
    mvp_df["Tm"]
        .map(team_map)
        .fillna(mvp_df["Tm"])
)

mvp_keep = mvp_df[["Player", "Age", "Year", "Team"]]
standings_keep = df[["Year", "Team", "W/L%", "Conference", "ConferenceRank", "SRS"]]

merged = pd.merge(
    mvp_keep, standings_keep, on=["Year", "Team"], how="left"
)

print(merged.head()[["Player", "Year", "Team", "W/L%", "Conference", "ConferenceRank"]])

if "ConferenceRank" in merged.columns:
    merged["ConferenceRank"] = merged["ConferenceRank"].astype("Int64")

print(merged.shape)
print(merged.columns.tolist())
print(merged.head(10))
print(merged.isna().sum())

#soup = bs(data.text.encode("utf-8"), "html.parser")#


                    Player  Year                   Team   W/L% Conference  ConferenceRank
0  Shai Gilgeous-Alexander  2025  Oklahoma City Thunder    NaN        NaN             NaN
1            Nikola JokiÄ  2024         Denver Nuggets  0.695       West             1.0
2            Nikola JokiÄ  2024         Denver Nuggets  0.695       West             1.0
3              Joel Embiid  2023     Philadelphia 76ers  0.659       East             5.0
4              Joel Embiid  2023     Philadelphia 76ers  0.659       East             5.0
(18, 8)
['Player', 'Age', 'Year', 'Team', 'W/L%', 'Conference', 'ConferenceRank', 'SRS']
                    Player  Age  Year                   Team   W/L% Conference  ConferenceRank   SRS
0  Shai Gilgeous-Alexander   26  2025  Oklahoma City Thunder    NaN        NaN            <NA>   NaN
1            Nikola JokiÄ   28  2024         Denver Nuggets  0.695       West               1  5.23
2            Nikola JokiÄ   28  2024         Denver Nuggets  0.695 

In [8]:
print("Average Win to Loss % of the MVP's team:" , merged["W/L%"].mean())
print("Average Conference position: ", merged["ConferenceRank"].mean())
print("Average point differential of team: ", merged["SRS"].mean())

print("\nConference Rank distribution: ")
print(merged["ConferenceRank"].value_counts().sort_index())

#rank_counts = merged["ConferenceRank"].value_counts().sort_index()

#print("MVP count by Conference Rank: \n")
#for rank , count in rank_counts.items():
    #print(f"Conference Rank {rank}: {count} MVPs")
                                      
merged["Top5"] = merged["ConferenceRank"] <= 5
print("\n% of MVPs from Top 5 teams:", merged["Top5"].mean() * 100)

Average Win to Loss % of the MVP's team: 0.7217500000000001
Average Conference position:  3.25
Average point differential of team:  6.5775

Conference Rank distribution: 
ConferenceRank
1     10
5      4
11     2
Name: count, dtype: Int64

% of MVPs from Top 5 teams: 87.5


In [9]:
#now we use machine to rank who has the best chance of winning mvp for the upcoming season and rank the top 10.

target_year = df["Year"].max() #finds the maximum year of the years we have, since mvp data is most helpful when recent

url = f"https://www.basketball-reference.com/leagues/NBA_2026_per_game.html"
data_players = requests.get(url)
soup = bs(data_players.text, "html.parser")

player_df  = pd.read_html(str(soup), header = 0)[0]

#print (player_df.columns)
#print (player_df.head(15))

stat_cols = ["PTS", "AST", "TRB", "FG%", "BLK", "STL"]

player_df = player_df[player_df["Player"].notna()].copy()
player_df = player_df[player_df["Player"] != "Player"].copy()

player_df = player_df[~player_df["Team"].astype(str).str.contains("TM")].copy()

print (player_df.columns)
print (player_df.head(15))

for col in stat_cols:
    if col in player_df.columns:
        player_df[col] = pd.to_numeric(player_df[col], errors = "coerce")

print (player_df[stat_cols].dtypes)

player_df = (
    player_df.sort_values("G", ascending=False).groupby("Player", as_index=False).head(1) #this code fixes it
)

print (player_df[player_df["Player"].str.contains("Luka", na=False)]) #this fixes the luka problem where luka is printed 2x

Index(['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards'], dtype='object')
      Rk                   Player   Age Team Pos     G    GS    MP    FG   FGA    FG%   3P   3PA    3P%    2P   2PA    2P%   eFG%   FT   FTA    FT%  ORB  DRB   TRB   AST  STL  BLK  TOV   PF   PTS  Awards
0    1.0            Luka DonÄiÄ  26.0  LAL  PG  30.0  30.0  36.5  10.5  22.6  0.465  3.3  10.2  0.320   7.2  12.4  0.583  0.537  9.4  11.9  0.789  0.9  7.1   8.0   8.7  1.7  0.5  4.5  2.6  33.6     NaN
1    2.0  Shai Gilgeous-Alexander  27.0  OKC  PG  38.0  38.0  33.3  10.9  19.9  0.547  2.0   5.1  0.396   8.9  14.8  0.599  0.597  8.1   9.2  0.888  0.4  4.0   4.5   6.4  1.4  0.7  1.9  2.1  31.9     NaN
2    3.0             Tyrese Maxey  25.0  PHI  PG  36.0  36.0  39.6  10.8  22.4  0.480  3.8   9.2  0.409   7.0  13.3  0.528  0.563  5.7   6.5  0.876  

In [10]:
target_year = 2026
teams_current = df[df["Year"] == target_year].copy()

name_to_abbr = {full: abbr for abbr, full in team_map.items()}
teams_current["Team_abbr"] = teams_current["Team"].map(name_to_abbr)

players_df = player_df.merge(
    teams_current[["Team_abbr", "W/L%"]],
    left_on = "Team", right_on = "Team_abbr", how = "left"
)

players_df = players_df.rename(columns={"W/L%": "team_win_pct"})

print(players_df[["Player", "Team", "team_win_pct"]].head(20))

                   Player Team  team_win_pct
0          Jeremiah Fears  NOP           NaN
1             Jamal Shead  TOR           NaN
2             Gradey Dick  TOR           NaN
3           DeMar DeRozan  SAC           NaN
4             Deni Avdija  POR           NaN
5            Desmond Bane  ORL           NaN
6          Scottie Barnes  TOR           NaN
7            Quinten Post  GSW           NaN
8          Toumani Camara  POR           NaN
9           Julius Randle  MIN           NaN
10             Tyus Jones  ORL           NaN
11            Rudy Gobert  MIN           NaN
12          Naji Marshall  DAL           NaN
13            Derik Queen  NOP           NaN
14          Anthony Black  ORL           NaN
15      Russell Westbrook  SAC           NaN
16  Sandro Mamukelashvili  TOR           NaN
17          Jarace Walker  IND           NaN
18               Jay Huff  IND           NaN
19             Sion James  CHO           NaN


In [11]:
stat_cols = ["PTS", "AST", "TRB", "STL", "BLK", "FG%", "team_win_pct"]

print (players_df.columns.tolist())

for col in stat_cols:
    col_min = players_df[col].min()
    col_max = players_df[col].max()

    if col_min == col_max:
        players_df[col +"_norm"] = 0.5
    else:
        players_df[col +"_norm"] = (players_df[col] - col_min) / (col_max - col_min)

    print(player_df.columns)

['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards', 'Team_abbr', 'team_win_pct']
Index(['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards'], dtype='object')
Index(['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards'], dtype='object')
Index(['Rk', 'Player', 'Age', 'Team', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Awards'], dtype='object')
Index(['Rk', 'Player', 'Age', 'Team', 

In [12]:
players_df["Player"] = (
    players_df["Player"].str.encode("latin1").str.decode("utf-8"))

players_df = players_df.drop_duplicates(subset=["Player"])

players_df["mvp_score"] =( 
 #this is the formula 
    0.20 * players_df["PTS_norm"] +
    0.30 * players_df["team_win_pct_norm"] +
    0.15 * players_df["AST_norm"] +
    0.025 * players_df["STL_norm"] +
    0.10 * players_df["TRB_norm"] +
    0.025 * players_df["BLK_norm"] +
    0.20 * players_df["FG%_norm"] 

)


players_df = players_df.sort_values("mvp_score", ascending=False)

print(
    players_df[["Player", "Team", "PTS", "AST", "TRB", "team_win_pct", "mvp_score"]]
.head(10)
)

           Player Team   PTS  AST  TRB  team_win_pct  mvp_score
0  Jeremiah Fears  NOP  14.4  3.1  3.6           NaN        NaN
1     Jamal Shead  TOR   7.0  5.4  1.7           NaN        NaN
2     Gradey Dick  TOR   6.0  0.7  1.9           NaN        NaN
3   DeMar DeRozan  SAC  19.0  3.9  3.3           NaN        NaN
4     Deni Avdija  POR  26.1  6.9  7.1           NaN        NaN
5    Desmond Bane  ORL  19.2  4.5  4.6           NaN        NaN
6  Scottie Barnes  TOR  19.3  5.3  8.3           NaN        NaN
7    Quinten Post  GSW   8.1  1.5  3.9           NaN        NaN
8  Toumani Camara  POR  13.1  2.7  5.2           NaN        NaN
9   Julius Randle  MIN  22.0  5.7  7.1           NaN        NaN
