In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [16]:
# Season-level three-point table; the first CSV column is used as the row index.
season_stats_path = 'CleanedData.csv'
df_all = pd.read_csv(season_stats_path, index_col=0)

# Quick sanity check on size, then preview the first five rows.
print(df_all.shape)
df_all.head(5)

(9151, 5)


Unnamed: 0_level_0,Season,ThreePointers,ThreePointersAttempted,ThreePointersMissed,ThreePointersMadePct
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A.J. Hammons,2017.0,5,10,5,0.5
A.J. Lawson,2023.0,10,25,15,0.4
A.J. Lawson,2024.0,12,36,24,0.333333
A.J. Price,2010.0,60,174,114,0.344828
A.J. Price,2011.0,48,165,117,0.290909


In [17]:
# Historical 3 point contest results.
# Missing values are replaced with 0 in every column straight after loading.
df_historical = pd.read_csv('3PtHistoricalResults.csv').fillna(0)
df_historical.head()

Unnamed: 0,name,made,att,dewmade,dewatt
0,Seth Curry,13,25,0.0,0.0
1,Stephen Curry,173,225,3.0,4.0
2,Buddy Hield,91,125,0.0,8.0
3,Damian Lillard,45,75,3.0,4.0
4,Kyle Korver,14,25,0.0,0.0


In [18]:
# Generate a player's 'ability' to make a 3 pointer

# Work on an independent (deep) copy so df_all itself is never modified below.
df = df_all.copy(deep=True)

# One entry per distinct index label, in order of first appearance.
players = df.index.unique()
len(players)

1961

In [19]:
# Column-oriented accumulator for the per-player metrics. Each key maps to its
# own, initially empty, list:
#   Player            - player identifier
#   Avg_Weighted      - weighted average of 3P% (recent seasons weighted more)
#   Avg_Not_Weighted  - simple average of all 3P% values
player_ability_dict = {
    column: [] for column in ("Player", "Avg_Weighted", "Avg_Not_Weighted")
}

In [20]:
# Process each player's historical data.
#
# The result lists are re-created inside this cell (not only appended to), so
# re-running the cell cannot stack a second copy of every player onto the
# output of a previous run.

# Upper bound on how many of the latest rows feed the weighted average.
MAX_RECENT_SEASONS = 4


def summarize_three_point_ability(df_player, max_recent=MAX_RECENT_SEASONS):
    """Return ``(weighted_avg, simple_avg)`` of one player's season 3P%.

    Parameters
    ----------
    df_player : pandas.DataFrame
        Rows belonging to a single player. Must contain the columns
        'Season' and 'ThreePointersMadePct'.
    max_recent : int, default MAX_RECENT_SEASONS
        Maximum number of latest rows used for the weighted average.

    Returns
    -------
    tuple of (float, float)
        weighted_avg : average of the last ``n`` 3P% values with linear
            weights 1..n (latest row gets weight n), where
            ``n = min(number of distinct Season values, max_recent)``.
        simple_avg : unweighted mean of every 3P% value of the player.
        A NaN among the contributing values propagates to the result; an
        empty frame yields ``(nan, nan)``.
    """
    # Enforce chronological order instead of assuming it: the recency weights
    # are only meaningful if the last rows really are the latest seasons.
    # A stable sort keeps the existing order of rows that share a season, and
    # rows with a missing Season are placed first (treated as oldest).
    ordered = df_player.sort_values('Season', kind='stable', na_position='first')
    pct = ordered['ThreePointersMadePct'].to_numpy(dtype=float)
    if pct.size == 0:
        return np.nan, np.nan

    # The window is counted in distinct seasons (NaN counts as one value) but
    # applied to rows. NOTE(review): if a player can have several rows for the
    # same season, the window may cover fewer seasons than intended - confirm
    # that the input holds one row per player and season.
    window = min(ordered['Season'].nunique(dropna=False), max_recent)

    # Linear weights [1, 2, ..., n]; a decay scheme could be swapped in here.
    recency_weights = np.arange(1, window + 1)
    weighted_avg = np.average(pct[-window:], weights=recency_weights)

    simple_avg = np.average(pct)
    return weighted_avg, simple_avg


player_ability_dict = {"Player": [], "Avg_Weighted": [], "Avg_Not_Weighted": []}

# A single grouped pass replaces one boolean scan of the whole frame per
# player. sort=False yields the players in order of first appearance, i.e. the
# same order as df.index.unique().
for player, df_player in df.groupby(level=0, sort=False):
    weighted_avg, no_weight_avg = summarize_three_point_ability(df_player)
    player_ability_dict['Player'].append(player)
    player_ability_dict['Avg_Weighted'].append(weighted_avg)
    player_ability_dict['Avg_Not_Weighted'].append(no_weight_avg)

# Convert results dictionary to dataframe
player_ability_df = pd.DataFrame.from_dict(player_ability_dict)
print(player_ability_df.shape)
player_ability_df.head()

(1961, 3)


Unnamed: 0,Player,Avg_Weighted,Avg_Not_Weighted
0,A.J. Hammons,0.5,0.5
1,A.J. Lawson,0.355556,0.366667
2,A.J. Price,0.286627,0.302846
3,AJ Green,0.401978,0.406245
4,AJ Griffin,0.311805,0.331344


In [21]:
# Turn the index of df_all into a regular column (referenced below as 'Name')
# so it can be used as a merge key.
df_reset = df_all.reset_index()

# Keep only the 2024 rows, reduce them to name + 3P%, and relabel both columns
# so they line up with the ability table.
is_2024 = df_reset['Season'] == 2024
df_2024 = df_reset.loc[is_2024, ['Name', 'ThreePointersMadePct']].rename(
    columns={'Name': 'Player', 'ThreePointersMadePct': '2024_Avg_Pct'}
)

# Build the ability table from player_ability_dict and attach the 2024 3P%.
# Inner join: only players present in both tables survive.
df_ability = pd.DataFrame.from_dict(player_ability_dict).merge(
    df_2024, on='Player', how='inner'
)

df_ability

Unnamed: 0,Player,Avg_Weighted,Avg_Not_Weighted,2024_Avg_Pct
0,A.J. Lawson,0.355556,0.366667,0.333333
1,AJ Green,0.401978,0.406245,0.393443
2,AJ Griffin,0.311805,0.331344,0.272727
3,Aaron Gordon,0.323808,0.317033,0.293333
4,Aaron Holiday,0.401642,0.385910,0.406504
...,...,...,...,...
498,Zach Collins,0.333136,0.334826,0.290323
499,Zach LaVine,0.371829,0.374655,0.349112
500,Zeke Nnaji,0.339148,0.363858,0.312500
501,Ziaire Williams,0.295876,0.295684,0.314103


In [22]:
# Each player's actual make % in the historical 3 point contest data:
#   Actual    = made / att
#   dewActual = dewmade / dewatt
# A 0/0 division produces NaN, which the trailing fillna(0) turns into 0.
# NOTE(review): a non-zero numerator over a zero denominator would give inf,
# which fillna leaves untouched - confirm no such rows exist.
df_historical = df_historical.assign(
    Actual=lambda contest: contest['made'] / contest['att'],
    dewActual=lambda contest: contest['dewmade'] / contest['dewatt'],
).fillna(0)
df_historical.head()

Unnamed: 0,name,made,att,dewmade,dewatt,Actual,dewActual
0,Seth Curry,13,25,0.0,0.0,0.52,0.0
1,Stephen Curry,173,225,3.0,4.0,0.768889,0.75
2,Buddy Hield,91,125,0.0,8.0,0.728,0.0
3,Damian Lillard,45,75,3.0,4.0,0.6,0.75
4,Kyle Korver,14,25,0.0,0.0,0.56,0.0


In [None]:
# Model each player's 3 point make ability specifically in the 3 point contest

# Use the weighted average as the player's global ability, exposed as 'Make3Perc'.
df_global_ability = df_ability[['Player', 'Avg_Weighted']].rename(
    columns={'Avg_Weighted': 'Make3Perc'}
)

# Only keep the global abilities of players who have played in the 3 point
# contest previously (exact match on the name string); missing values become 0.
contest_names = df_historical['name']
df_global_players = df_global_ability[
    df_global_ability['Player'].isin(contest_names)
].fillna(0)

print(df_global_players.shape)
print(df_historical.shape)

# Contest participants with no matching row in df_global_players.
# The lookup set is built once instead of rebuilding a list for every name.
# NOTE(review): df_global_players derives from df_ability, so a name ends up
# here whenever it has no exact match in that table - confirm whether that
# means missing data or just a spelling difference.
known_players = set(df_global_players['Player'])
no_data_players = [p for p in contest_names.tolist() if p not in known_players]
print(no_data_players)

# Per the original analysis, none of these players compete in the upcoming
# 3 point contest, so their historical rows are dropped.
# df_global_players needs no matching filter: by construction it contains none
# of the names in no_data_players.
df_historical = df_historical[~df_historical['name'].isin(no_data_players)]

print(df_global_players.shape)
print(df_historical.shape)