In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
import warnings
warnings.filterwarnings("ignore")

## Rebound Predictor
The following is a Linear Regression model designed to predict the (x, y) location of a rebound on [the court](img/bball_court_north.png) given the (x, y) coordinates of a shot. The model is trained and tested on [data/all_data_enhanced.csv](data/all_data_enhanced.csv), which was built from hand-recorded observations of shots and rebounds during amateur basketball games.

The objective of this model is to build a user-friendly app that allows a user to click anywhere on a court, to represent a shot, and have the model display the best possible place for a player to position themself for a rebound.

In [3]:
# Read the shot/rebound dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer="data/all_data_enhanced.csv")

In [4]:
# Pairwise correlations between the shot features and the rebound coordinates
correlation_cols = ['shot_x', 'shot_y', 'shot_distance_feet', 'rebound_x', 'rebound_y']
correlation_matrix = df.loc[:, correlation_cols].corr()

# Leave the matrix as the last expression so the notebook renders it
correlation_matrix

Unnamed: 0,shot_x,shot_y,shot_distance_feet,rebound_x,rebound_y
shot_x,1.0,-0.089852,0.014989,-0.035064,0.038844
shot_y,-0.089852,1.0,-0.758828,0.032679,0.268957
shot_distance_feet,0.014989,-0.758828,1.0,-0.018046,-0.278209
rebound_x,-0.035064,0.032679,-0.018046,1.0,0.066753
rebound_y,0.038844,0.268957,-0.278209,0.066753,1.0


In [5]:
# Features 'X': the shot's pixel coordinates
X = df.loc[:, ['shot_x', 'shot_y']]

# Targets 'y': both rebound coordinates, predicted jointly by one model
y = df.loc[:, ['rebound_x', 'rebound_y']]

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)

In [7]:
# Instantiate an ordinary least-squares regressor
# (fit_intercept=True is scikit-learn's default, spelled out here)
model = LinearRegression(fit_intercept=True)

In [8]:
# Fit the regressor on the training rows
model.fit(X=X_train, y=y_train)

In [9]:
# Run the fitted model on the held-out shots to get predicted rebound positions
y_pred = model.predict(X=X_test)

In [10]:
# Coefficient of determination (R-squared) of the predictions on the held-out rows
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("R-squared score:", r2)

R-squared score: 0.043805186592980216


#### An R-squared score of `0.04` is poor. Ideally it would be close to 1. 😔

#### Let's experiment with the most strongly correlated features to get the best R-squared for `rebound_x` and `rebound_y`, separately. `rebound_y` appears to have the stronger correlations, so I'll start with that:

In [11]:
# Features 'X': the two inputs most correlated with rebound_y (chosen by trial and error)
X = df.loc[:, ['shot_y', 'shot_distance_feet']]

# Target 'y': rebound_y only, kept as a one-column DataFrame
y = df.loc[:, ['rebound_y']]

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41
)

# Build and fit the rebound_y regressor in one step (fit() returns the estimator)
rebound_y_model = LinearRegression().fit(X_train, y_train)

# Predict rebound_y for the held-out shots and score those predictions
predicted_rebound_y = rebound_y_model.predict(X_test)
r2 = r2_score(y_test, predicted_rebound_y)

print("R-squared score:", r2)

R-squared score: 0.088673953215754


#### `0.09` is the best R-squared I can get! It happened when I combined `shot_y` and `shot_distance_feet` - the two inputs most strongly correlated with `rebound_y`

### Let's try to find `rebound_x`:

In [13]:
# Features 'X': the same pair used for the rebound_y model (a trial-and-error choice).
# NOTE: per the correlation matrix, shot_distance_feet is the input least
# correlated with rebound_x, so this pair is not the "most correlated" one here.
X = df.loc[:, ['shot_y', 'shot_distance_feet']]

# Target 'y': rebound_x only, kept as a one-column DataFrame
y = df.loc[:, ['rebound_x']]

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41
)

# Build and fit the rebound_x regressor in one step (fit() returns the estimator)
rebound_x_model = LinearRegression().fit(X_train, y_train)

# Predict rebound_x for the held-out shots and score those predictions
predicted_rebound_x = rebound_x_model.predict(X_test)
r2 = r2_score(y_test, predicted_rebound_x)

print("R-squared score:", r2)

R-squared score: 0.0012471764223208615


### Hm, well that sucks 😕

#### The fact that none of the numeric features correlates strongly with the rebound location makes me think there's either too much variance in rebounding data, or I do not have sufficient data for my project. 

#### Nevertheless, let's build a quick function that takes a shot's (x, y) coordinate and returns the predicted rebound location. Something tells me the results will be quite boring, as it will likely just place me around the hoop.

In [15]:
def calculate_distance_in_feet(shot_x, shot_y, hoop_x=306, hoop_y=547,
                               pixels_per_foot=14.066666666666666):
    '''Return the straight-line distance, in feet, from a shot to the hoop.

    This is a helper function for the next step (predict_rebound).

    Parameters
    ----------
    shot_x, shot_y : number
        Shot coordinate, in pixels.
    hoop_x, hoop_y : number, optional
        Hoop coordinate, in pixels. Defaults to (306, 547).
    pixels_per_foot : float, optional
        Pixel-to-foot ratio used for the conversion. The default,
        14.066666666666666, equals 211 / 15.

    Returns
    -------
    float
        Euclidean pixel distance between the two points divided by
        ``pixels_per_foot``.

    Raises
    ------
    ValueError
        If ``pixels_per_foot`` is not strictly positive.
    '''
    # A zero or negative scale would divide by zero or yield a negative distance
    if pixels_per_foot <= 0:
        raise ValueError("pixels_per_foot must be a positive number")

    # Euclidean distance in pixels between the shot and the hoop
    pixel_distance = np.hypot(hoop_x - shot_x, hoop_y - shot_y)

    # Convert pixels to feet
    return pixel_distance / pixels_per_foot

In [16]:
def predict_rebound(shot_x, shot_y):
    '''Predict the rebound location for a shot taken at (shot_x, shot_y).

    Parameters
    ----------
    shot_x, shot_y : int
        Shot coordinate. shot_x is used only to compute the shot's distance
        from the hoop; the models take shot_y and that distance as inputs.

    Returns
    -------
    tuple of (int, int)
        (rebound_x, rebound_y), each prediction rounded to the nearest integer.
    '''
    shot_distance_feet = calculate_distance_in_feet(shot_x, shot_y)

    # Both models were fitted on a DataFrame with these two columns, in this
    # order. Passing a frame with the same column names (instead of a bare
    # list/array) keeps the feature order explicit and avoids scikit-learn's
    # "X does not have valid feature names" warning.
    features = pd.DataFrame(
        [[shot_y, shot_distance_feet]],
        columns=['shot_y', 'shot_distance_feet'],
    )

    # np.ravel flattens the prediction, so taking the first element works
    # whether the model was fitted on a 1-D or a single-column 2-D target.
    rebound_x = round(float(np.ravel(rebound_x_model.predict(features))[0]))
    rebound_y = round(float(np.ravel(rebound_y_model.predict(features))[0]))

    return (rebound_x, rebound_y)

In [17]:
# Example: predicted rebound location for a shot taken at pixel (82, 273)
predict_rebound(shot_x=82, shot_y=273)

(302, 489)

#### And here you have it:

![](img/the_big_prediction.png)

After multiple run-throughs, most of the rebound predictions land in the middle of the paint. That makes sense: I couldn't find a strong enough correlation between any of the inputs and outputs in the data, so the model is biased toward the mean of the rebound locations.

Before I draw any final conclusions, I should collect more data and retrain the model on it!

### Just for fun, let's see what correlation there is for the NBA data from Patreon:

In [22]:
# Read the NBA rebound-tracking dataset into its own DataFrame
nba_df = pd.read_csv(filepath_or_buffer="data/nba-tracking-rebounds.csv")

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
nba_df.describe()

Unnamed: 0,PossessionNumber,PlayerId,Time,Period,EventNumber,GameId,TeamId,ShotEventNum,StartTime,TeamId.1,OppTeamId,ReboundX,ReboundY,ShotDistance,ShotX,ShotY,ShooterPlayerId,ShotBlockPlayerId,ReboundDistance
count,7164.0,7164.0,7164.0,7164.0,7164.0,7164.0,7164.0,7164.0,7164.0,7164.0,7164.0,7163.0,7163.0,7164.0,7164.0,7164.0,7164.0,7164.0,7163.0
mean,24.90536,1042781.0,349.515773,2.468035,325.838219,24536120.0,1610613000.0,324.73995,366.759771,1610613000.0,1610613000.0,-1.893201,43.042021,16.509492,-3.322166,108.64852,1070783.0,93389.47,6.795714
std,14.72808,704252.6,209.968383,1.135622,198.648342,6765073.0,8.417577,198.638643,210.252225,8.417577,8.316493,58.223145,43.546068,10.10209,127.02806,97.564647,697738.7,365551.1,5.021261
min,1.0,1713.0,0.0,1.0,8.0,21900000.0,1610613000.0,7.0,2.0,1610613000.0,1610613000.0,-240.0,-50.0,0.0,-244.0,-39.0,1713.0,0.0,0.0
25%,12.0,202683.0,167.0,1.0,153.0,21900310.0,1610613000.0,152.0,184.0,1610613000.0,1610613000.0,-34.0,20.0,5.0,-85.25,16.0,202696.0,0.0,3.6
50%,25.0,1626181.0,348.0,2.0,321.0,21900620.0,1610613000.0,320.0,365.0,1610613000.0,1610613000.0,0.0,30.0,22.3,0.0,79.0,1627741.0,0.0,5.0
75%,37.0,1628384.0,534.0,3.0,493.0,21900960.0,1610613000.0,492.0,550.0,1610613000.0,1610613000.0,30.0,50.0,25.5,68.0,206.0,1628369.0,0.0,8.7
max,62.0,1629750.0,708.0,6.0,803.0,41900220.0,1610613000.0,802.0,720.0,1610613000.0,1610613000.0,233.0,420.0,36.9,246.0,369.0,1630567.0,1629750.0,42.5


In [25]:
# Pairwise correlations between the NBA shot features and rebound coordinates
nba_correlation_cols = ['ShotX', 'ShotY', 'ShotDistance', 'ReboundX', 'ReboundY']
nba_correlation_matrix = nba_df.loc[:, nba_correlation_cols].corr()

# Leave the matrix as the last expression so the notebook renders it
nba_correlation_matrix

Unnamed: 0,ShotX,ShotY,ShotDistance,ReboundX,ReboundY
ShotX,1.0,0.001202,-0.026774,-0.011687,-4.5e-05
ShotY,0.001202,1.0,0.732829,0.002156,0.214942
ShotDistance,-0.026774,0.732829,1.0,0.019366,0.205237
ReboundX,-0.011687,0.002156,0.019366,1.0,0.00966
ReboundY,-4.5e-05,0.214942,0.205237,0.00966,1.0


It looks like there's not much correlation here either. Maybe I'll need to bring in more advanced features, such as the spin and arc of the shot and player fatigue, to build a better prediction model.