# Model

In [1]:
# import statements

import os
import io
import pandas as pd
import numpy as np
import difflib
from datetime import datetime, timedelta
import statistics

import gspread
from df2gspread import df2gspread as d2g
from df2gspread import gspread2df as g2d
from oauth2client.service_account import ServiceAccountCredentials

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

from sklearn.linear_model import BayesianRidge

from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn import metrics
from scipy.stats import norm
from sklearn.preprocessing import OneHotEncoder

## Data Cleaning + Manipulation

In [2]:
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
json_file_name = os.getcwd() + "/sportsbetting-376321-f2ada03a7020.json"
credentials = ServiceAccountCredentials.from_json_keyfile_name(json_file_name, scope)
gc = gspread.authorize(credentials)
spreadsheet_key = '1qZfM3myJ9naCc_AT7U20Zjv5UPMEa20sSTVuG7HlNNc'

### Betting Data

In [3]:
#### Load Data

wks_name = "BettingPros"
betting_df = g2d.download(gfile=spreadsheet_key, wks_name=wks_name, col_names=True, row_names=True, credentials=credentials)
betting_df = betting_df.reset_index()
print(betting_df.shape)

(7301, 4)


In [4]:
#### Cleaning

# drop index column(s)
betting_df = betting_df.drop(columns=['index'])

# rename Name to Player
betting_df = betting_df.rename(columns={"Name": "Player"})

# convert Line to float
betting_df = betting_df.astype({'Line': 'float'})

# drop na values
rows_to_drop = []
for index,row in betting_df.iterrows():
    for col in betting_df.columns:
        if(pd.isna(betting_df[col][index])):
            rows_to_drop.append(index)
betting_df = betting_df.drop(rows_to_drop)

In [5]:
#### Summary

print(betting_df.shape)
print(betting_df.dtypes)
betting_df.head(3)

(7301, 3)
Player     object
Line      float64
Date       object
dtype: object


Unnamed: 0,Player,Line,Date
0,Jayson Tatum,22.5,2022-10-18
1,Stephen Curry,23.5,2022-10-18
2,De'Anthony Melton,7.5,2022-10-18


### Player Game Data

In [6]:
#### Load Data

wks_name = "PlayerData"
player_df = g2d.download(gfile=spreadsheet_key, wks_name=wks_name, col_names=True, row_names=True, credentials=credentials)
player_df = player_df.reset_index()

print(player_df.shape)

(16139, 38)


In [7]:
#### Cleaning

# convert Home_Away into Boolean (Home=TRUE)
player_df['Home'] = player_df['Home_Away'] != '@'

# convert Result into Win, Team_PTS, Opp_PTS
player_df['Win'] = [s.split()[0] == 'W' for s in player_df['Result']]
player_df['Team_PTS'] = [int(s.split()[1][:s.index("-")-2]) for s in player_df['Result']]
player_df['Opp_PTS'] = [int(s.split()[1][s.index("-")-1:]) for s in player_df['Result']]

# rename columns
player_df = player_df.rename(columns={"PTS": "Player_PTS", 
                                      "MP": "Player_MP",
                                      "FG": "Player_FGM",
                                      "FGA": "Player_FGA",
                                      "FG%": "Player_FG%",
                                      "2P": "Player_2PM",
                                      "2PA": "Player_2PA",
                                      "2P%": "Player_2P%",
                                      "3P": "Player_3PM",
                                      "3PA": "Player_3PA",
                                      "3P%": "Player_3P%",
                                      "FT": "Player_FTM",
                                      "FTA": "Player_FTA",
                                      "FT%": "Player_FT%",
                                      "TS%": "Player_TS%",
                                      "ORB": "Player_ORB",
                                      "DRB": "Player_DRB",
                                      "TRB": "Player_TRB",
                                      "AST": "Player_AST",
                                      "STL": "Player_STL",
                                      "BLK": "Player_BLK",
                                      "TOV": "Player_TOV",
                                      "PF": "Player_PF",
                                      "GmSc": "Player_GmSc",
                                      "BPM": "Player_BPM"})

# change numerical column types
numerical_columns = ['Player_PTS', 'Player_MP',
       'Player_FGM', 'Player_FGA', 'Player_FG%', 'Player_2PM', 'Player_2PA',
       'Player_2P%', 'Player_3PM', 'Player_3PA', 'Player_3P%', 'Player_FTM',
       'Player_FTA', 'Player_FT%', 'Player_TS%', 'Player_ORB', 'Player_DRB',
       'Player_TRB', 'Player_AST', 'Player_STL', 'Player_BLK', 'Player_TOV',
       'Player_PF', 'Player_GmSc', 'Player_BPM', 'Team_PTS',
       'Opp_PTS']
for col in numerical_columns:
    player_df = player_df.astype({col: 'float'})
    
# create primary position column
player_df["Player_Pos"] = [s[0] for s in player_df['Pos.']]

# drop columns
columns_to_drop = ['index', 'Rk', 'Player-additional', 'Home_Away', 'Result', 'Age', 'GS','PTS.1',
                  'Pos.', 'Win', 'Team_PTS', 'Opp_PTS']
player_df = player_df.drop(columns=columns_to_drop)

In [8]:
#### Manipulation - keep track of average stats coming into game

stats = ['Player_PTS', 'Player_MP', 'Player_FGM', 'Player_FGA', 'Player_FG%', 'Player_2PM', 'Player_2PA',
       'Player_2P%', 'Player_3PM', 'Player_3PA', 'Player_3P%', 'Player_FTM',
       'Player_FTA', 'Player_FT%', 'Player_TS%', 'Player_ORB', 'Player_DRB',
       'Player_TRB', 'Player_AST', 'Player_STL', 'Player_BLK', 'Player_TOV',
       'Player_PF', 'Player_GmSc', 'Player_BPM']

players = player_df['Player'].unique()

# sort rows from beginning to end of season
player_df = player_df.sort_values(by=['Date'], ignore_index=True)

# sabe actual points scored in game
actual = [val for val in player_df['Player_PTS']]

# data structure to hold all data
player_dict = {player:{stat:[] for stat in stats} for player in players}

# keep column of player PPG standard deviation
player_ppg_sd = []

for index,row in player_df.iterrows():
    player = row['Player']
    ppg_sd = statistics.stdev(player_dict[player]['Player_PTS']) if len(player_dict[player]['Player_PTS']) > 1 else 0
    player_ppg_sd.append(ppg_sd) 
    for stat in stats:
        temp = row[stat]
        player_df.at[index,stat] = statistics.mean(player_dict[player][stat]) if len(player_dict[player][stat]) > 0 else 0
        if not pd.isna(temp): player_dict[player][stat].append(temp)

# add column for player PPG standard deviation
player_df['Player_PTSSTDEV'] = player_ppg_sd

# add back actual point totals for each game
player_df['Actual_PTS'] = actual

# rename Player_PTS to Player_PPG
player_df = player_df.rename(columns={"Player_PTS": "Player_PPG"})

In [9]:
#### Summary

print(player_df.shape)
print(player_df.dtypes)
player_df[-5:]

(16139, 33)
Player              object
Player_PPG         float64
Date                object
Team                object
Opp                 object
Player_MP          float64
Player_FGM         float64
Player_FGA         float64
Player_FG%         float64
Player_2PM         float64
Player_2PA         float64
Player_2P%         float64
Player_3PM         float64
Player_3PA         float64
Player_3P%         float64
Player_FTM         float64
Player_FTA         float64
Player_FT%         float64
Player_TS%         float64
Player_ORB         float64
Player_DRB         float64
Player_TRB         float64
Player_AST         float64
Player_STL         float64
Player_BLK         float64
Player_TOV         float64
Player_PF          float64
Player_GmSc        float64
Player_BPM         float64
Home                  bool
Player_Pos          object
Player_PTSSTDEV    float64
Actual_PTS         float64
dtype: object


Unnamed: 0,Player,Player_PPG,Date,Team,Opp,Player_MP,Player_FGM,Player_FGA,Player_FG%,Player_2PM,...,Player_STL,Player_BLK,Player_TOV,Player_PF,Player_GmSc,Player_BPM,Home,Player_Pos,Player_PTSSTDEV,Actual_PTS
16134,James Harden,21.40625,2023-01-30,PHI,ORL,36.78125,6.5625,14.59375,0.456781,3.75,...,1.25,0.5625,3.5,2.1875,20.653125,5.678125,True,G,6.598433,17.0
16135,Franz Wagner,19.918367,2023-01-30,ORL,PHI,33.061224,7.22449,14.673469,0.496939,5.530612,...,0.877551,0.204082,2.040816,2.163265,14.367347,0.253061,False,G,5.90493,19.0
16136,Matisse Thybulle,2.522727,2023-01-30,PHI,ORL,12.113636,1.0,2.340909,0.405719,0.590909,...,0.886364,0.318182,0.159091,1.363636,2.493182,-1.586364,True,G,3.267092,10.0
16137,Bradley Beal,22.034483,2023-01-30,WAS,SAS,33.068966,8.275862,16.206897,0.527069,6.793103,...,0.862069,0.586207,2.689655,1.793103,16.517241,2.244828,False,G,7.277572,21.0
16138,Jock Landale,6.904762,2023-01-30,PHO,TOR,14.595238,2.571429,4.952381,0.517775,2.166667,...,0.238095,0.52381,0.952381,1.904762,5.778571,-0.847619,True,C,4.64763,4.0


### Team Game Data

In [10]:
#### Load Data

wks_name = "TeamData"
team_df = g2d.download(gfile=spreadsheet_key, wks_name=wks_name, col_names=True, row_names=True, credentials=credentials)
team_df = team_df.reset_index()
print(team_df.shape)

(1526, 35)


In [11]:
#### Cleaning

# convert Home_Away into Boolean (Home=TRUE)
team_df['Home'] = team_df['Home_Away'] != '@'

# rename columns
team_df = team_df.rename(columns={"MP": "Team_MP",
                                     "FG": "Team_FGM",
                                     "FGA": "Team_FGA",
                                     "FG%": "Team_FG%",
                                     "2P": "Team_2PM",
                                     "2PA": "Team_2PA",
                                     "2P%": "Team_2P%",
                                     "3P": "Team_3PM",
                                     "3PA": "Team_3PA",
                                     "3P%": "Team_3P%",
                                     "FT": "Team_FTM",
                                     "FTA": "Team_FTA",
                                     "FT%": "Team_FT%",
                                     "PTS.1": "Team_PTS",
                                     "MP.1": "Opp_MP",
                                     "FG.1": "Opp_FGM",
                                     "FGA.1": "Opp_FGA",
                                     "FG%.1": "Opp_FG%",
                                     "2P.1": "Opp_2PM",
                                     "2PA.1": "Opp_2PA",
                                     "2P%.1": "Opp_2P%",
                                     "3P.1": "Opp_3PM",
                                     "3PA.1": "Opp_3PA",
                                     "3P%.1": "Opp_3P%",
                                     "FT.1": "Opp_FTM",
                                     "FTA.1": "Opp_FTA",
                                     "FT%.1": "Opp_FT%",
                                     "PTS.2": "Opp_PTS"})

# change numerical column types
numerical_columns = ['Team_MP', 'Team_FGM', 'Team_FGA', 'Team_FG%',
       'Team_2PM', 'Team_2PA', 'Team_2P%', 'Team_3PM', 'Team_3PA', 'Team_3P%',
       'Team_FTM', 'Team_FTA', 'Team_FT%', 'Team_PTS', 'Opp_FGM', 'Opp_FGA',
       'Opp_FG%', 'Opp_2PM', 'Opp_2PA', 'Opp_2P%', 'Opp_3PM', 'Opp_3PA',
       'Opp_3P%', 'Opp_FTM', 'Opp_FTA', 'Opp_FT%', 'Opp_PTS', ]
for col in numerical_columns:
    team_df = team_df.astype({col: 'float'})

# drop columns
columns_to_drop = ['index', 'Rk', 'Home_Away', 'PTS', 'Result']
team_df = team_df.drop(columns=columns_to_drop)

# add inverse rows
for index, row in team_df.iterrows():
    inverse_row = {'Team': row['Opp'],
                  'Date': row['Date'],
                  'Opp': row['Team'],
                  'Team_MP': row['Team_MP'],
                  'Team_FGM': row['Opp_FGM'],
                  'Team_FGA': row['Opp_FGA'],
                  'Team_FG%': row['Opp_FG%'],
                  'Team_2PM': row['Opp_2PM'],
                  'Team_2PA': row['Opp_2PA'],
                  'Team_2P%': row['Opp_2P%'],
                  'Team_3PM': row['Opp_3PM'],
                  'Team_3PA': row['Opp_3PA'],
                  'Team_3P%': row['Opp_3P%'],
                  'Team_FTM': row['Opp_FTM'],
                  'Team_FTA': row['Opp_FTA'],
                  'Team_FT%': row['Opp_FT%'],
                  'Team_PTS': row['Opp_PTS'],
                  'Opp_FGM': row['Team_FGM'],
                  'Opp_FGA': row['Team_FGA'],
                  'Opp_FG%': row['Team_FG%'],
                  'Opp_2PM': row['Team_2PM'],
                  'Opp_2PA': row['Team_2PA'],
                  'Opp_2P%': row['Team_2P%'],
                  'Opp_3PM': row['Team_3PM'],
                  'Opp_3PA': row['Team_3PA'],
                  'Opp_3P%': row['Team_3P%'],
                  'Opp_FTM': row['Team_FTM'],
                  'Opp_FTA': row['Team_FTA'],
                  'Opp_FT%': row['Team_FT%'],
                  'Opp_PTS': row['Team_PTS'],
                  'Home': not row['Home']}
    team_df = team_df.append(inverse_row, ignore_index=True)
    
# add boolean as float for team winning game
team_df['Win'] = (team_df['Team_PTS'] > team_df['Opp_PTS']).astype(float)

# drop repeated rows
team_df = team_df.drop_duplicates()

In [12]:
#### Manipulation - keep track of average stats coming into game

stats = ['Team_MP', 'Team_FGM', 'Team_FGA', 'Team_FG%',
       'Team_2PM', 'Team_2PA', 'Team_2P%', 'Team_3PM', 'Team_3PA', 'Team_3P%',
       'Team_FTM', 'Team_FTA', 'Team_FT%', 'Team_PTS', 'Opp_FGM', 'Opp_FGA',
       'Opp_FG%', 'Opp_2PM', 'Opp_2PA', 'Opp_2P%', 'Opp_3PM', 'Opp_3PA',
       'Opp_3P%', 'Opp_FTM', 'Opp_FTA', 'Opp_FT%', 'Opp_PTS', 'Win']

teams = team_df['Team'].unique()

# sort rows from beginning to end of season
team_df = team_df.sort_values(by=['Date'], ignore_index=True)

# data structure to hold all data
team_dict = {team:{stat:[] for stat in stats} for team in teams}

for index,row in team_df.iterrows():
    team = row['Team']
    for stat in stats:
        temp = row[stat]
        team_df.at[index,stat] = statistics.mean(team_dict[team][stat]) if len(team_dict[team][stat]) > 0 else 0
        if not pd.isna(temp): team_dict[team][stat].append(temp)  

In [13]:
#### Summary

print(team_df.shape)
print(team_df.dtypes)
team_df[-5:]

(1526, 32)
Team         object
Date         object
Opp          object
Team_MP     float64
Team_FGM    float64
Team_FGA    float64
Team_FG%    float64
Team_2PM    float64
Team_2PA    float64
Team_2P%    float64
Team_3PM    float64
Team_3PA    float64
Team_3P%    float64
Team_FTM    float64
Team_FTA    float64
Team_FT%    float64
Team_PTS    float64
Opp_FGM     float64
Opp_FGA     float64
Opp_FG%     float64
Opp_2PM     float64
Opp_2PA     float64
Opp_2P%     float64
Opp_3PM     float64
Opp_3PA     float64
Opp_3P%     float64
Opp_FTM     float64
Opp_FTA     float64
Opp_FT%     float64
Opp_PTS     float64
Home           bool
Win         float64
dtype: object


Unnamed: 0,Team,Date,Opp,Team_MP,Team_FGM,Team_FGA,Team_FG%,Team_2PM,Team_2PA,Team_2P%,...,Opp_2P%,Opp_3PM,Opp_3PA,Opp_3P%,Opp_FTM,Opp_FTA,Opp_FT%,Opp_PTS,Home,Win
1521,PHO,2023-01-30,TOR,241.960784,41.588235,89.921569,0.462549,28.705882,56.686275,0.506784,...,0.529922,11.254902,31.117647,0.358373,19.803922,25.176471,0.788137,111.568627,True,0.509804
1522,DAL,2023-01-30,DET,243.431373,39.078431,83.058824,0.47151,24.352941,42.431373,0.571373,...,0.553314,11.196078,31.333333,0.357765,19.607843,25.098039,0.780882,112.098039,True,0.509804
1523,MIN,2023-01-30,SAC,240.480769,42.5,86.269231,0.494808,30.769231,53.403846,0.580481,...,0.533019,13.096154,35.557692,0.371519,19.442308,25.884615,0.754615,114.961538,True,0.519231
1524,ORL,2023-01-30,PHI,241.0,40.06,84.6,0.47514,29.1,53.52,0.54616,...,0.57106,13.16,37.84,0.34468,18.2,23.3,0.78244,114.64,False,0.38
1525,SAC,2023-01-30,MIN,240.520833,43.083333,87.270833,0.494146,29.5,50.541667,0.583417,...,0.560937,11.520833,31.854167,0.363937,18.145833,22.729167,0.793813,116.625,False,0.5625


### Combine Data

In [14]:
# drop columns from team DataFrame
columns_to_drop = ['Opp_FGM', 'Opp_FGA', 'Opp_FG%', 'Opp_2PM', 'Opp_2PA', 'Opp_2P%', 'Opp_3PM', 'Opp_3PA', 
                   'Opp_3P%', 'Opp_FTM', 'Opp_FTA', 'Opp_FT%', 'Opp_PTS']
team_df = team_df.drop(columns=columns_to_drop)

# combine player and team data into DataFrame
on = ['Date', 'Team', 'Opp', 'Home']
df = pd.merge(player_df,team_df, on=on)

# add opponent average stats to DataFrame (x is the Player's Team, y is that team's Opp)
df = pd.merge(df, team_df, 
         left_on = ['Team', 'Date'], right_on = ['Opp', 'Date'], 
         how = 'left')

# fix Team/Home stuff
df = df.rename(columns={"Team_x": "Team", "Opp_x": "Opp", 'Home_x': 'Home', 
                        'Win_x': 'Win%_x', 'Win_y': 'Win%_y'})
df = df.drop(columns=['Team_y', 'Opp_y', 'Home_y'])

# add line data to DataFrame
on = ['Player', 'Date']
df = pd.merge(df,betting_df, on=on)

print(df.columns)
df[-5:]

Index(['Player', 'Player_PPG', 'Date', 'Team', 'Opp', 'Player_MP',
       'Player_FGM', 'Player_FGA', 'Player_FG%', 'Player_2PM', 'Player_2PA',
       'Player_2P%', 'Player_3PM', 'Player_3PA', 'Player_3P%', 'Player_FTM',
       'Player_FTA', 'Player_FT%', 'Player_TS%', 'Player_ORB', 'Player_DRB',
       'Player_TRB', 'Player_AST', 'Player_STL', 'Player_BLK', 'Player_TOV',
       'Player_PF', 'Player_GmSc', 'Player_BPM', 'Home', 'Player_Pos',
       'Player_PTSSTDEV', 'Actual_PTS', 'Team_MP_x', 'Team_FGM_x',
       'Team_FGA_x', 'Team_FG%_x', 'Team_2PM_x', 'Team_2PA_x', 'Team_2P%_x',
       'Team_3PM_x', 'Team_3PA_x', 'Team_3P%_x', 'Team_FTM_x', 'Team_FTA_x',
       'Team_FT%_x', 'Team_PTS_x', 'Win%_x', 'Team_MP_y', 'Team_FGM_y',
       'Team_FGA_y', 'Team_FG%_y', 'Team_2PM_y', 'Team_2PA_y', 'Team_2P%_y',
       'Team_3PM_y', 'Team_3PA_y', 'Team_3P%_y', 'Team_FTM_y', 'Team_FTA_y',
       'Team_FT%_y', 'Team_PTS_y', 'Win%_y', 'Line'],
      dtype='object')


Unnamed: 0,Player,Player_PPG,Date,Team,Opp,Player_MP,Player_FGM,Player_FGA,Player_FG%,Player_2PM,...,Team_2P%_y,Team_3PM_y,Team_3PA_y,Team_3P%_y,Team_FTM_y,Team_FTA_y,Team_FT%_y,Team_PTS_y,Win%_y,Line
6640,Cole Anthony,12.21875,2023-01-30,ORL,PHI,25.9375,4.25,9.6875,0.422187,3.09375,...,0.547187,12.875,33.458333,0.38125,20.0,24.208333,0.824771,114.791667,0.666667,10.5
6641,Gary Harris,9.090909,2023-01-30,ORL,PHI,23.772727,3.272727,6.409091,0.493682,1.545455,...,0.547187,12.875,33.458333,0.38125,20.0,24.208333,0.824771,114.791667,0.666667,7.5
6642,Wendell Carter Jr.,15.258065,2023-01-30,ORL,PHI,30.258065,5.612903,10.677419,0.52529,4.387097,...,0.547187,12.875,33.458333,0.38125,20.0,24.208333,0.824771,114.791667,0.666667,13.5
6643,Markelle Fultz,12.068966,2023-01-30,ORL,PHI,27.896552,5.034483,10.068966,0.490414,4.586207,...,0.547187,12.875,33.458333,0.38125,20.0,24.208333,0.824771,114.791667,0.666667,11.5
6644,Franz Wagner,19.918367,2023-01-30,ORL,PHI,33.061224,7.22449,14.673469,0.496939,5.530612,...,0.547187,12.875,33.458333,0.38125,20.0,24.208333,0.824771,114.791667,0.666667,17.5


In [15]:
# write combined data to Google Sheets

if (False):
    wks_name = 'CombinedData'
    d2g.upload(df, spreadsheet_key, wks_name, credentials=credentials, row_names=True)
    print(df.shape)

## Predictions

### Functions

In [16]:
# returns only selected features of X

def select_features(X, y):
    
    feature_selection_pipeline = Pipeline([
        ('scaler',StandardScaler()),
        ('model',Lasso())
    ])
    
    search = GridSearchCV(feature_selection_pipeline,
                          {'model__alpha':np.arange(0.1,10,0.1)},
                          cv = 5, scoring="neg_mean_squared_error",verbose=0
                         )
    search.fit(X, y)

    coefficients = search.best_estimator_.named_steps['model'].coef_
    importance = np.abs(coefficients)
    selected_features = np.array(X.columns)[importance > 0]
    print("Selected:", selected_features)

    unselected_features = np.array(X.columns)[importance == 0]
    print("Unselected:", unselected_features)
    print()

    return X[selected_features]

In [17]:
# return probability given line, estimate, and uncertainty

def regression_probability(line, estimate, uncertainty):
    
    z_score = abs(line-estimate) / uncertainty
    return (1-norm.sf(z_score))*100

### Data Preparation

In [18]:
#### Prepare Training/Testing Data

# remove data before a given date
df = df[df['Date'] >= '2022-11-18']

# drop na values

# shuffle DataFrame
df = df.sample(frac=1).reset_index(drop=True)

# X y split(s)
input_cols = ['Player_PPG', 'Player_MP',
       'Player_FGM', 'Player_FGA', 'Player_FG%', 'Player_2PM', 'Player_2PA',
       'Player_2P%', 'Player_3PM', 'Player_3PA', 'Player_3P%', 'Player_FTM',
       'Player_FTA', 'Player_FT%', 'Player_TS%', 'Player_ORB', 'Player_DRB',
       'Player_TRB', 'Player_AST', 'Player_STL', 'Player_BLK', 'Player_TOV',
       'Player_PF', 'Player_GmSc', 'Player_BPM', 'Home', 'Player_Pos',
       'Player_PTSSTDEV', 'Team_MP_x', 'Team_FGM_x', 'Team_FGA_x',
       'Team_FG%_x', 'Team_2PM_x', 'Team_2PA_x', 'Team_2P%_x', 'Team_3PM_x',
       'Team_3PA_x', 'Team_3P%_x', 'Team_FTM_x', 'Team_FTA_x', 'Team_FT%_x',
       'Team_PTS_x', 'Win%_x', 'Team_MP_y', 'Team_FGM_y', 'Team_FGA_y',
       'Team_FG%_y', 'Team_2PM_y', 'Team_2PA_y', 'Team_2P%_y', 'Team_3PM_y',
       'Team_3PA_y', 'Team_3P%_y', 'Team_FTM_y', 'Team_FTA_y', 'Team_FT%_y',
       'Team_PTS_y', 'Win%_y']

X = df[input_cols]
players = df['Player']
dates = df['Date']
lines = df['Line']
pts = df['Actual_PTS']
bets = pts > lines # TRUE=OVER

# OneHotEncoder: Home 
enc = OneHotEncoder(handle_unknown='ignore')
enc_df = pd.DataFrame(enc.fit_transform(df[['Home']]).toarray())
X = X.join(enc_df)
X = X.drop(columns=['Home'])
X = X.rename(columns={0: "Away", 1: "Home"})

# OneHotEncoder: Player_Pos
enc = OneHotEncoder(handle_unknown='ignore')
enc_df = pd.DataFrame(enc.fit_transform(df[['Player_Pos']]).toarray())
X = X.join(enc_df)
X = X.drop(columns=['Player_Pos'])
X = X.rename(columns={0: "C", 1: "F", 2: "G"})

# Polynomial Features

# Scale Data 
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

X

Unnamed: 0,Player_PPG,Player_MP,Player_FGM,Player_FGA,Player_FG%,Player_2PM,Player_2PA,Player_2P%,Player_3PM,Player_3PA,...,Team_FTM_y,Team_FTA_y,Team_FT%_y,Team_PTS_y,Win%_y,Away,Home,C,F,G
0,-0.887231,1.181771,-1.000585,-0.626412,-1.049707,-1.445312,-1.141053,-2.562669,0.659165,0.443237,...,1.031044,0.994795,0.124415,-0.635290,-0.276571,1.020356,-1.020356,-0.372362,1.340818,-1.042515
1,0.960366,0.553463,0.952121,0.801345,0.240964,1.146916,1.102553,0.131322,-0.165715,-0.061599,...,-0.803243,0.308358,-2.091018,-0.271859,1.795247,-0.980050,0.980050,-0.372362,-0.745813,0.959219
2,0.506749,0.753963,0.410882,0.863280,-0.825987,0.495778,0.872138,-0.847383,-0.073199,0.380298,...,0.316368,-0.130804,1.150586,-0.215562,1.151648,-0.980050,0.980050,-0.372362,-0.745813,0.959219
3,-0.813748,1.154199,-0.926297,-0.606615,-0.836309,-1.384381,-1.117507,-1.936480,0.703938,0.446939,...,0.211577,0.224529,0.032969,-0.841808,0.175895,-0.980050,0.980050,-0.372362,1.340818,-1.042515
4,-0.216787,0.588532,0.073403,0.215932,-0.306310,0.276426,0.501679,-0.657278,-0.392689,-0.306109,...,-0.765357,-1.467445,1.332388,0.398799,0.755052,1.020356,-1.020356,-0.372362,-0.745813,0.959219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4561,-0.886671,-0.749325,-0.878541,-0.770976,-0.360663,-0.788285,-0.816117,0.894316,-0.392689,-0.286950,...,0.128315,-0.297685,0.898757,-1.582540,0.340428,-0.980050,0.980050,-0.372362,1.340818,-1.042515
4562,1.015531,1.173588,0.991175,1.185766,-0.449029,0.090784,0.122587,-0.205648,2.056733,2.044113,...,0.296415,0.217740,-0.067002,1.332408,-0.005091,-0.980050,0.980050,-0.372362,-0.745813,0.959219
4563,1.515845,1.349514,1.669740,1.759471,-0.405020,1.423039,1.133500,0.544989,0.898222,1.686470,...,0.523484,0.508675,0.128059,-0.254282,0.218480,1.020356,-1.020356,-0.372362,-0.745813,0.959219
4564,-1.393941,-1.990744,-1.324177,-1.694465,3.177251,-0.725104,-1.071424,2.413019,-1.527556,-1.652743,...,0.006401,-0.241447,0.338054,0.783759,0.287271,1.020356,-1.020356,-0.372362,1.340818,-1.042515


In [19]:
# save X DataFrame
X_saved = X

In [20]:
# train test split
p = 0.8 # proportion of data used for training (rest used for testing)
n = len(X) # sample size

### Regression

In [21]:
X = X_saved

# Feature Selection
X = select_features(X, pts)

# sample size
n = len(X)

# model selection + training
regression_model = BayesianRidge(n_iter=1000)
regression_model.fit(X[:int(p*n)], pts[:int(p*n)])

# predictions
model_output = regression_model.predict(X[int(p*n):], return_std=True)
estimates = model_output[0]
predictions = estimates > lines[int(p*n):]
uncertainties = model_output[1]
probabilities = regression_probability(lines[int(p*n):], estimates, uncertainties)

regression_df = pd.DataFrame({'Player': players[int(p*n):], 
                             'Points': pts[int(p*n):], 
                             'Estimates': estimates,
                              'Line': lines[int(p*n):],
                              'Actual': bets[int(p*n):],
                            'RegressionPrediction': predictions,
                             'RegressionUncertainty': uncertainties,
                             'RegressionProbability': probabilities})

# results
accuracy = statistics.mean(regression_df['Actual'] == regression_df['RegressionPrediction'])
mae = metrics.mean_absolute_error(pts[int(p*n):], estimates)

print("Accuracy: " + str(round(accuracy*100,2)) + "%")
print("MAE:", round(mae,2))
print("Average Uncertainty:", round(statistics.mean(uncertainties),2))
regression_df

Selected: ['Player_PPG' 'Player_FGA' 'Player_2PM' 'Player_3P%' 'Player_FTM'
 'Player_FTA' 'Player_DRB' 'Player_STL' 'Player_PF' 'Player_PTSSTDEV'
 'Team_MP_x' 'Team_2PA_x' 'Team_2P%_x' 'Team_FTA_x' 'Team_3PA_y'
 'Team_3P%_y' 'Team_FTM_y' 'Win%_y' 'Away' 'Home']
Unselected: ['Player_MP' 'Player_FGM' 'Player_FG%' 'Player_2PA' 'Player_2P%'
 'Player_3PM' 'Player_3PA' 'Player_FT%' 'Player_TS%' 'Player_ORB'
 'Player_TRB' 'Player_AST' 'Player_BLK' 'Player_TOV' 'Player_GmSc'
 'Player_BPM' 'Team_FGM_x' 'Team_FGA_x' 'Team_FG%_x' 'Team_2PM_x'
 'Team_3PM_x' 'Team_3PA_x' 'Team_3P%_x' 'Team_FTM_x' 'Team_FT%_x'
 'Team_PTS_x' 'Win%_x' 'Team_MP_y' 'Team_FGM_y' 'Team_FGA_y' 'Team_FG%_y'
 'Team_2PM_y' 'Team_2PA_y' 'Team_2P%_y' 'Team_3PM_y' 'Team_FTA_y'
 'Team_FT%_y' 'Team_PTS_y' 'C' 'F' 'G']

Accuracy: 51.31%
MAE: 5.29
Average Uncertainty: 6.77


Unnamed: 0,Player,Points,Estimates,Line,Actual,RegressionPrediction,RegressionUncertainty,RegressionProbability
3652,Anthony Edwards,27.0,23.019411,24.5,True,False,6.769473,58.656418
3653,CJ McCollum,23.0,22.185972,25.5,False,False,6.767158,68.783477
3654,Corey Kispert,8.0,9.598315,11.5,False,False,6.769839,61.060854
3655,Jamal Murray,6.0,20.832418,20.5,False,True,6.765857,51.959280
3656,Joel Embiid,22.0,34.148670,31.5,False,True,6.783971,65.189091
...,...,...,...,...,...,...,...,...
4561,Santi Aldama,13.0,9.224255,6.5,True,True,6.758489,65.655766
4562,Anfernee Simons,16.0,22.209084,21.5,False,True,6.767083,54.172650
4563,Kyrie Irving,29.0,26.527205,23.5,True,True,6.771501,67.258082
4564,Drew Eubanks,4.0,5.699249,5.5,False,True,6.783745,51.171589


### Classification

In [22]:
X = X_saved

# Feature Selection ??

# add line as input col ??
X['Line'] = lines

# model selection + training
# classification_model = GaussianProcessClassifier()
classification_model = MLPClassifier(max_iter=1500)
classification_model.fit(X[:int(p*n)], bets[:int(p*n)])

# predictions
predictions = classification_model.predict(X[int(p*n):])
probabilities = [round(max(vals)*100, 2) for vals in classification_model.predict_proba(X[int(p*n):])]

classification_df = pd.DataFrame({'Player': players[int(p*n):], 
                             'Points': pts[int(p*n):],
                              'Line': lines[int(p*n):],
                              'Actual': bets[int(p*n):],
                            'ClassificationPrediction': predictions,
                             'ClassificationProbability': probabilities})

# results
accuracy = statistics.mean(classification_df['Actual'] == classification_df['ClassificationPrediction'])

print("Accuracy: " + str(round(accuracy*100,2)) + "%")
print(X.columns)
classification_df

Accuracy: 52.95%
Index(['Player_PPG', 'Player_MP', 'Player_FGM', 'Player_FGA', 'Player_FG%',
       'Player_2PM', 'Player_2PA', 'Player_2P%', 'Player_3PM', 'Player_3PA',
       'Player_3P%', 'Player_FTM', 'Player_FTA', 'Player_FT%', 'Player_TS%',
       'Player_ORB', 'Player_DRB', 'Player_TRB', 'Player_AST', 'Player_STL',
       'Player_BLK', 'Player_TOV', 'Player_PF', 'Player_GmSc', 'Player_BPM',
       'Player_PTSSTDEV', 'Team_MP_x', 'Team_FGM_x', 'Team_FGA_x',
       'Team_FG%_x', 'Team_2PM_x', 'Team_2PA_x', 'Team_2P%_x', 'Team_3PM_x',
       'Team_3PA_x', 'Team_3P%_x', 'Team_FTM_x', 'Team_FTA_x', 'Team_FT%_x',
       'Team_PTS_x', 'Win%_x', 'Team_MP_y', 'Team_FGM_y', 'Team_FGA_y',
       'Team_FG%_y', 'Team_2PM_y', 'Team_2PA_y', 'Team_2P%_y', 'Team_3PM_y',
       'Team_3PA_y', 'Team_3P%_y', 'Team_FTM_y', 'Team_FTA_y', 'Team_FT%_y',
       'Team_PTS_y', 'Win%_y', 'Away', 'Home', 'C', 'F', 'G', 'Line'],
      dtype='object')


Unnamed: 0,Player,Points,Line,Actual,ClassificationPrediction,ClassificationProbability
3652,Anthony Edwards,27.0,24.5,True,True,56.09
3653,CJ McCollum,23.0,25.5,False,False,74.07
3654,Corey Kispert,8.0,11.5,False,False,99.80
3655,Jamal Murray,6.0,20.5,False,True,50.93
3656,Joel Embiid,22.0,31.5,False,True,68.19
...,...,...,...,...,...,...
4561,Santi Aldama,13.0,6.5,True,True,99.31
4562,Anfernee Simons,16.0,21.5,False,False,75.04
4563,Kyrie Irving,29.0,23.5,True,True,77.06
4564,Drew Eubanks,4.0,5.5,False,True,99.90


### Testing

In [23]:
# combine Regression and Classification DataFrames
on = ['Player', 'Points', 'Line', 'Actual']
prediction_df = pd.merge(regression_df,classification_df, on=on)
prediction_df

# only select bets where predictions match
prediction_df = prediction_df[prediction_df['RegressionPrediction'] == prediction_df['ClassificationPrediction']]

# minimum probability thresholds for bets
prediction_df = prediction_df[prediction_df['RegressionProbability'] > 60]
prediction_df = prediction_df[prediction_df['ClassificationProbability'] > 98]

# results
correct_bets = sum(prediction_df['Actual'] == prediction_df['ClassificationPrediction'])
total_bets = len(prediction_df['Actual'])
incorrect_bets = total_bets - correct_bets
accuracy = correct_bets/total_bets

# calculate expected profit if every bet is -110 odds and $100 is placed on every bet
EV = (100*100/110)*correct_bets - 100*incorrect_bets

# results
print("Hits:", correct_bets)
print("Total Bets Made:", total_bets)
print("Total Bets Possible:", n-int(p*n))
print("Accuracy: " + str(round(accuracy*100,2)) + "%")
print("Expected Profit: $" + str(round(EV,2)))

prediction_df

Hits: 22
Total Bets Made: 38
Total Bets Possible: 914
Accuracy: 57.89%
Expected Profit: $400.0


Unnamed: 0,Player,Points,Estimates,Line,Actual,RegressionPrediction,RegressionUncertainty,RegressionProbability,ClassificationPrediction,ClassificationProbability
2,Corey Kispert,8.0,9.598315,11.5,False,False,6.769839,61.060854,False,99.8
13,Grant Williams,2.0,8.85812,6.5,False,True,6.765488,63.628693,True,99.8
25,Zion Williamson,33.0,22.845705,25.5,True,False,6.770526,65.248444,False,98.37
35,Jeff Green,10.0,9.094896,6.5,True,True,6.766637,64.931945,True,99.09
46,Jalen Duren,9.0,9.884861,12.5,False,False,6.774219,65.026755,False,99.43
81,Isaiah Joe,21.0,7.060564,9.5,True,False,6.766112,64.077732,False,99.96
136,Max Strus,8.0,14.763375,16.5,False,False,6.766311,60.127841,False,98.72
197,Danuel House Jr.,0.0,6.210848,3.5,False,True,6.766262,65.56581,True,99.51
211,Trey Murphy III,11.0,10.964202,13.5,False,False,6.768339,64.604177,False,99.42
257,Onyeka Okongwu,7.0,8.951453,13.5,False,False,6.771763,74.911045,False,99.06
