In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [2]:
# Load the player table from CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('./dataScraping/allPlayers.csv')
# Plain-text dump of the frame (pandas abbreviates long/wide frames with "..." on its own).
print(df)

               PLAYER  GAME  MIN  REB  AST  TO  STL  BLK  PF  PTS  ...  \
0        aaron-gordon     0   28    4    5   3    0    0   4   18  ...   
1        aaron-gordon     1   39   13    7   1    1    4   2   16  ...   
2        aaron-gordon     2   25    6    2   2    0    1   6    8  ...   
3        aaron-gordon     3   31    4    2   1    1    0   1   18  ...   
4        aaron-gordon     4   30    5    2   1    1    0   2   12  ...   
...               ...   ...  ...  ...  ...  ..  ...  ...  ..  ...  ...   
5216  zion-williamson     6   32    1    3   3    1    0   2   30  ...   
5217  zion-williamson     7   35    6    5   1    0    1   1   30  ...   
5218  zion-williamson     8   30    5    7   4    1    0   2   19  ...   
5219  zion-williamson     9   19    6    1   1    1    1   3   12  ...   
5220  zion-williamson    10   29    4    6   2    1    1   3   27  ...   

      OFFENSE: Pts/Poss  OFFENSE: eFG%  OFFENSE: TOV%  OFFENSE: ORB%  \
0                 109.4          0.508 

In [3]:
# Label-encode a text column so the model receives integers instead of strings
def fix_string_issue(col, frame=None):
  """Replace the values of column ``col`` with integer codes, in place.

  Each distinct value is mapped to an integer in order of first appearance
  (first distinct value -> 0, the next new value -> 1, and so on).

  Args:
    col: Name of the column to encode.
    frame: DataFrame to modify. Defaults to the notebook-level ``df``, which
      is what the one-argument form has always operated on.

  Returns:
    Array of the distinct original values, ordered so that ``key[code]`` is
    the value that was replaced by ``code``.

  Notes:
    * The column is overwritten, so calling this a second time on the same
      column returns the integer codes as the key; the original labels are
      only available from the first call's return value.
    * Missing values (NaN/None) are not part of the key and are encoded as -1.
  """
  if frame is None:
    frame = df  # fall back to the notebook's global DataFrame

  # A single factorize pass yields both the codes and the distinct values in
  # first-seen order. Assigning the whole column (instead of patching cells
  # one value at a time through a mask) gives the column an integer dtype
  # rather than an object column that happens to hold ints.
  codes, key = pd.factorize(frame[col].to_numpy())
  frame[col] = codes

  return key

# Encode PLAYER in place and keep the lookup table: pname_key[code] is the original player name.
# Re-running this cell on the already-encoded frame would replace pname_key with the integer codes.
pname_key = fix_string_issue('PLAYER')

In [4]:
# Columns the model is trained to predict (multi-output target).
target_columns = ['MIN', 'REB', 'AST', 'TO', 'STL', 'BLK', 'PF', 'PTS']

# Targets (y): one column per predicted stat, for every row of df.
y = df[target_columns]

# Features (x): every column that is neither a target nor the GAME column.
x = df.drop(columns=['GAME'] + target_columns)

In [5]:
# Display the feature matrix (a bare last expression is rendered by the notebook).
x

Unnamed: 0,PLAYER,W,L,Diff,OFFENSE: Pts/Poss,OFFENSE: eFG%,OFFENSE: TOV%,OFFENSE: ORB%,OFFENSE: FT Rate,DEFENSE: Pts/Poss,DEFENSE: eFG%,DEFENSE: TOV%,DEFENSE: ORB%,DEFENSE: FT Rate
0,0,15.0,33.0,-9.4,109.4,0.508,0.150,0.297,19.1,118.7,0.568,0.147,0.291,21.5
1,0,34.0,15.0,8.4,120.5,0.578,0.123,0.239,21.2,112.1,0.525,0.158,0.307,21.4
2,0,32.0,16.0,3.4,120.9,0.576,0.125,0.251,23.4,117.5,0.542,0.114,0.263,19.4
3,0,30.0,17.0,6.3,120.3,0.546,0.118,0.277,25.7,114.0,0.536,0.145,0.291,22.1
4,0,32.0,17.0,6.8,119.8,0.545,0.135,0.328,21.2,113.0,0.543,0.134,0.250,17.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5216,506,26.0,22.0,-0.0,118.7,0.565,0.125,0.250,20.0,118.7,0.564,0.140,0.293,20.0
5217,506,33.0,16.0,4.1,118.9,0.562,0.130,0.292,18.0,114.7,0.536,0.125,0.267,20.5
5218,506,21.0,24.0,0.7,118.7,0.560,0.146,0.303,19.9,118.0,0.548,0.126,0.269,22.1
5219,506,32.0,15.0,8.5,122.1,0.577,0.130,0.280,21.5,113.7,0.532,0.139,0.281,19.9


In [6]:
# Display the target columns (a bare last expression is rendered by the notebook).
y

Unnamed: 0,MIN,REB,AST,TO,STL,BLK,PF,PTS
0,28,4,5,3,0,0,4,18
1,39,13,7,1,1,4,2,16
2,25,6,2,2,0,1,6,8
3,31,4,2,1,1,0,1,18
4,30,5,2,1,1,0,2,12
...,...,...,...,...,...,...,...,...
5216,32,1,3,3,1,0,2,30
5217,35,6,5,1,0,1,1,30
5218,30,5,7,4,1,0,2,19
5219,19,6,1,1,1,1,3,12


In [7]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is repeatable.
# NOTE(review): rows are split individually, so rows of the same PLAYER can land in both
# the training and the test set — confirm that is intended for this evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)
# Plain-text dump of the held-out feature rows.
print(X_test)

     PLAYER     W     L  Diff  OFFENSE: Pts/Poss  OFFENSE: eFG%  \
1592    154  28.0  19.0   0.1              117.6          0.565   
691      66   9.0  39.0  -9.8              111.9          0.544   
2451    240  25.0  25.0  -0.2              115.4          0.561   
3745    365  32.0  16.0   3.4              120.9          0.576   
1513    146  26.0  23.0   0.8              113.3          0.532   
...     ...   ...   ...   ...                ...            ...   
3217    313  26.0  23.0  -1.8              113.8          0.536   
2480    242   9.0  39.0  -9.8              111.9          0.544   
4526    439  18.0  31.0  -5.7              108.8          0.520   
2732    267  21.0  27.0  -1.9              118.6          0.541   
841      80  28.0  19.0   0.1              117.6          0.565   

      OFFENSE: TOV%  OFFENSE: ORB%  OFFENSE: FT Rate  DEFENSE: Pts/Poss  \
1592          0.134          0.264              17.9              117.5   
691           0.133          0.222           

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
# 1000-tree random forest; random_state fixes the seed so training is repeatable.
# NOTE(review): a classifier treats each distinct value of every target column as a class label.
# RandomForestRegressor is imported at the top of the notebook but never used — confirm that
# classification (rather than regression) is intended for these numeric targets.
model = RandomForestClassifier(n_estimators= 1000, random_state = 42)

In [10]:
# Fit on the training split, then predict all target columns for the held-out rows.
model.fit(X_train, Y_train)
prediction_test = model.predict(X_test)
# Display the predictions (2-D array: one row per test sample).
prediction_test

array([[24,  3,  2, ...,  0,  0, 13],
       [23,  5,  5, ...,  1,  1, 21],
       [ 8,  0,  1, ...,  0,  0,  5],
       ...,
       [18,  2,  1, ...,  0,  1,  9],
       [10,  1,  2, ...,  0,  1,  7],
       [ 7,  2,  1, ...,  0,  0,  7]], dtype=int64)

In [11]:
# Tolerance-based accuracy: a predicted value counts as a hit unless it is off
# by more than `acceptable_range`; the score is hits / total predicted cells.
acceptable_range = 3

Y_test_np = Y_test.values

# Element-wise form of visiting every (row, column) cell of the predictions.
misses = np.abs(Y_test_np - prediction_test) > acceptable_range
acc = int(np.count_nonzero(~misses)) / (len(prediction_test) * len(prediction_test[0]))
print("Accuracy = ", acc)

Accuracy =  0.7099282296650717


In [12]:
# Saving/loading model
import joblib

def model_save():
    """Write the notebook-level ``model`` to 'player-stats.joblib'.

    The file is written with joblib compression level 3 to cut its on-disk
    size; ``joblib.load`` reads compressed files transparently, so
    ``model_load`` needs no matching option.

    Returns:
        None. The only effect is the file written to the working directory.
    """
    # compress=3 trades some dump/load time for a much smaller file.
    joblib.dump(model, 'player-stats.joblib', compress=3)

def model_load():
    """Load 'player-stats.joblib' and make it the notebook-level ``model``.

    Without the ``global`` declaration the assignment below would only bind a
    local name and the loaded estimator would be thrown away when the function
    returns.

    Returns:
        The object read from 'player-stats.joblib' (also bound to the global
        ``model``).
    """
    global model
    # joblib files are pickle-based: only load files from a trusted source.
    model = joblib.load('player-stats.joblib')
    return model

In [13]:
# Persist the fitted model to disk (the resulting file was noted as 15.4 GB).
model_save()

In [14]:
# Rank the input columns by the fitted forest's feature importance, largest first.
feature_list = x.columns.tolist()
importance_by_feature = pd.Series(model.feature_importances_, index = feature_list)
feature_imp = importance_by_feature.sort_values(ascending = False)
feature_imp

PLAYER               0.961433
DEFENSE: FT Rate     0.003211
DEFENSE: ORB%        0.003177
DEFENSE: TOV%        0.003174
OFFENSE: eFG%        0.003162
OFFENSE: FT Rate     0.003069
OFFENSE: TOV%        0.003052
OFFENSE: ORB%        0.003036
DEFENSE: Pts/Poss    0.002946
OFFENSE: Pts/Poss    0.002889
Diff                 0.002852
W                    0.002698
DEFENSE: eFG%        0.002696
L                    0.002605
dtype: float64