In [20]:
%pip install pandas scikit-learn tensorflow keras bokeh numpy scipy


Note: you may need to restart the kernel to use updated packages.


In [21]:
import pandas
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras.losses import MeanSquaredError
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Conv2D, Flatten, Dense, BatchNormalization, MaxPooling2D, Dropout
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
import bokeh
import scipy
import numpy
from bokeh.plotting import output_notebook

In [22]:
#Creates path and filters player data so that at least 2000 minutes was played in that season
path = "AllStatsfrom2000.csv"
df = pandas.read_csv(path)
df["Total_Minutes"] = df["GP"] * df["MIN"]
df = df[df["Total_Minutes"]>=2000]
df["MIN"]

2        31.5
3        28.5
9        36.6
10       41.9
12       29.2
         ... 
11622    27.8
11636    32.2
11637    37.5
11642    29.7
11658    31.5
Name: MIN, Length: 2588, dtype: float64

In [23]:
#Gets list of every column name in dataset
columns = df.columns.to_list()

In [24]:
#Extra column to remove
drop_columns = ["PLAYER_ID"]

In [25]:
#Gets rid of every column that ends with RANK or FANTASY
#Dropped because they are unnecessary columns
df = df[[c for c in df.columns if not (c.endswith('RANK') or "FANTASY" in c)]]
df = df.drop(drop_columns,axis=1)

In [26]:
#Gets columns for data on person. Not required for model
index_cols = ["PLAYER_NAME","SEASON", "W_PCT","NICKNAME","TEAM_ID","TEAM_ABBREVIATION","W","L","Total_Minutes"]

In [27]:
#Creates the ID as the index, and creates the personal data in its own dataset
player_columns = df[index_cols]
player_columns["ID"] = df.index
player_columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_columns["ID"] = df.index


Unnamed: 0,PLAYER_NAME,SEASON,W_PCT,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,W,L,Total_Minutes,ID
2,Aaron McKie,2000-01,0.671,Aaron,1610612755,PHI,51,25,2394.0,2
3,Aaron Williams,2000-01,0.317,Aaron,1610612751,NJN,26,56,2337.0,3
9,Allan Houston,2000-01,0.577,Allan,1610612752,NYK,45,33,2854.8,9
10,Allen Iverson,2000-01,0.704,Allen,1610612755,PHI,50,21,2974.9,10
12,Alvin Williams,2000-01,0.573,Alvin,1610612761,TOR,47,35,2394.4,12
...,...,...,...,...,...,...,...,...,...,...
11622,Tre Jones,2023-24,0.273,Tre,1610612759,SAS,21,56,2140.6,11622
11636,Tyrese Haliburton,2023-24,0.580,Tyrese,1610612754,IND,40,29,2221.8,11636
11637,Tyrese Maxey,2023-24,0.614,Tyrese,1610612755,PHI,43,27,2625.0,11637
11642,Victor Wembanyama,2023-24,0.268,Victor,1610612759,SAS,19,52,2108.7,11642


In [28]:
#Drops the index Columns from original dataset
df = df.drop(index_cols,axis=1)

In [29]:
#Normalizes data to 0 and 1
df_non_normalized = df[:]
for column in df.columns.to_list():
    df[column] = (df[column] - df[column].min())/(df[column].max()-df[column].min())#Normalization

In [36]:
#Converts data and answers to numpy array (Like int arrays in Java and C++)
answers = player_columns["W_PCT"].to_numpy()
data = df.to_numpy()

In [31]:
#check the number of categories in the dataframe
len(df.keys())

55

In [37]:
#Splits Test and Train Data
total_train_percentage = 70
train_data, test_data, train_answers, test_answers = train_test_split(
      data, answers, train_size=total_train_percentage/100, random_state=69
  )
train_answers = train_answers.reshape(-1, 1)
test_answers = test_answers.reshape(-1, 1)

In [38]:
#CREATE AND COMPILE MODEL HERE. Please name Model "model"
model = Sequential([
    Dense(64, activation='relu', input_shape=(data.shape[1],)), # Dense layer with 64 units
    # The dense layers are the actual neural network layers with nodes and activation functions
    Dense(32, activation='relu', name='HiddenLayer'), # Hidden layer with 64 nodes - feel free to change this hyper-parameter
    Dense(1, activation='linear')  # Output layer with 1 unit for regression
])

# The learning rate hyper-parameter controls how fast the model learns.
# Think of the learning rate as step size when trying to reach a particular optimal point
# The perfect learning rate will get you there fastest, without overshooting or taking too long
# For most applications, 0.001 or 0.0001 will work well
learning_rate = 0.001

# We need to define the loss function and optimizer for our model
# The compile() function allows the model to be trained afterward
model.compile(
    loss = MeanSquaredError(),
    optimizer = Adam(learning_rate=learning_rate)
)

# Check the dimensions of our training data to ensure it is ready to use
print(train_data.shape, train_answers.shape)

# Train the model using the fit() function for 25 epochs
# Epochs is another hyper-parameter that controls the iterations over the training data
# More epochs will fit the model closer and closer to the training data; however, too many epochs can lead to overfitting
# Overfitting occurs when a model focuses too much on the training data and not being able to generalize to the testing data
model.fit(train_data, train_answers, epochs=25)

(1811, 55) (1811, 1)
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.src.callbacks.History at 0x779f17306910>

In [None]:
# This runs the model on all the testing data (consisting of the images and labels)
results = model.evaluate(train_data, train_answers)

In [39]:
#Plots Data
winning_percentages = list(model.predict(data))
winning_percentages = [list(i)[0] for i in winning_percentages]
winning_percentages = numpy.array(winning_percentages)

plots = []
for column in df.columns.tolist():
  column_vals = df_non_normalized[column].to_numpy()
  name = player_columns["PLAYER_NAME"].to_list()
  season = player_columns["SEASON"].to_list()
  nameAndSeason = [f"{name[i]}\n{season[i]}" for i in range(len(name))]
  slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(column_vals,winning_percentages)
  print(f"""
  {column} stats:
  Winning Percentage Change Per One: {slope*100}%
  r: {r_value}
  r^2: {r_value*r_value}
  standard deviation: {std_err}""")
  columns_vals = list(column_vals)
  winnings_percentages = list(winning_percentages)
  data = {'x': column_vals, 'y': winnings_percentages, 'labels': nameAndSeason}
  source = ColumnDataSource(data)
  # Create the plot
  p = figure(title="Interactive Plot", tools="pan,wheel_zoom,box_zoom,reset", tooltips=[("Label", "@labels"), ("(x, y)", "(@x, @y)")],x_axis_label=column,y_axis_label="Estimated Winning Percentage")
  p.scatter(x='x', y='y', size=10, source=source)
  plots.append(p)
grid = bokeh.layouts.column(plots)  # Arranged in two rows and two columns
# place the plot on a preview website
show(grid)
output_notebook()




  FG3M stats:
  Winning Percentage Change Per One: 2.064555570389%
  r: 0.13946115038149076
  r^2: 0.01944941246572878
  standard deviation: 0.002882664339262517

  FG3A stats:
  Winning Percentage Change Per One: 0.6385625899138502%
  r: 0.11113876860502608
  r^2: 0.01235182588704153
  standard deviation: 0.0011228578088463782

  FG3_PCT stats:
  Winning Percentage Change Per One: 10.427384449650372%
  r: 0.10074424404033058
  r^2: 0.010149402707257685
  standard deviation: 0.020250031728964495

  FTM stats:
  Winning Percentage Change Per One: 0.8107599514683327%
  r: 0.09976815463628214
  r^2: 0.009953684679529104
  standard deviation: 0.0015900612147988944

  FTA stats:
  Winning Percentage Change Per One: 0.6055408925739734%
  r: 0.0923837962022898
  r^2: 0.008534765800746216
  standard deviation: 0.001283429913438388

  FT_PCT stats:
  Winning Percentage Change Per One: 9.635608628074275%
  r: 0.060960114539076134
  r^2: 0.0037161355646172815
  standard deviation: 0.031024919660