# Box Score Data (Step Two)

In [None]:
## libraries 
import pandas as pd 
import numpy as np 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import glm
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LassoCV
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn / Keras deprecation warnings —
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Understanding Box Score data 
- each line is a game associated with a player 
- season ID, Team ID, Player ID, Team Name, leagueID, confID
- we do not have much information for international players

In [None]:
# Box score data for all the possible players in US
# (file name is resolved relative to the current working directory)
box_data = pd.read_csv("Box Score Aggregates.csv")

# Box score data for international players
# NOTE(review): reading .xlsx requires an Excel engine to be installed — confirm environment
intern_data = pd.read_excel("International Stats.xlsx")

### Changing the variables

In [None]:
def ratings(df_aggregated):
    """Add per-30-minute and percentage columns to an aggregated box-score frame.

    The new columns are written onto ``df_aggregated`` itself (callers that keep a
    reference to the input will see them), and a cleaned copy is returned.

    Parameters
    ----------
    df_aggregated : pandas.DataFrame
        Must contain ``MP`` plus every numerator/denominator column named in the
        two tables below.

    Returns
    -------
    pandas.DataFrame
        The frame with the derived columns, where every NaN and every +/-inf
        (produced by a zero denominator) has been replaced by 0.

    Raises
    ------
    KeyError
        If one of the required source columns is missing.
    """
    # counting stat -> rate per 30 minutes played
    per_30_sources = {
        'FGM_30': 'FGM',
        'FGA_30': 'FGA',
        'TPM_30': 'TPM',
        'TPA_30': 'TPA',
        'FTM_30': 'FTM',
        'FTA_30': 'FTA',
        'FOULS_30': 'PERSONAL_FOULS',
        'BLOCKS_30': 'BLOCKS',
        'STEALS_30': 'STEALS',
        'ASSISTS_30': 'ASSISTS',
        'OREB_30': 'OREB',
        'DREB_30': 'DREB',
        'TOV_30': 'TURNOVERS',
    }
    for target, source in per_30_sources.items():
        df_aggregated[target] = 30 * df_aggregated[source] / df_aggregated['MP']

    # derived column -> (numerator column, denominator column)
    ratio_sources = {
        'FG_PCT': ('FGM', 'FGA'),
        'TP_PCT': ('TPM', 'TPA'),
        'FT_PCT': ('FTM', 'FTA'),
        'USG': ('USAGE_NUMERATOR', 'USAGE_DENOMINATOR'),
        'ASSIST_PCT': ('ASSISTS', 'ASSIST_PCT_DENOM'),
        'BLOCK_PCT': ('BLOCK_PCT_NUM', 'BLOCK_PCT_DENOM'),
        'OREB_PCT': ('OREB_PCT_NUM', 'OREB_PCT_DENOM'),
        'DREB_PCT': ('DREB_PCT_NUM', 'DREB_PCT_DENOM'),
        'STL_PCT': ('STL_PCT_NUM', 'STL_PCT_DENOM'),
        'TOV_PCT': ('TURNOVERS', 'TOV_PCT_DENOM'),
    }
    for target, (numerator, denominator) in ratio_sources.items():
        df_aggregated[target] = df_aggregated[numerator] / df_aggregated[denominator]

    # x/0 yields +/-inf (only 0/0 yields NaN); fillna alone would leave the
    # infinities in place and they would poison every later sum.
    return df_aggregated.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Derive the rate/percentage columns for the international data.
intern_data = ratings(intern_data)

# Free throws made per field-goal attempt, for both frames.
# This column is created after ratings() has already cleaned the frame, so a
# zero FGA would otherwise leave inf (or NaN for 0/0) behind; zero it here in
# the same way ratings() zeroes its own undefined ratios.
for frame in (intern_data, box_data):
    ftm_rate = frame['FTM'] / frame['FGA']
    frame['FTM_Rate'] = ftm_rate.replace([np.inf, -np.inf], np.nan).fillna(0)

### Making the Data Match

In [None]:
# Align the international key column name with the US box-score data.
intern_data = intern_data.rename(columns={'Player ID': 'PLAYER_ID'})

In [None]:
# Column names of each frame, as plain Python lists.
box = box_data.columns.tolist()
intern = intern_data.columns.tolist()

In [None]:
## columns that exist in only one of the two frames (order is not significant)
intern_drop = list(set(intern).difference(box))
box_drop = list(set(box).difference(intern))

In [None]:
# Keep only the columns both frames have in common.
intern_data = intern_data.drop(columns=intern_drop)
box_data = box_data.drop(columns=box_drop)

In [None]:
## keep the identifiers plus the engineered features, in the same order for both frames
model_columns = [
    'PLAYER_ID', 'SEASON', 'PLAYER_GAMES',
    'FGM_30', 'FGA_30', 'TPM_30', 'TPA_30', 'FTM_30', 'FTA_30',
    'FOULS_30', 'BLOCKS_30', 'STEALS_30', 'ASSISTS_30', 'OREB_30', 'DREB_30',
    'TOV_30', 'FG_PCT', 'TP_PCT', 'FT_PCT', 'USG', 'ASSIST_PCT', 'BLOCK_PCT', 'OREB_PCT',
    'DREB_PCT', 'FTM_Rate', 'STL_PCT', 'TOV_PCT',
]
intern_data = intern_data.loc[:, model_columns]
box_data = box_data.loc[:, model_columns]

## Next Steps

### Get NBA Players for Both DataFrames

In [None]:
## read the autostats player_id data and discard the saved index column
nba = pd.read_csv("pick_num_data.csv")
nba = nba.drop(columns=['Unnamed: 0'])

In [None]:
# Number of distinct players in the pick-number data (displayed as the cell output).
nba['PLAYER_ID'].nunique()

In [None]:
# Second player list; later cells read PLAYER, PLAYER_ID, START_SEASON and PICK_NUMBER from it.
nba2 = pd.read_csv('NBA Caliber Players.csv')

In [None]:
# Number of distinct values in the PLAYER column (displayed as the cell output).
nba2['PLAYER'].nunique()

In [None]:
## keep only players whose START_SEASON is after 2012, and only the two columns
## needed for the merges. drop_duplicates guards against a player appearing on
## several rows: a repeated PLAYER_ID here would duplicate that player's
## box-score rows in an inner merge on PLAYER_ID and inflate every later sum.
nba2 = (
    nba2.loc[nba2['START_SEASON'] > 2012, ['PLAYER_ID', 'START_SEASON']]
    .drop_duplicates()
)

In [None]:
# NOTE(review): this repeats the earlier count on `nba`; the frame that was just
# filtered is `nba2` — confirm which one was meant to be inspected here.
nba['PLAYER_ID'].nunique()

In [None]:
## inner-join on PLAYER_ID so that only players listed in nba2 remain in each frame
box_data = nba2.merge(box_data, how='inner', on='PLAYER_ID')
intern_data = nba2.merge(intern_data, how='inner', on='PLAYER_ID')

In [None]:
# Distinct players left in the US box-score data after the merge.
box_data['PLAYER_ID'].nunique()

In [None]:
# Distinct players left in the international data after the merge.
intern_data['PLAYER_ID'].nunique()

### Aggregating each Dataframe

In [None]:
## sums the columns per player and per season, giving one row for each season played.
## SEASON has to be a grouping key: if it is left out it gets summed like a statistic
## and every player collapses to a single row, which makes the per-season share
## ('percent') and the recency rank computed from SEASON further down meaningless.
season_keys = ['PLAYER_ID', 'SEASON', 'START_SEASON']

box_data = box_data.groupby(season_keys, as_index=False).sum()

intern_data = intern_data.groupby(season_keys, as_index=False).sum()

In [None]:
## sums the columns based on player id and season to get a sum of the overall season 
# box_data = pd.DataFrame(box_data.groupby(['PLAYER_ID', 'SEASON','PICK_NUMBER','START_SEASON']).agg('sum').reset_index())

# intern_data = pd.DataFrame(intern_data.groupby(['PLAYER_ID', 'SEASON','PICK_NUMBER', 'START_SEASON']).agg('sum').reset_index())

In [None]:
## columns that need an average: everything except the identifiers and the
## games-played divisor. PLAYER_GAMES must be excluded, otherwise the averaging loop
## divides it by itself first (turning it into 1) and then divides every other
## column by 1, so nothing is averaged and the games count is lost.
cols = box_data.drop(['PLAYER_ID', 'SEASON', 'START_SEASON', 'PLAYER_GAMES'], axis=1)

In [None]:
# Cast every column of both frames to floating point.
box_data, intern_data = box_data.astype(float), intern_data.astype(float)

In [None]:
## per-game average: divide each column named in `cols` by that row's PLAYER_GAMES.
## The division is done column by column, re-reading PLAYER_GAMES each time.
## NOTE(review): PLAYER_GAMES itself must not be one of `cols`, or it becomes 1
## part-way through and the remaining columns are divided by 1.
for frame in (box_data, intern_data):
    for column in cols:
        frame[column] = frame[column] / frame['PLAYER_GAMES']

In [None]:
## share of a player's total games that each of that player's rows represents
box_totals = box_data.groupby('PLAYER_ID')['PLAYER_GAMES'].sum().to_frame()
a = box_data.merge(box_totals, on='PLAYER_ID')  # row games -> *_x, player total -> *_y
box_data['percent'] = a['PLAYER_GAMES_x'] / a['PLAYER_GAMES_y']

## same share for the international data
intern_totals = intern_data.groupby('PLAYER_ID')['PLAYER_GAMES'].sum().to_frame()
a2 = intern_data.merge(intern_totals, on='PLAYER_ID')
intern_data['percent'] = a2['PLAYER_GAMES_x'] / a2['PLAYER_GAMES_y']

In [None]:
## stat columns to be weighted by the games share (identifiers and helpers removed)
num_cols = box_data.drop(
    columns=['PLAYER_ID', 'SEASON', 'START_SEASON', 'PLAYER_GAMES', 'percent']
)

In [None]:
## scale every stat column by the row's share of the player's games, in both frames
for frame in (box_data, intern_data):
    for column in num_cols:
        frame[column] = frame[column] * frame['percent']

In [None]:
## dense rank of SEASON within each player, descending, so the largest SEASON gets rank 1
seasons_by_player = box_data.groupby('PLAYER_ID')['SEASON']
box_data['rank'] = seasons_by_player.rank(method='dense', ascending=False)

In [None]:
## multiplier for each row, chosen from its rank: rank 1 -> 2, rank 2 -> 3, rank 3+ -> 5
## NOTE(review): the most recent season gets the smallest multiplier — confirm that
## this direction is intended.
if box_data['rank'].isna().any():
    # A missing rank matched no branch before and was skipped silently, leaving the
    # list shorter than the frame; fail here with a clear message instead.
    raise ValueError("box_data['rank'] contains missing values")

rank_multiplier = {1: 2, 2: 3}
# Iterating the column directly avoids positional .loc[i] lookups, which only
# work when the frame has a default 0..n-1 index.
percentage = [rank_multiplier.get(season_rank, 5) for season_rank in box_data['rank']]

In [None]:
## sanity check: the two printed lengths should be equal
for sized in (percentage, box_data):
    print(len(sized))

In [None]:
## stores the per-row rank multiplier built above as the Rank_Value column
## (the list must have exactly one entry per row of box_data)
box_data['Rank_Value'] = percentage

In [None]:
## stat columns to be scaled by the rank multiplier. The identifiers and helper
## columns are excluded — in particular Rank_Value itself, which would otherwise
## be multiplied by itself, and 'percent' / START_SEASON, which are not statistics.
num_col = box_data.drop(['PLAYER_ID', 'SEASON', 'START_SEASON', 'PLAYER_GAMES',
                         'percent', 'rank', 'Rank_Value'], axis=1)

## multiplies each row by its corresponding multiplier
## NOTE(review): no equivalent weighting is applied to intern_data — confirm intended.
for i in num_col:
    box_data[i] = box_data[i] * box_data['Rank_Value']

In [None]:
# Re-read the full player list (the earlier nba2 was reduced to two columns) and
# build a de-duplicated PLAYER_ID -> PICK_NUMBER lookup.
nba2 = pd.read_csv('NBA Caliber Players.csv')
NBA = nba2.loc[:, ['PLAYER_ID', 'PICK_NUMBER']].drop_duplicates()

In [None]:
## grab columns we will put in later 
# cats = box_data[['PLAYER_ID','PICK_NUMBER']]
# cats = box_data.groupby(['PLAYER_ID']).agg({'PICK_NUMBER':'first'}).reset_index()

# ## grab columns we will put in later 
# cats2 = intern_data[['PLAYER_ID','PICK_NUMBER']]
# cats2 = intern_data.groupby(['PLAYER_ID']).agg({'PICK_NUMBER':'first'}).reset_index()

In [None]:
## remove the helper columns so that only PLAYER_ID, PLAYER_GAMES and the stats remain
box_data = box_data.drop(columns=['SEASON', 'percent', 'rank', 'Rank_Value', 'START_SEASON'])

## the international frame never received 'rank' / 'Rank_Value'
intern_data = intern_data.drop(columns=['SEASON', 'percent', 'START_SEASON'])

In [None]:
## collapse to one row per player by summing every remaining column
box = box_data.groupby('PLAYER_ID', as_index=False).sum()
international = intern_data.groupby('PLAYER_ID', as_index=False).sum()

In [None]:
## attach PICK_NUMBER to each player; players absent from NBA are dropped (inner join)
box = box.merge(NBA, how='inner', on='PLAYER_ID')
international = international.merge(NBA, how='inner', on='PLAYER_ID')

### Combining International and US 

In [None]:
# Expected number of rows once the two frames are stacked.
len(box) + len(international)

In [None]:
# Stack US and international rows and renumber the index from 0.
combined = pd.concat([box, international], ignore_index=True)

In [None]:
# Save the combined frame; the index is written as an extra unnamed first column.
combined.to_csv('box_data.csv')

## Set up data for modeling

In [None]:
# Distinct PICK_NUMBER values before binning (displayed as the cell output).
combined['PICK_NUMBER'].unique()

In [None]:
# putting the Pick Number into the bins
# Each inclusive (low, high) range is relabelled 'low-high'. Values outside 1-60 and
# missing values are left untouched.
pick_bins = {}
for low, high in [(1, 4), (5, 8), (9, 12), (13, 18), (19, 30), (31, 60)]:
    pick_bins.update(dict.fromkeys(range(low, high + 1), f'{low}-{high}'))

# Assign the result back instead of calling replace(..., inplace=True) on the column
# selection: the chained in-place form is deprecated and does not modify `combined`
# when pandas copy-on-write is enabled.
combined['PICK_NUMBER'] = combined['PICK_NUMBER'].replace(pick_bins)


In [None]:
# Rows with no PICK_NUMBER are labelled 'Undrafted'.
combined['PICK_NUMBER'] = combined['PICK_NUMBER'].fillna('Undrafted')

In [None]:
# Overwrites the earlier box_data.csv, now with the binned PICK_NUMBER labels.
combined.to_csv("box_data.csv")

In [None]:
# Feature matrix (every column except the target and the identifier) and target labels.
# NOTE(review): `college_box` is never assigned in the visible notebook — the frame
# built above is `combined`. Confirm where `college_box` comes from; the model below
# assumes 6 classes, whereas `combined` also contains the 'Undrafted' label.
X = college_box.drop(['PICK_NUMBER', 'PLAYER_ID'], axis=1)
y = college_box['PICK_NUMBER']

## PCA

In [None]:
## mean-normalise every feature: subtract the column mean, divide by the column range
## NOTE(review): the statistics come from the full data set, before the train/test split.
feature_range = X.max() - X.min()
X = X.sub(X.mean()).div(feature_range)

In [None]:
# split into train and test sets with a fixed seed, then renumber each piece from 0
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

for split_part in (y_train, y_test, X_train, X_test):
    split_part.reset_index(drop=True, inplace=True)

In [None]:
# Fit a 22-component PCA on the full normalised feature matrix (fit returns the estimator).
pca_all = PCA(n_components=22).fit(X)

In [None]:
# Cumulative explained variance (in percent) against the number of components kept.
cumulative_variance = np.cumsum(pca_all.explained_variance_ratio_ * 100)
# Plot against 1..k: with the default 0-based x values the point for "k components"
# would sit at x = k - 1 and the axis label would be off by one.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.grid()
plt.plot(component_counts, cumulative_variance)
plt.xlabel('Number of components')
plt.ylabel('Explained variance')

In [None]:
# fit a 10-component PCA on the training features only, then project both sets
pca_10 = PCA(n_components=10).fit(X_train)
X_train_reduced, X_test_reduced = (
    pca_10.transform(part) for part in (X_train, X_test)
)

# verify shape after PCA
print("Shape:", X_train_reduced.shape)
print("Test images shape: ", X_test_reduced.shape)

# total share of variance kept by the 10 components, in percent
retained = np.sum(pca_10.explained_variance_ratio_ * 100)
print("\nVar retained (%):", retained)

## Neural Network

### W/ PCA

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
import keras

In [None]:
# Cast the labels to strings for the encoder. The labels must NOT be sorted here:
# sorting reorders them relative to the rows of X_train / X_test, so every sample
# would be paired with another sample's label during training and in the confusion
# matrix. LabelEncoder orders the classes itself.
y_train = y_train.astype('string')
y_test = y_test.astype('string')

In [None]:
# learn the label -> integer mapping from the training labels and apply it
encoder = LabelEncoder()
encoded_Y_train = encoder.fit_transform(y_train)
# one-hot encode the integer codes for the softmax output layer
dummy_y_train = np_utils.to_categorical(encoded_Y_train)


In [None]:
## maps the integer codes back to their original labels
## (the trailing semicolon suppresses the notebook output)
list(encoder.inverse_transform(encoded_Y_train));

In [None]:
# encode labels as integers for y_test
# Reuse the encoder fitted on the training labels. Fitting a second encoder on
# y_test can assign different integers to the same label (and a different one-hot
# width) whenever the test split does not contain every class.
# transform() raises ValueError if y_test holds a label never seen in training.
encoded_Y_test = encoder.transform(y_test)
# convert integers to dummy variables (i.e. one hot encoded), with the same number
# of columns as the training encoding
dummy_y_test = np_utils.to_categorical(encoded_Y_test, num_classes=len(encoder.classes_))

In [None]:
# 100 / (number of training rows per label): larger for rarer labels.
1/y_train.value_counts()*100

In [None]:
from sklearn.utils import class_weight
# 'balanced' weights, one per class, in the order of np.unique(y_train).
# Note that the name class_weight is rebound from the module to the resulting array.
class_weight = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)

In [None]:
# Display the computed weight array.
class_weight

In [None]:
# Build the {class index: weight} mapping from the computed array instead of pasting
# in the numbers printed by a previous run, so the weights always match the current
# training split. The array is ordered by np.unique(y_train), i.e. sorted labels.
class_weights = {
    label_index: float(weight) for label_index, weight in enumerate(class_weight)
}

In [None]:
#class_weights = {0: 3.448276, 1: 2.5, 2:1.449275, 3: 0.625, 4: 5.0, 5: 3.571429, 6:0.104493}

In [None]:
# network on the PCA-reduced features: one 10-unit ReLU hidden layer followed by a
# 6-unit softmax output
model = Sequential([
    Dense(10, input_shape=(X_train_reduced.shape[1],), activation='relu'),  # input shape is (features,)
    Dense(6, activation='softmax'),
])
model.summary()

# multi-class targets are one-hot encoded, hence categorical (not binary) cross-entropy
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# stop once the validation loss has not improved for 10 consecutive epochs and
# restore the weights of the best epoch
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True,
)

# training options: up to 1000 epochs, class-weighted loss, 20% validation split
fit_options = dict(
    callbacks=[es],
    epochs=1000,
    class_weight=class_weights,
    batch_size=10,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
)
history = model.fit(X_train_reduced, dummy_y_train, **fit_options)

In [None]:
history_dict = history.history

# per-epoch series recorded during training
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# x axis: epoch numbers starting at 1
epochs = range(1, len(acc) + 1)

# training accuracy as a solid red line, validation accuracy as a solid blue line
curves = (
    (acc, 'r', 'Training accuracy'),
    (val_acc, 'b', 'Validation accuracy'),
)
for series, line_format, legend_label in curves:
    plt.plot(epochs, series, line_format, label=legend_label)

plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Predictions are made on the TRAINING features, so the matrix below measures fit,
# not performance on unseen data.
preds = model.predict(X_train_reduced) # one row of softmax probabilities per sample
print(preds[4]) # probabilities for one sample, spread across the 6 output nodes
print(np.sum(preds[0])) # a softmax row should sum to 1

# index of the most probable class for each sample
predictions = preds.argmax(axis=1)

# rows: true encoded labels, columns: predicted class indices
cm = confusion_matrix(encoded_Y_train, predictions)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt     

# Draw the confusion matrix `cm` as an annotated heatmap.
ax= plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax);  # annot=True writes the count in each cell; fmt='g' disables scientific notation

# labels, title and ticks
# NOTE(review): the tick labels are hard-coded in string-sorted order, which is
# presumably the class order used by the label encoder — confirm they still match.
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix'); 
ax.xaxis.set_ticklabels(['1-4', '13-18', '19-30', '31-60', '5-8', '9-12']); ax.yaxis.set_ticklabels(['1-4', '13-18', '19-30', '31-60', '5-8', '9-12']);

### W/o PCA

In [None]:
# Plain NumPy copies of the un-reduced feature frames for the second network.
X_train_array, X_test_array = np.array(X_train), np.array(X_test)

In [None]:
# network on the full feature set: one 22-unit ReLU hidden layer followed by a
# 6-unit softmax output
model2 = Sequential([
    Dense(22, input_shape=(X_train_array.shape[1],), activation='relu'),  # input shape is (features,)
    Dense(6, activation='softmax'),
])
model2.summary()

# multi-class targets are one-hot encoded, hence categorical (not binary) cross-entropy
model2.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# stop once the validation loss has not improved for 10 consecutive epochs and
# restore the weights of the best epoch
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True,
)

# training options: up to 5000 epochs, class-weighted loss, 20% validation split
fit_options = dict(
    callbacks=[es],
    epochs=5000,
    class_weight=class_weights,
    batch_size=10,
    shuffle=True,
    validation_split=0.2,
    verbose=1,
)
history = model2.fit(X_train_array, dummy_y_train, **fit_options)

In [None]:
history_dict = history.history

# per-epoch series recorded during training
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# x axis: epoch numbers starting at 1
epochs = range(1, len(acc) + 1)

# training accuracy as a solid red line, validation accuracy as a solid blue line
curves = (
    (acc, 'r', 'Training accuracy'),
    (val_acc, 'b', 'Validation accuracy'),
)
for series, line_format, legend_label in curves:
    plt.plot(epochs, series, line_format, label=legend_label)

plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:

# Confusion matrix for the network trained WITHOUT PCA. It has to be computed from
# model2's own predictions; re-plotting `cm` would show the PCA model's results
# again. As in the PCA section, predictions are made on the training features.
predictions2 = model2.predict(X_train_array).argmax(axis=1)
cm2 = confusion_matrix(encoded_Y_train, predictions2)

ax = plt.subplot()
sns.heatmap(cm2, annot=True, fmt='g', ax=ax);  # annot=True writes the count in each cell; fmt='g' disables scientific notation

# labels, title and ticks (tick labels are hard-coded in string-sorted label order)
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels');
ax.set_title('Confusion Matrix');
ax.xaxis.set_ticklabels(['1-4', '13-18', '19-30', '31-60', '5-8', '9-12']); ax.yaxis.set_ticklabels(['1-4', '13-18', '19-30', '31-60', '5-8', '9-12']);