In [1]:
import math
import json
import xgboost
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split

### Load Excel Spreadsheet Containing Various Player Data as Sheets

In [2]:
xls = pd.ExcelFile("master_datasheet.xlsx")
sentiment = pd.read_csv("nba_games_data_sentimentanalysis_weighted.csv")

with open('reddit_sentiment_scores.json', 'r') as f:
     reddit_polarity = json.load(f)

# Print Player Sheet Names for Validation
sheetnames = []
for sheets in xls.sheet_names:
#     if "Games" in sheets:
        sheetnames.append(sheets)
        
print(sheetnames)

['README', 'TeamStat - Seasons 16-21', 'James Harden', 'Trevor Ariza', 'Eric Gordon', 'Ryan Anderson', 'Patrick Beverly', 'PJ Tucker', 'Clint Capela', 'Chris Paul', 'Russell Westbrook', 'Danuel House Jr.', 'Ben McLemore', 'JaeSean Tate', 'Christian Wood', 'John Wall', 'Sterling Brown', 'Kenyon Martin Jr.', 'Jalen Green', 'Kevin Porter Jr.', 'Garrison Matthews']


In [3]:
# Extract Player Historical Games
player_sheets = ['James Harden', 'Trevor Ariza', 'Eric Gordon', 'Ryan Anderson', 'Patrick Beverly', 
                 'PJ Tucker', 'Clint Capela', 'Chris Paul', 'Russell Westbrook', 'Danuel House Jr.', 
                 'Ben McLemore', 'JaeSean Tate', 'Christian Wood', 'John Wall', 'Sterling Brown', 
                 'Kenyon Martin Jr.', 'Jalen Green', 'Kevin Porter Jr.', 'Garrison Matthews']

# Load as Data Frames of Players as Map
player_dfs = {}

for player in player_sheets:
    # Parse into DF
    df = xls.parse(player)
    # Extract Player Name
#     player_name = " ".join(player.split(" ")[0:-2])
#     player_dfs[player_name] = df
    player_dfs[player] = df

In [4]:
# Transform Data into Match Data Point by Play Dates
matches_dfs = {}

# Y Data / Target Metrics
matches_outcome = {}

# Target Metrics / Predictor - Modify To Add More
target_predictors = ["FG%", "3P%", "FT%"]

# Enumerate Player Numbers
player_enumeration = {}
for idx, player in enumerate(player_dfs.keys()):
    player_enumeration[player] = idx
    
# Enumerate Opponent Numbers
opponent_count = 0
opponent_enumeration = {}


# Helper Functions
def FixNaN(value):
    return 0.0 if np.isnan(float(value)) else value

# Iterate Through All Players 
for player in player_dfs:
    # Go Through All Plays per Player - Record Contributions of Player per Game
    for idx, row in player_dfs[player].iterrows():
        # Extract Game Date
        game_date = str(row.Date).split(" ")[0]
            
        # Populate and Retrieve Opponent Enumeration
        if row.Opp not in opponent_enumeration:
            opponent_enumeration[row.Opp] = opponent_count
            # Update Count
            opponent_count += 1
            
        # Set Player Stats
        if row.Tm == 'HOU': #played for HOU not other teams
            if sum([FixNaN(x) for x in row[target_predictors]]) > 0.01:
                # Initialize Game Record if Not Existing
                if game_date not in matches_dfs:
                    matches_dfs[game_date] = {}
                    matches_dfs[game_date]["opponent"] = opponent_enumeration[row.Opp]
                    matches_dfs[game_date]["sentiment"] = 0
                    matches_dfs[game_date]["reddit_polarity"] = 0
            
                matches_dfs[game_date][player] = [FixNaN(x) for x in row[target_predictors]]
                # Add Player Enumeration
                matches_dfs[game_date][player] = [player_enumeration[player], *matches_dfs[game_date][player]]

                # Set Match Outcome as Y_Data or Target Data
                matches_outcome[game_date] = int(row["Unnamed: 7"].split("(")[1].split(")")[0])
                
# Process Twitter Sentiment Data
for idx, record in sentiment.iterrows():
    if record.Date in matches_dfs:
        matches_dfs[record.Date]["sentiment"] = record["vader_compound"]
            

# Add Reddit Polarity Scores
for date in reddit_polarity:
    if date in matches_dfs:
        matches_dfs[date]["reddit_polarity"] = reddit_polarity[date]
    

# Clean Data of Degenerate Records
cleaned_matches_dfs = {}
cleaned_matches_outcome = {}

for idx, match in enumerate(matches_dfs):
    if len(matches_dfs[match].keys()) > 1:
        cleaned_matches_dfs[match] = matches_dfs[match]
        cleaned_matches_outcome[match] = matches_outcome[match]

In [5]:
assert len(cleaned_matches_dfs) == len(cleaned_matches_outcome)

In [12]:
# Get dates for reddit API
# pd.DataFrame({'date' : list(cleaned_matches_dfs.keys())
#              }).to_csv('reddit_sentiment.csv', index=False)

In [77]:
# Considering that we have 2 datasets, we have to synchronize the ordering before performing a train-test-split
# Prepare Dataframes for ML Model Traininig
matches_ordering = []
X_data = []
y_data = []

# Helper Function to Flatten Map
def flatten(d):    
    arr = []
    if isinstance(d, dict):
        for key, val in d.items():

            arr.extend(flatten(val))
    elif isinstance(d, list):
        arr = d   
    elif isinstance(d, int):
        arr = [d]
    return arr

# Create X Data
for match in cleaned_matches_dfs:
    # Append to Order
    matches_ordering.append(match)
    
    # Generate Template Data
    one_hot_array = [[0,0,0]] * len(player_enumeration.keys())
    
    for player in cleaned_matches_dfs[match]:
        if player is not "opponent" and player is not "sentiment" and player is not "reddit_polarity":
            player_idx, *data = cleaned_matches_dfs[match][player]
            # Embed
            one_hot_array[player_idx] = data
    
    # Expand and Flatten Map
    arr = np.array(one_hot_array).flatten()
    X_data.append([cleaned_matches_dfs[match]["sentiment"], cleaned_matches_dfs[match]["reddit_polarity"], 
                   cleaned_matches_dfs[match]["opponent"], *arr])
    
# Create Y Data (Target Vars)
y_data = [cleaned_matches_outcome[match] for match in matches_ordering]

# Check Data
pd.DataFrame(X_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,0.000000,-0.069602,0,0.474,0.143,0.938,0.250,0.250,0.000,0.467,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1,0.000000,0.138927,1,0.500,0.444,0.800,0.667,0.714,1.000,0.444,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
2,0.000000,-0.069643,1,0.348,0.333,0.750,0.500,0.500,0.000,0.313,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
3,0.000000,0.025595,2,0.650,0.556,0.714,0.364,0.300,0.000,0.462,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
4,0.000000,0.027588,3,0.563,0.417,0.875,0.300,0.200,0.500,0.545,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
5,0.000000,0.046875,4,0.526,0.500,0.857,0.400,0.286,0.000,0.273,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
6,0.000000,-0.077778,5,0.474,0.429,1.000,0.455,0.444,1.000,0.400,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
7,0.000000,0.153125,6,0.474,0.200,0.714,0.500,0.200,0.000,0.357,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
8,0.000000,-0.025714,6,0.389,0.222,0.818,0.385,0.364,0.500,0.625,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
9,0.000000,0.251374,7,0.600,0.500,1.000,0.429,0.455,0.000,0.500,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [79]:
X_data = pd.DataFrame(X_data)

In [80]:
# X_data = X_data.drop([1], axis=1)

In [81]:
# Training Test Split 80-20 In this Case
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2,
                                                        random_state=1993)

In [75]:
# Initialize and Perform RFR (Limit our Max_Features)
regressor = RandomForestRegressor(n_estimators=100, random_state=1234, 
                                  max_features="log2", bootstrap=False)  
regressor.fit(X_train, y_train)  
# Use Regressor to Predict and Evaluate
y_pred = regressor.predict(X_test)

# Print Statistics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(
                                          y_test, y_pred))) 

print('Accuracy:', metrics.accuracy_score([x < 0 for x in y_test], [x < 0 for x in y_pred]))

Mean Absolute Error: 9.311789473684211
Mean Squared Error: 150.83592421052631
Root Mean Squared Error: 12.28152776369969
Accuracy: 0.6842105263157895


In [82]:
# Test Out Classification Model
X_train, X_test, y_train, y_test = train_test_split(X_data, [x < 0 for x in y_data], test_size=0.2,
                                                        random_state=2134)

classifier = RandomForestClassifier(n_estimators=100, random_state=1993, 
                                  max_features="log2", bootstrap=False)  
classifier.fit(X_train, y_train)  
# Use Regressor to Predict and Evaluate
y_pred = classifier.predict(X_test)

# Print Statistics
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
print('F1:', metrics.f1_score(y_test, y_pred))


Accuracy: 0.7157894736842105
F1: 0.6746987951807228


In [83]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2,
                                                        random_state=1334)

XGBModel = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.07, gamma=0, 
                              subsample=0.7, colsample_bytree=1, max_depth=12,
                              min_child_weight=1.8)

XGBModel.fit(X_train, y_train)  
# Use Regressor to Predict and Evaluate
y_pred = XGBModel.predict(X_test)

# Print Statistics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(
                                          y_test, y_pred))) 

print('Accuracy:', metrics.accuracy_score([x < 0 for x in y_test], [x < 0 for x in y_pred]))
print('F1:', metrics.f1_score([x < 0 for x in y_test], [x < 0 for x in y_pred]))

Mean Absolute Error: 9.485509202903822
Mean Squared Error: 154.7461131840959
Root Mean Squared Error: 12.439699079322454
Accuracy: 0.7473684210526316
F1: 0.7391304347826085


In [24]:
def bayes_tune(max_depth, m_features, n_estimators , m_criterion):
    
    max_features = ["auto", "sqrt", "log2"][int(m_features)]
    criterion = ["squared_error", "absolute_error"][int(m_criterion)]
    
    regressor = RandomForestRegressor(max_depth = int(max_depth),
                                      n_estimators = int(n_estimators),
                                      max_features = max_features,
                                      criterion = criterion,
                                      bootstrap=False) 
    
    regressor.fit(X_train, y_train)  
    # Use Regressor to Predict and Evaluate
    y_pred = regressor.predict(X_test)

    # Print Statistics
    return -np.sqrt(metrics.mean_squared_error(y_test, y_pred))

optimized_search = BayesianOptimization(bayes_tune, {
                                            'max_depth': (3, 50), 
                                            'm_features': (0,2),
                                            'm_criterion': (0.0, 1.5),
                                            'n_estimators': (100, 200)
                                        })

In [44]:
# Run Bayes Opt
optimized_search.maximize(n_iter = 100, init_points = 2)

|   iter    |  target   | m_crit... | m_feat... | max_depth | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.5413  [0m | [0m 1.496   [0m | [0m 0.5376  [0m | [0m 4.737   [0m | [0m 186.7   [0m |
| [95m 2       [0m | [95m-0.4766  [0m | [95m 1.167   [0m | [95m 0.9444  [0m | [95m 49.3    [0m | [95m 153.9   [0m |
| [0m 3       [0m | [0m-0.5613  [0m | [0m 0.0289  [0m | [0m 0.6961  [0m | [0m 49.16   [0m | [0m 153.2   [0m |
| [0m 4       [0m | [0m-0.4826  [0m | [0m 1.482   [0m | [0m 0.9016  [0m | [0m 20.47   [0m | [0m 137.3   [0m |
| [95m 5       [0m | [95m-0.4461  [0m | [95m 1.25    [0m | [95m 1.588   [0m | [95m 17.33   [0m | [95m 122.3   [0m |
| [95m 6       [0m | [95m-0.4244  [0m | [95m 0.9491  [0m | [95m 1.677   [0m | [95m 42.24   [0m | [95m 117.1   [0m |
| [95m 7       [0m | [95m-0.4148  [0m | [95m 0.4168  [0m | [95m 1.065   [0m | [95m 30.22   [

| [0m 67      [0m | [0m-0.4273  [0m | [0m 0.5121  [0m | [0m 1.178   [0m | [0m 43.42   [0m | [0m 113.2   [0m |
| [0m 68      [0m | [0m-0.5628  [0m | [0m 0.7552  [0m | [0m 0.9995  [0m | [0m 41.77   [0m | [0m 112.9   [0m |
| [0m 69      [0m | [0m-0.4242  [0m | [0m 0.3605  [0m | [0m 1.308   [0m | [0m 44.69   [0m | [0m 113.3   [0m |
| [0m 70      [0m | [0m-0.4249  [0m | [0m 0.01944 [0m | [0m 1.915   [0m | [0m 43.63   [0m | [0m 114.0   [0m |
| [0m 71      [0m | [0m-0.56    [0m | [0m 0.2536  [0m | [0m 0.2803  [0m | [0m 44.82   [0m | [0m 113.9   [0m |
| [0m 72      [0m | [0m-0.4402  [0m | [0m 1.279   [0m | [0m 1.804   [0m | [0m 44.46   [0m | [0m 112.8   [0m |
| [0m 73      [0m | [0m-0.4157  [0m | [0m 0.03991 [0m | [0m 1.855   [0m | [0m 22.77   [0m | [0m 172.3   [0m |
| [0m 74      [0m | [0m-0.4191  [0m | [0m 0.005153[0m | [0m 1.809   [0m | [0m 45.17   [0m | [0m 112.7   [0m |
| [0m 75      [0m | [

In [25]:
# Execute Best Result
optimized_search.max

{}

In [26]:
# Training Test Split 80-20 In this Case
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2,
                                                        random_state=20)
# Initialize and Perform RFR (Limit our Max_Features)
regressor = RandomForestRegressor(n_estimators=103, max_depth=15, 
                                  max_features="log2", bootstrap=False)  
regressor.fit(X_train, y_train)  
# Use Regressor to Predict and Evaluate
y_pred = regressor.predict(X_test)

# Print Statistics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(
                                          y_test, y_pred))) 

print('Accuracy:', metrics.accuracy_score([x < 0 for x in y_test], [x < 0 for x in y_pred]))

Mean Absolute Error: 10.178994589502127
Mean Squared Error: 163.86955044687053
Root Mean Squared Error: 12.801154262287074
Accuracy: 0.7263157894736842


In [None]:
# play by play (this game is not in the initial dataset, if you want, i can get data from an older game, 
# but it wont be clean anyways, coz we didnt have stat from everyone who played in those games - just the top few)
with open('play-by-play-2022-04-10.json', 'r') as f:
     reddit_texts = json.load(f)
        
# players appeared in this game (9 players)
#   'A. Şengün','D. Nwaba','G. Mathews','J. Christopher','J. Green', 
#   'J. Tate', 'K. Martin','K. Porter', 'U. Garuba'

# players in the model        
#   'James Harden', 'Trevor Ariza', 'Eric Gordon', 'Ryan Anderson', 'Patrick Beverly', 
#   'PJ Tucker', 'Clint Capela', 'Chris Paul', 'Russell Westbrook', 'Danuel House Jr.', 
#   'Ben McLemore', 'JaeSean Tate', 'Christian Wood', 'John Wall', 'Sterling Brown', 
#   'Kenyon Martin Jr.', 'Jalen Green', 'Kevin Porter Jr.', 'Garrison Matthews'

# Overlaps (4): 
# G.Mathews = Garrison Matthews (spelling error from another person)
# J. Green = Jalen Green
# J. Tate = JaeSean Tate
# K. Porter = Kevin Porter Jr.

In [None]:
# use best model to predict play-by-play outcome - show win/loss trajectory
