In [1]:
import boto3
import pandas as pd
import numpy as np
from time import sleep
import tensorflow as tf
from nba_api.stats.endpoints import leaguegamelog, playbyplayv2
dynamoDB = boto3.resource('dynamodb', region_name = 'us-east-2')
game_log_db = dynamoDB.Table('game_log')

In [2]:
from datetime import datetime
from nba_api.stats.endpoints import playbyplayv2


def str_to_time(str1):
    time_ = datetime.strptime(str1, "%M:%S")
    return time_.second + time_.minute*60


def home_poss(d):
    if (d['home_true'] == 1) & (d['visitor_true']==0):
        return 1
    elif (d['home_true'] == 0) & (d['visitor_true']==1):
        return 0
    else:
        if d['block'] or d['steal']:
            return 1
        else:
            return 0
        
def find_seconds_left(x):
    if x == 1:
        return 3*720
    elif x == 2: 
        return 2*720
    elif x == 3:
        return 720
    else:
        return 0
    
def load_game(game_id):
    pbp = playbyplayv2.PlayByPlayV2(game_id).get_data_frames()[0]
    home_team_name = pbp['PLAYER1_TEAM_ABBREVIATION'].dropna().iloc[0]
    return pbp, home_team_name

def feature_engineer(df):
    pbp = df.copy()
    pbp[['home_true', 'visitor_true']] = pbp[['HOMEDESCRIPTION','VISITORDESCRIPTION']].notnull().astype(int)
    pbp['block'] = pbp['HOMEDESCRIPTION'].str.contains("BLOCK").fillna(False)
    pbp['steal'] = pbp['HOMEDESCRIPTION'].str.contains("STEAL").fillna(False)

    pbp['home_poss'] = pbp.apply(home_poss, axis = 1)
    pbp['diff'] = pbp['SCOREMARGIN'].ffill().fillna(0).replace({'TIE':0}).astype(int)
    pbp['OT_ind'] = (pbp['PERIOD']-4).clip(lower=0)

    pbp['seconds'] = pbp['PCTIMESTRING'].apply(str_to_time)
    pbp['seconds_left_in_game_from_quarter'] = pbp['PERIOD'].apply(find_seconds_left)
    pbp['time_remaining'] = pbp['seconds'] + pbp['seconds_left_in_game_from_quarter']

    game = pbp[['GAME_ID', 'home_poss', 'diff', 'time_remaining', 'OT_ind']]
    
    return game

In [3]:
import requests
import io

def read_url_to_csv(url):
    r = requests.get(url)
    data = io.StringIO(r.text)
    df = pd.read_csv(data, sep=",")
    return df

elo_url = 'https://projects.fivethirtyeight.com/nba-model/nba_elo.csv'

latest_raptor_url = 'https://projects.fivethirtyeight.com/nba-model/2022/latest_RAPTOR_by_team.csv'

In [4]:
def load_all_game_log():
    response = game_log_db.scan()
    result = response['Items']

    while 'LastEvaluatedKey' in response:
        response = game_log_db.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
        result.extend(response['Items'])
    return pd.DataFrame(result)

def process_new_games(new_games):
    new_games[['Home', 'Away']] = new_games['MATCHUP'].str.split('vs.', expand=True)
    new_games.loc[:, 'Home'] = new_games['Home'].str.strip()
    new_games.loc[:, 'home_team_win'] = new_games['WL'].replace({'W':1, 'L':0})
    new_games = new_games.dropna(subset=['home_team_win'])
    return new_games

def preprocess_elo(elo_df):
    elo_df = elo_df[elo_df['date'] > '2012-01-01']
    elo_df.loc[:, 'elo_difference'] = np.abs(elo_df['elo1_pre'] - elo_df['elo2_pre'])
    elo_df = elo_df[['date', 'team1', 'elo1_pre', 'elo2_pre', 'elo_difference']]
    elo_df['team1'] = elo_df['team1'].replace({'BRK':'BKN', 'PHO':'PHX', 'CHO':'CHA',})
    return elo_df

In [5]:
def update_game_log():
    print('getting nba_game log')
    gl = load_all_game_log()
    lgl = leaguegamelog.LeagueGameLog().get_data_frames()[0].astype({'GAME_ID':int})

    games_to_update = set(lgl['GAME_ID']).difference(set(gl['GAME_ID']))
    
    if games_to_update:
        print(f'there are {len(games_to_update)} games to update')

        games_to_update_df = lgl[(lgl['GAME_ID'].isin(games_to_update))&\
            (~lgl['MATCHUP'].str.contains('@'))]
        
        games_to_update_df = process_new_games(games_to_update_df)
        print('finished preprocessing the game log')

        elo = read_url_to_csv(elo_url)
        elo = preprocess_elo(elo)
        print('preprocessed elo')

        df = games_to_update_df.merge(elo, 
                             left_on=['GAME_DATE', 'Home'], 
                             right_on = ['date', 'team1'])[['GAME_ID', 'elo1_pre', 'elo2_pre', 'home_team_win', 'MATCHUP']]
        
        print('starting to get play by play for remaining games')
        pbps = []
        for g_id in df['GAME_ID'].unique():
            pbps.append(playbyplayv2.PlayByPlayV2(f'00{g_id}').get_data_frames()[0])
            sleep(0.5)
        if len(pbps) == 0:
            return

        print('merge all play by plays into one game log')
        pbp_df = pd.concat(pbps).astype({'GAME_ID':int})

        print('create our features for each game')
        pbp_df_prepped = pbp_df.groupby('GAME_ID').apply(feature_engineer).reset_index(drop=True)
        pbp_df_prepped['PLAY_NUMBER'] = pbp_df_prepped.groupby('GAME_ID').cumcount()

        print('merge our game log with elo and feature engineered play by play data')
        final_df = pbp_df_prepped.merge(df, on=['GAME_ID'])

        wEloCols = ['home_poss', 'diff', 'time_remaining', 'OT_ind', 'elo1_pre', 'elo2_pre']
        wOEloCols = ['home_poss', 'diff', 'time_remaining', 'OT_ind']

        print('run model to get predictions')
        model = tf.keras.models.load_model('../app/src/Models/TF_model_w_elo.h5')
        model_wO_elo = tf.keras.models.load_model('../app/src/Models/TF_model_wO_elo.h5')

        final_df.loc[:, 'preds_w_elo'] = model.predict_on_batch(final_df[wEloCols])
        final_df.loc[:, 'preds_wO_elo'] = model_wO_elo.predict_on_batch(final_df[wOEloCols])

        return final_df, games_to_update_df
        print('Games Updated!')

    else:
        print('Games didnt Update')

In [6]:
final, updated_games = update_game_log()

getting nba_game log
there are 54 games to update


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_games[['Home', 'Away']] = new_games['MATCHUP'].str.split('vs.', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_games[['Home', 'Away']] = new_games['MATCHUP'].str.split('vs.', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_games.loc[:, 'Home'] = new_games['Home'

finished preprocessing the game log


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elo_df.loc[:, 'elo_difference'] = np.abs(elo_df['elo1_pre'] - elo_df['elo2_pre'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elo_df['team1'] = elo_df['team1'].replace({'BRK':'BKN', 'PHO':'PHX', 'CHO':'CHA',})


preprocessed elo
starting to get play by play for remaining games
merge all play by plays into one game log
create our features for each game
merge our game log with elo and feature engineered play by play data
run model to get predictions


2022-01-27 16:23:04.811261: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
final.head()

Unnamed: 0,GAME_ID,home_poss,diff,time_remaining,OT_ind,PLAY_NUMBER,elo1_pre,elo2_pre,home_team_win,MATCHUP,preds_w_elo,preds_wO_elo
0,22100453,0,0,2880,0,0,1535.339426,1538.655104,0,BKN vs. DEN,0.599573,0.562178
1,22100453,1,0,2880,0,1,1535.339426,1538.655104,0,BKN vs. DEN,0.597405,0.562178
2,22100453,1,2,2860,0,2,1535.339426,1538.655104,0,BKN vs. DEN,0.635728,0.578853
3,22100453,0,-1,2839,0,3,1535.339426,1538.655104,0,BKN vs. DEN,0.579229,0.562178
4,22100453,1,-1,2823,0,4,1535.339426,1538.655104,0,BKN vs. DEN,0.576801,0.562178


In [14]:
updated_games.head()

Unnamed: 0,SEASON_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_NAME,GAME_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,...,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,Home,Away,home_team_win
1262,22021,1610612749,MIL,Milwaukee Bucks,22100641,2022-01-15,MIL vs. TOR,L,240,26,...,8,8,13,23,96,-7,1,MIL,TOR,0
1264,22021,1610612764,WAS,Washington Wizards,22100642,2022-01-15,WAS vs. POR,L,240,39,...,4,3,20,26,110,-5,1,WAS,POR,0
1304,22021,1610612737,ATL,Atlanta Hawks,22100660,2022-01-17,ATL vs. MIL,W,240,38,...,7,8,13,18,121,7,1,ATL,MIL,1
1345,22021,1610612752,NYK,New York Knicks,22100681,2022-01-20,NYK vs. NOP,L,240,29,...,6,5,15,21,91,-11,1,NYK,NOP,0
1346,22021,1610612742,DAL,Dallas Mavericks,22100682,2022-01-20,DAL vs. PHX,L,240,37,...,1,3,19,22,101,-8,1,DAL,PHX,0


In [18]:
import json
from decimal import Decimal

In [21]:
def upload_data_to_dynamoDB(df, db, check_col):
    df_dict = df[~df[check_col].isna()].to_dict('records')
    df_json = [json.loads(json.dumps(item), parse_float=Decimal) for item in df_dict]
    with db.batch_writer() as batch:
        for i in range(len(df_json)):
            batch.put_item(Item = df_json[i])
            
    print('DONE!')

In [22]:
upload_data_to_dynamoDB(updated_games, game_log_db, 'home_team_win')

DONE!


In [30]:
historical_pbp_modelled_db = dynamoDB.Table('historical_pbp_modelled')

In [33]:
upload_data_to_dynamoDB(final, historical_pbp_modelled_db, 'home_team_win')

DONE!


In [34]:
len(final)

25251

In [36]:
3598846 + 25251

3624097

In [23]:
def load_all_game_log():
    response = game_log_db.scan()
    result = response['Items']

    while 'LastEvaluatedKey' in response:
        response = game_log_db.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
        result.extend(response['Items'])
    return pd.DataFrame(result)

In [24]:
l = load_all_game_log()

In [25]:
len(l)

7782

In [None]:
'https://aws.amazon.com/getting-started/projects/create-manage-nonrelational-database-dynamodb/4/'

In [None]:
dynamoDB_client = boto3.client('dynamodb', region_name = 'us-east-2')

resp = dynamoDB_client.update_table(
    TableName="game_log",
    # Any attributes used in your new global secondary index must be declared in AttributeDefinitions
    AttributeDefinitions=[
        {
            "AttributeName": "GAME_ID",
            "AttributeType": "N"
        },
        {
            "AttributeName": "GAME_DATE",
            "AttributeType": "S"
        }
    ],
    # This is where you add, update, or delete any global secondary indexes on your table.
    GlobalSecondaryIndexUpdates=[
        {
            "Create": {
                # You need to name your index and specifically refer to it when using it for queries.
                "IndexName": "GameIdIndex",
                # Like the table itself, you need to specify the key schema for an index.
                # For a global secondary index, you can use a simple or composite key schema.
                "KeySchema": [
                    {
                        "AttributeName": "GAME_ID",
                        "KeyType": "HASH"
                    },
                    {
                        "AttributeName": "GAME_DATE",
                        "KeyType": "RANGE"
                    }
                ],
                # You can choose to copy only specific attributes from the original item into the index.
                # You might want to copy only a few attributes to save space.
                "Projection": {
                    "ProjectionType": "ALL"
                }
            }
        }
    ],
)

from boto3.dynamodb.conditions import Key

game_log_db.query(IndexName="GameIdIndex",
                  KeyConditionExpression=Key('GAME_ID').eq(22100540))

