# Preprocessing
This notebook walks through converting the raw XY SportVU tracking data into a small set of concrete numbers.
We go through each game's tracking data and, for every shot, extract the features described below.

## Sifting Through Tracking Data

We want to find the following features for each shot taken.

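- `final_shot_defense_distance`: smoothed shooter-to-defender distance at the last frame of the shot window
- `start_shot_defense_distance`: smoothed shooter-to-defender distance at the first frame of the shot window
- `final_shot_distance`: distance from the shooter to the hoop at the last frame of the shot window
- `shooter_id`: player id of the shooter
- `defender_id`: player id of the defender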

We then save these features (`trainXAll.csv`) and the make/miss labels (`trainYAll.csv`) to CSV files for later use.

In [1]:
from tracking.tracking import get_shot_time_from_range, find_player_with_ball
from tracking.plotting import get_plot_from_range
from tracking.utils import get_total_time

import pandas as pd
import numpy as np
import json
%matplotlib inline
import matplotlib.pyplot as plt
import math
from tqdm.auto import tqdm
import os

from scipy.signal import savgol_filter

In [2]:
# Input locations. The defaults are the original author's local paths; set the
# NBA_TRACKING_DIRECTORY / NBA_PBP_DIRECTORY environment variables to point the
# notebook at your own copy of the data without editing this cell.
# os.path.join(path, "") guarantees a trailing separator, because later cells
# build file paths by plain string concatenation (DIRECTORY + filename).
TRACKING_DIRECTORY = os.path.join(
    os.environ.get("NBA_TRACKING_DIRECTORY", "/home/avyayv/data/nba/rawtrackingfiles/data/csv/"), "")
PBP_DIRECTORY = os.path.join(
    os.environ.get("NBA_PBP_DIRECTORY", "/home/avyayv/data/nba/playbyplay/py_ball/"), "")

In [3]:
def get_distance_to_hoop(shooter_x, shooter_y, left_hoop_x=5, right_hoop_x=85, midcourt_x=45, hoop_y=25):
    """
    Input: shooter_x, shooter_y (shooter position in court coordinates)
           left_hoop_x, right_hoop_x (x coordinate of each hoop, optional)
           midcourt_x (x value that splits the two halves of the court, optional)
           hoop_y (y coordinate shared by both hoops, optional)
    Returns: Distance from shooter to hoop in feet

    The shooter is assumed to be shooting at the hoop on their own side of
    midcourt_x: strictly greater than midcourt_x selects the right hoop,
    anything else (including exactly midcourt_x) selects the left hoop.

    NOTE(review): the default geometry (hoops at x=5 and x=85, midcourt at
    x=45) corresponds to a 90 ft long court. A regulation NBA court is 94 ft
    long with the rim centre roughly 5.25 ft from each baseline, so if the
    tracking x coordinate spans 0-94 the right-hand distances are off by a few
    feet. Confirm against the tracking data and pass corrected values
    (e.g. left_hoop_x=5.25, right_hoop_x=88.75, midcourt_x=47) if so. The
    defaults are kept so that previously generated features are reproducible.
    """
    hoop_x = right_hoop_x if shooter_x > midcourt_x else left_hoop_x
    squared_distance = ((shooter_x-hoop_x)**2)+((shooter_y-hoop_y)**2)
    return math.sqrt(squared_distance)

In [4]:
def get_train_example(game, shooter_id, defender_id, start, end):
    """
    Input: game(game tracking dataframe), shooter_id, defender_id, start(start timestamp), end(end timestamp)
    Returns: Individual array to be fed in as training sample:
             [final_shot_defense_distance, start_shot_defense_distance, final_shot_distance],
             or [None] when the sample is rejected as a tracking artifact.
    Raises: ValueError when the shooter and defender tracks in the window have
            different lengths or are too short to be smoothed.

    Only rows with end < total_time < start are used (so start must be the
    larger value). Rows are ordered by descending total_time, which means the
    "start" distance is taken at the largest total_time in the window and the
    "final" distances at the smallest of the frames kept.
    """
    MAX_FRAMES = 23             # frames kept per player from the window
    SMOOTHING_WINDOW = 11       # Savitzky-Golay window length (frames)
    SMOOTHING_POLYORDER = 3     # Savitzky-Golay polynomial order
    MAX_DISTANCE_INCREASE = 12  # reject if the defender ends up this much farther away

    window = game.loc[(game['total_time'] < start) & (game['total_time'] > end)]
    window = window.sort_values("total_time", ascending=False)

    def _track(player_id):
        # One row per timestamp for this player, capped at MAX_FRAMES frames.
        rows = window.loc[window['player_id'] == player_id].drop_duplicates("total_time").iloc[0:MAX_FRAMES]
        return np.asarray(rows['x_loc'], dtype=float), np.asarray(rows['y_loc'], dtype=float)

    shooter_x, shooter_y = _track(shooter_id)
    defender_x, defender_y = _track(defender_id)

    # The two tracks are paired frame-by-frame by position, so they must have
    # the same length; the smoothing filter additionally needs a full window.
    if len(shooter_x) != len(defender_x):
        raise ValueError("shooter and defender have a different number of frames in the window "
                         "({} vs {})".format(len(shooter_x), len(defender_x)))
    if len(shooter_x) < SMOOTHING_WINDOW:
        raise ValueError("need at least {} frames to smooth, got {}".format(SMOOTHING_WINDOW, len(shooter_x)))

    defense_distance = np.sqrt((shooter_x-defender_x)**2 + (shooter_y-defender_y)**2)
    smoothed_distance = savgol_filter(defense_distance, SMOOTHING_WINDOW, SMOOTHING_POLYORDER)

    final_shot_defense_distance = smoothed_distance[-1]
    start_shot_defense_distance = smoothed_distance[0]
    final_shot_distance = get_distance_to_hoop(shooter_x[-1], shooter_y[-1])

    # ensuring data isn't an artifact of sorts
    # NOTE(review): the check is one-sided - only an increase in distance is
    # rejected, a large decrease is accepted. Kept as in the original; confirm
    # whether abs() was intended.
    if final_shot_defense_distance-start_shot_defense_distance < MAX_DISTANCE_INCREASE:
        return [final_shot_defense_distance, start_shot_defense_distance, final_shot_distance]
    else:
        return [None]

In [8]:
def _collect_shot_examples(game, shot_times, label, X, y):
    """Append one training row to X (and `label` to y) for every shot time that can be resolved in `game`."""
    for shot_time in shot_times:
        try:
            # Search +/- 5 around the play-by-play timestamp for the shot moment.
            time = get_shot_time_from_range(game, shot_time+5, shot_time-5)
            shooter, defender = find_player_with_ball(game, time)
            example = get_train_example(game, shooter['player_id'], defender['player_id'], time, time-1)
        except Exception:
            # Best effort: a shot that cannot be located or measured is skipped.
            # (Exception, not a bare except, so Ctrl-C still stops the run.)
            continue
        if example[0] is not None:  # [None] marks a sample rejected as an artifact
            example.extend([shooter['player_id'], defender['player_id']])
            X.append(example)
            y.append(label)


def get_all_train():
    """
    Returns: X and y, the raw data that can be fed into the XGBoost Model

    X is a list of rows
    [final_shot_defense_distance, start_shot_defense_distance, final_shot_distance, shooter_id, defender_id]
    and y is the matching list of labels (1 = made shot, 0 = missed shot).

    Every file in TRACKING_DIRECTORY is paired with the play-by-play JSON of
    the same base name in PBP_DIRECTORY. Within a game, made shots
    (EVENTMSGTYPE == 1) are added before missed shots (EVENTMSGTYPE == 2).
    Games that cannot be loaded are skipped with a warning; individual shots
    that cannot be processed are skipped silently.
    """
    import warnings

    X = []
    y = []
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for filename in tqdm(sorted(os.listdir(TRACKING_DIRECTORY))):
        try:
            game = pd.read_csv(os.path.join(TRACKING_DIRECTORY, filename))
            game_json = filename.split(".")[0]+'.json'
            with open(os.path.join(PBP_DIRECTORY, game_json)) as pbp_file:
                pbp = pd.DataFrame(json.load(pbp_file)['PlayByPlay'])
            misses = pbp.loc[pbp['EVENTMSGTYPE'] == 2].copy()
            makes = pbp.loc[pbp['EVENTMSGTYPE'] == 1].copy()
            misses['total_time'] = np.vectorize(get_total_time)(misses['PERIOD'], misses['PCTIMESTRING'], pbp=True)
            makes['total_time'] = np.vectorize(get_total_time)(makes['PERIOD'], makes['PCTIMESTRING'], pbp=True)
            game['total_time'] = np.vectorize(get_total_time)(game['quarter'], game['game_clock'])
        except Exception as error:
            # Keep going with the remaining games, but do not hide the problem.
            warnings.warn("skipping {}: {!r}".format(filename, error))
            continue

        _collect_shot_examples(game, makes['total_time'], 1, X, y)
        _collect_shot_examples(game, misses['total_time'], 0, X, y)

    return X, y

In [None]:
X, y = get_all_train() # build the feature rows (X) and make/miss labels (y) from every game file; nothing is written to disk here

HBox(children=(IntProgress(value=0, max=632), HTML(value='')))

In [None]:
# Persist the training data. No column names are supplied, so the CSV headers
# are the integer positions. For X the column order is:
# final_shot_defense_distance, start_shot_defense_distance, final_shot_distance,
# shooter_id, defender_id. y holds a single column: 1 = made shot, 0 = missed shot.
pd.DataFrame(X).to_csv("trainXAll.csv", index=False)
pd.DataFrame(y).to_csv("trainYAll.csv", index=False)

## Getting Bio and percentage stats
We would also like to feed height differential and 3PT% into the model, so we download player bio data and season summary stats from stats.nba.com.

In [None]:
"""
Send request to NBA API so that it seems like this is a browser request from stats.nba.com
"""
# This cell only defines the HTTP headers; they are passed to the
# requests.get(...) calls against stats.nba.com in the next cell.
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8,ru;q=0.6',
    # Desktop Chrome user agent, so the request looks like it comes from a browser.
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    # Referring page on stats.nba.com itself.
    'Referer': 'https://stats.nba.com/teams/boxscores-traditional/',
    # NOTE(review): presumably the custom headers the stats.nba.com front end sends with its own API calls - verify.
    'x-nba-stats-origin': 'stats',
    'x-nba-stats-token': 'true'
}

In [None]:
import requests

# Seconds to wait for stats.nba.com before giving up; without a timeout
# requests.get can block forever if the server never answers.
NBA_STATS_TIMEOUT_SECONDS = 30


def _result_set_to_frame(response):
    """Turn a stats.nba.com response into a DataFrame built from its first result set.

    Raises requests.HTTPError on a 4xx/5xx answer instead of failing later
    while decoding an error page as JSON. The JSON body is parsed only once.
    """
    response.raise_for_status()
    result_set = response.json()['resultSets'][0]
    return pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])


# Player bio stats for the 2015-16 regular season.
bio_request = requests.get("https://stats.nba.com/stats/leaguedashplayerbiostats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&Season=2015-16&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=", headers=headers, timeout=NBA_STATS_TIMEOUT_SECONDS)
to_csv_bio = _result_set_to_frame(bio_request)
to_csv_bio.to_csv("bio_data.csv")

# Per-game base player stats for the 2015-16 regular season.
summary_request = requests.get("https://stats.nba.com/stats/leaguedashplayerstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&GameSegment=&Height=&LastNGames=0&LeagueID=00&Location=&MeasureType=Base&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PaceAdjust=N&PerMode=PerGame&Period=0&PlayerExperience=&PlayerPosition=&PlusMinus=N&Rank=N&Season=2015-16&SeasonSegment=&SeasonType=Regular+Season&ShotClockRange=&StarterBench=&TeamID=0&TwoWay=0&VsConference=&VsDivision=&Weight=", headers=headers, timeout=NBA_STATS_TIMEOUT_SECONDS)
to_csv_summary = _result_set_to_frame(summary_request)
to_csv_summary.to_csv("summary_data.csv")