In game Win Probability Modelling:
============================================
Part 1 : Data Preprocessing, Feature Engineering
---------------------------------------------------

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from collections import deque
import codecs
import math

In [2]:
# Notebook display settings.
# NOTE: display(HTML(...)) injects raw HTML/CSS into the notebook page; comment that line
# out if raw HTML injection is a security concern.
# `display` and `HTML` are imported from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
display(HTML("<style>.container { width:80% !important; }</style>")) # comment for security issues
# Print numpy arrays with 4 digits of precision and show up to 50 DataFrame columns.
np.set_printoptions(precision=4)
pd.set_option('display.max_columns',50)

Data Preprocessing and Feature Engineering:
---------------------------------------------

Most of the features have to be extracted from the data points using regular expressions.
A detailed list of the final features and their descriptions is available in the file `feature_dictionary.txt`.  
The features are separated into four families:  

1) **In game features** : Features that are extracted from the current data record and the records that occurred before it  
2) **In season features** : Features that are extracted from previous games of the same regular season  
3) **Pre Season features (External Data)** : External data from the previous year's regular season performance. For 2016 games I used the 2014-2015 regular season Hollinger stats and for 2017 games I used the 2015-2016 regular season [Hollinger stats](http://www.espn.com/nba/hollinger/teamstats)  
4) **Player features**: [Hollinger stats for players](http://insider.espn.com/nba/hollinger/statistics) is used in this segment. 


In [3]:
# Reading data
data = pd.read_csv("pbp.csv")
# Number of distinct game ids; this is not the last expression of the cell,
# so the notebook does not display its value.
len(np.unique(data['game_id'].values))
# Last expression of the cell: displayed as (number of rows, number of columns).
data.shape

(294366, 18)

From the score field we extract `score_difference` and `score_diff_percentile`
and we convert the time to seconds as `remaining_time`

$$score\ difference = away\ score - home\ score $$
$$score\ difference\ percentile = \frac{away\ score  - home\ score}{away\ score + home\ score}$$

In [4]:
# Remaining time in seconds: play_clock is "MM:SS" -> 60 * MM + SS.
# Raw strings are used for the patterns so that "\d" is not an invalid string escape.
data_clean = data.copy()
clock = data_clean['play_clock'].str.extract(r"(\d+):(\d+)")
data_clean['remaining_time'] = clock[0].astype('int32')*60 + clock[1].astype('int32')

# Score difference between the teams.
# Rows without a score string inherit the most recent score seen above them; rows before
# the first score get '' (which, as before, cannot be parsed into a number below).
# This vectorised forward fill replaces the row-by-row Series.set_value loop
# (set_value no longer exists in pandas >= 1.0).
scores = data_clean['score']
has_score = scores.map(lambda value: type(value) is str)
data_clean['score'] = scores.where(has_score).ffill().fillna('')

# The score text is "<away> - <home>", so group 0 is the away score and group 1 the home score.
points = data_clean['score'].str.extract(r"(\d+) - (\d+)")
away_points = points[0].astype('int32')
home_points = points[1].astype('int32')
data_clean['score_difference'] = away_points - home_points
# (away - home) / (away + home); 0 - 0 yields NaN.
data_clean['score_diff_percentile'] = data_clean['score_difference'] / (away_points + home_points)
data_clean.head()

  from ipykernel import kernelapp as app


Unnamed: 0,sequence_id,game_id,period,play_clock,home_description,away_description,score,player1_id,player1_name,player1_team,player2_id,player2_name,player2_team,player3_id,player3_name,player3_team,event_type,event_description,remaining_time,score_difference,score_diff_percentile
0,0,21500001,4,12:00,,,82 - 66,0,,,0,,,0,,,Start Period,,720,16,0.108108
1,1,21500001,4,11:44,,Baynes Dunk (2 PTS) (Ilyasova 3 AST),84 - 66,203382,Aron Baynes,DET,101141,Ersan Ilyasova,DET,0,,,Made Shot,,704,18,0.12
2,2,21500001,4,11:29,Schroder 2' Driving Finger Roll Layup (16 PTS),,84 - 68,203471,Dennis Schroder,ATL,0,,,0,,,Made Shot,,689,16,0.105263
3,3,21500001,4,11:17,Schroder STEAL (2 STL),Blake Lost Ball Turnover (P3.T13),84 - 68,2581,Steve Blake,DET,203471,Dennis Schroder,ATL,0,,,Turnover,Lost Ball,677,16,0.105263
4,4,21500001,4,11:14,,Ilyasova S.FOUL (P3.T1) (Z.Zarba),84 - 68,101141,Ersan Ilyasova,DET,203471,Dennis Schroder,ATL,0,,,Foul,Shooting,674,16,0.105263


For each game we find the `home` and `away` teams from the home and away descriptions,  
and from the final score at the end of the game we set `home_result` to 0 or 1.  
This data is then propagated to all records of that particular game so that each record can act as a  
separate data point for modelling.

In [None]:
# Initializing fields
data_clean['home_result'] = pd.Series(dtype='int32')
data_clean['home'] = pd.Series(dtype='object')
data_clean['away'] = pd.Series(dtype='object')

# Calculating the home team, the away team and the home team result of every game.
# game_set maps game_id -> (home, away, home_score, away_score, home_result).
#
# A team is identified by finding a row whose player1 last name appears in exactly one of
# the two descriptions: if it is in home_description, player1_team is the home team,
# if it is in away_description, player1_team is the away team.
MANUAL_GAME_IDS = (21500586,)   # games that are skipped here and added to game_set by hand
game_set = dict()
home, away = '', ''
home_f, away_f = False, False
current_game = None

game_rows = zip(data_clean['game_id'], data_clean['player1_name'], data_clean['player1_team'],
                data_clean['home_description'], data_clean['away_description'],
                data_clean['remaining_time'], data_clean['score'])
for game_id, player, team, home_desc, away_desc, remaining, score_text in game_rows:
    if game_id != current_game:
        # New game: forget whatever was (partially) found for the previous one, so that a
        # team of one game can never be attributed to the next game.
        current_game = game_id
        home, away = '', ''
        home_f, away_f = False, False
    if game_id in game_set or game_id in MANUAL_GAME_IDS:
        continue

    if type(player) is str:
        last_name = player.split(' ')[-1]
        # The membership test must be done on this row's text: `x in series` would test
        # the index labels of the whole column, not the description.
        in_home = type(home_desc) is str and last_name in home_desc
        in_away = type(away_desc) is str and last_name in away_desc
        if in_home and in_away:
            # Ambiguous row (same last name on both sides): it cannot tell us which side
            # player1 plays for, so it is ignored and a later row decides.
            pass
        elif in_home and not home_f:
            home, home_f = team, True
        elif in_away and not away_f:
            away, away_f = team, True

    # Once both teams are known, the first row with no time remaining gives the result.
    if home_f and away_f and remaining == 0 and away != home:
        match = re.match(r'(\d+) - (\d+)', score_text)
        away_score, home_score = int(match.group(1)), int(match.group(2))
        home_result = 0 if away_score > home_score else 1
        print(game_id, home, away, home_score, away_score, home_result)
        home_f, away_f = False, False
        game_set[game_id] = (home, away, home_score, away_score, home_result)

In [6]:
# Game 21500586 is excluded from the automatic home/away detection: a player whose last
# name is 'Johnson' appears in both the home and the away description, which makes the
# two teams hard to tell apart. Its record is therefore entered by hand, in the same
# layout as the other entries: (home, away, home_score, away_score, home_result).
game_set.update({21500586: ('LAC', 'MIA', 104, 90, 1)})

In [7]:
# Propagating the per-game information to all the records of that game.
# game_set[game_id] is (home, away, home_score, away_score, home_result); a game id that is
# missing from game_set raises KeyError, exactly like a direct dictionary lookup.
# Series.map replaces the per-row Series.set_value calls (set_value no longer exists in
# pandas >= 1.0).
game_ids = data_clean['game_id']
data_clean['home'] = game_ids.map(lambda game_id: game_set[game_id][0])
data_clean['away'] = game_ids.map(lambda game_id: game_set[game_id][1])
data_clean['home_result'] = game_ids.map(lambda game_id: game_set[game_id][4])

In game features are created by looking at the records that appeared before it in the same game.
Features like `in_violations`, `in_rebound`, `in_turnover` are calculated as  

    in_feature = home_feature - away_feature  
    
For example, for violations we subtract the away team's violations so far from the home team's violations so far. This is better than having a separate feature for each team: it keeps the data dense and reduces the dimensionality as well.

    `in_shot_accuracy` is calculated as follows,
$$ in\ shot\ accuracy = \frac{home\ Made\ Shot}{(home\ Made\ Shot)+(home\ Missed\ Shot)} - \frac{away\ Made\ Shot}{(away\ Made\ Shot)+(away\ Missed\ Shot)}$$

In [8]:
# Creating fields for In game features.
# Each column is created from an empty Series, so every row starts out as missing.
in_game_columns = [
    'in_violations',
    'in_rebound',
    'in_hfouls',
    'in_hshoot_fouls',
    'in_afouls',
    'in_ashoot_fouls',
    'in_turnover',
    'in_shot_accuracy',
    'in_ft_accuracy',
    'in_momentum',
]
for column in in_game_columns:
    data_clean[column] = pd.Series(dtype='int32')

In [9]:
data_clean['event_type'] = data_clean['event_type'].str.strip()

# Running in-game features. For every record of a game (except the game's first record,
# which only resets the counters and keeps missing values) the features describe
# everything that happened in that game up to and including the record.

IN_GAME_FEATURES = ['in_violations', 'in_rebound', 'in_hfouls', 'in_hshoot_fouls', 'in_afouls',
                    'in_ashoot_fouls', 'in_turnover', 'in_shot_accuracy', 'in_ft_accuracy', 'in_momentum']
STAT_KEYS = ('made', 'missed', 'violation', 'rebound', 'turnover',
             'foul', 'shoot_foul', 'ft_made', 'ft_missed')


def _as_text(value):
    """Return `value` if it is a string, otherwise '' (missing text is stored as NaN)."""
    return value if isinstance(value, str) else ''


def _success_rate(made, missed):
    """Return made / (made + missed), or 0 when there has been no attempt yet."""
    attempts = made + missed
    return made / attempts if attempts else 0


# Results are collected in arrays and written to the frame once at the end
# (Series.set_value no longer exists in pandas >= 1.0).
in_game = {name: np.full(data_clean.shape[0], np.nan) for name in IN_GAME_FEATURES}

current_game = None
event_rows = zip(data_clean['game_id'], data_clean['event_type'], data_clean['event_description'],
                 data_clean['home_description'], data_clean['away_description'],
                 data_clean['score_difference'])
for i, (game_id, event, event_desc, home_desc, away_desc, score_diff) in enumerate(event_rows):
    if game_id != current_game:
        # First record of a game: reset all counters and the momentum window.
        home_stats = dict.fromkeys(STAT_KEYS, 0)
        away_stats = dict.fromkeys(STAT_KEYS, 0)
        momentum = deque([0]*5, maxlen=5)   # score differences of the last 5 records
        current_game = game_id
        continue

    home_desc, away_desc = _as_text(home_desc), _as_text(away_desc)
    home_lower, away_lower = home_desc.lower(), away_desc.lower()
    sides = ((home_stats, home_desc, home_lower), (away_stats, away_desc, away_lower))
    momentum.append(score_diff)

    if event == 'Made Shot':
        # Credit only the side that actually has a description for the shot. (Testing the
        # type of the description is not enough: missing text has already been replaced
        # by '', so both sides would be credited for every made shot.)
        for stats, desc, _ in sides:
            stats['made'] += bool(desc)
    elif event == 'Missed Shot':
        for stats, _, lower in sides:
            stats['missed'] += 'miss' in lower
    elif event == 'Violation':
        for stats, desc, _ in sides:
            stats['violation'] += 'Violation' in desc
    elif event == 'Rebound':
        for stats, _, lower in sides:
            stats['rebound'] += 'rebound' in lower
    elif event == 'Turnover':
        for stats, _, lower in sides:
            stats['turnover'] += 'turnover' in lower
    elif event == 'Foul':
        # event_description may be missing; treat that as "not a shooting foul".
        foul_key = 'shoot_foul' if 'shooting' in _as_text(event_desc).lower() else 'foul'
        for stats, _, lower in sides:
            stats[foul_key] += 'foul' in lower
    elif event == 'Free Throw':
        # A free throw belongs to one side only: the home description is checked first.
        for stats, _, lower in sides:
            if 'free throw' in lower:
                stats['ft_missed' if 'miss' in lower else 'ft_made'] += 1
                break

    in_game['in_ft_accuracy'][i] = (_success_rate(home_stats['ft_made'], home_stats['ft_missed'])
                                    - _success_rate(away_stats['ft_made'], away_stats['ft_missed']))
    in_game['in_shot_accuracy'][i] = (_success_rate(home_stats['made'], home_stats['missed'])
                                      - _success_rate(away_stats['made'], away_stats['missed']))

    # Momentum: relative change of the score difference over the 5-record window.
    # It is 0 when either end of the window is 0, when nothing changed, or when the two
    # ends cancel out (which would otherwise be a division by zero).
    oldest, newest = momentum[0], momentum[4]
    if oldest != 0 and newest != 0 and oldest != newest and (oldest + newest) != 0:
        in_game['in_momentum'][i] = (newest - oldest)/(newest + oldest)
    else:
        in_game['in_momentum'][i] = 0

    in_game['in_violations'][i] = home_stats['violation'] - away_stats['violation']
    in_game['in_rebound'][i] = home_stats['rebound'] - away_stats['rebound']
    in_game['in_turnover'][i] = home_stats['turnover'] - away_stats['turnover']
    in_game['in_ashoot_fouls'][i] = away_stats['shoot_foul']
    in_game['in_afouls'][i] = away_stats['foul']
    in_game['in_hshoot_fouls'][i] = home_stats['shoot_foul']
    in_game['in_hfouls'][i] = home_stats['foul']

for name in IN_GAME_FEATURES:
    data_clean[name] = in_game[name]




The data set has 1230 total games for 2016 season and 1150 total games for 2017 season

$$ nth\ game\ in\ season = \frac{Nth\ game\ of\ the\ season}{Total\ number\ of\ games\ in\ the\ season}$$

$$ Total\ win\ ratio = \frac{No\ of\ games\ won\ so\ far}{No\ of\ games\ played\ so\ far}$$

$$ Home\ Away\ win\ ratio = \frac{No\ of\ games\ won\ so\ far\ against\ away\ team}{No\ of\ games\ played\ so\ far\ against\ away\ team}$$

In [10]:
# Creating fields to store in-season features (all rows start out as missing)
in_season_columns = ['total_win_ratio', 'home_away_win_ratio', 'nth_game_season']
for column in in_season_columns:
    data_clean[column] = pd.Series(dtype='float32')

# Converting game_set into a dataframe for easier data manipulation:
# one row per game, the dictionary key (game id) becomes the first column.
game_set = pd.DataFrame.from_dict(game_set, orient='index').reset_index()
game_set.columns = ['game_id', 'home', 'away', 'home_score', 'away_Score', 'home_result']
for column in in_season_columns:
    game_set[column] = pd.Series(dtype='float32')
game_set.head()

Unnamed: 0,game_id,home,away,home_score,away_Score,home_result,total_win_ratio,home_away_win_ratio,nth_game_season
0,21500001,ATL,DET,94,106,0,,,
1,21500002,CHI,CLE,97,95,1,,,
2,21500003,GSW,NOP,111,95,1,,,
3,21500004,ORL,WAS,87,88,0,,,
4,21500005,BOS,PHI,112,95,1,,,


In [12]:
# Setting in-season features total_win_ratio, home_away_win_ratio and nth_game_season.
# All ratios use only the games that come BEFORE the current game in the same season,
# so the games are visited in ascending game_id order, whatever the row order of game_set
# is (a record appended to game_set by hand sits in the last row).
TOTAL_GAMES_PER_SEASON = 1230

overall_stats = dict()   # team -> (wins, games played) so far this season
vs_stats = dict()        # (team, opponent) -> (wins of team over opponent, games between them)
current_season = None

for i in game_set.sort_values('game_id').index:
    game_id = game_set.at[i, 'game_id']
    home = game_set.at[i, 'home']
    away = game_set.at[i, 'away']
    result = game_set.at[i, 'home_result']

    # Game ids starting with '215' are treated as season 2015, all others as 2016.
    # The running statistics start from scratch at each season change.
    season = 2015 if str(game_id).startswith('215') else 2016
    if season != current_season:
        overall_stats = dict()
        vs_stats = dict()
        current_season = season

    # Characters 5-8 of the game id are the number of the game within the season.
    season_game = int(re.match(r'\d{4}(\d{4})', str(game_id)).group(1))
    game_set.at[i, 'nth_game_season'] = season_game / TOTAL_GAMES_PER_SEASON

    # Win ratio of the home team over all its games so far (0 before its first game).
    hwin, htotal = overall_stats.get(home, (0, 0))
    awin, atotal = overall_stats.get(away, (0, 0))
    game_set.at[i, 'total_win_ratio'] = (hwin / htotal) if htotal else 0

    # Win ratio of the current home team against the current away team so far.
    # Wins are tracked per (team, opponent), so a win obtained while the roles were
    # reversed is credited to the team that won, not to "whoever was at home".
    hvwin, hvtotal = vs_stats.get((home, away), (0, 0))
    avwin, avtotal = vs_stats.get((away, home), (0, 0))
    game_set.at[i, 'home_away_win_ratio'] = (hvwin / hvtotal) if hvtotal else 0

    # Only now add the current game to the running statistics.
    home_won = 1 if result == 1 else 0
    overall_stats[home] = (hwin + home_won, htotal + 1)
    overall_stats[away] = (awin + (1 - home_won), atotal + 1)
    vs_stats[(home, away)] = (hvwin + home_won, hvtotal + 1)
    vs_stats[(away, home)] = (avwin + (1 - home_won), avtotal + 1)


In [13]:
# Copying the per-game in-season features from game_set to every record of the game in
# data_clean. A single lookup keyed by game_id replaces one boolean mask per game and per
# column, and avoids the `.ix` indexer (which no longer exists in pandas >= 1.0).
# Records whose game_id is not present in game_set get missing values.
per_game = game_set.set_index('game_id')
for column in ['total_win_ratio', 'home_away_win_ratio', 'nth_game_season']:
    data_clean[column] = data_clean['game_id'].map(per_game[column])

[Hollinger team stats](http://www.espn.com/nba/hollinger/teamstats) for the regular season is used as the external data.
All the pre season features have a 'pre' prefix.  

All the external features are calculated as follows,
        
        pre_PACE = PACE of home team - PACE of away team
The same goes for all other features in this segment as well.

External features and their description,

        PACE = the number of possessions a team uses per game.
        ORR = Offensive Rebound Rate
        DRR = Defensive Rebound Rate
        EFF FG% = Effective Field Goal Percentage
        OFF EFF = Offensive Efficiency, Number of points a team scores per 100 possession
        DEF EFF = Defensive Efficiency, Number of points a team allows per 100 possession
.

    Assist Ratio,
$$AST = \frac{(Assists * 100)}{[(FGA + (FTA * 0.44) + Assists + Turnovers)]}$$

    Turnover Ratio,
$$TO = \frac{(Turnovers * 100)}{[(FGA + (FTA * 0.44) + Assists + Turnovers)]}$$

    Rebound Rate,
$$REBR = \frac{Rebounds * Team\ Minutes}{Player\ Minutes * (Team\ Rebounds + Opponent\ Rebounds)}$$

    True Shooting Percentage,
$$TS = \frac{Total\ Points * 50}{[(FGA + (FTA * 0.44))]}$$

In [14]:
# Read in the external features
external = pd.read_csv('team_ratings.csv')

def external_ratings(home, away, year):
    """Return the home-minus-away difference of the external team ratings.

    For each team, the first row of `external` whose TEAM_ABBR and YEAR match is taken,
    and its values from the 4th column onwards are used.

    Parameters:
        home: TEAM_ABBR value of the home team.
        away: TEAM_ABBR value of the away team.
        year: YEAR value of the ratings to use.

    Returns:
        Array of differences (home - away), one entry per column from the 4th onwards.

    Raises:
        IndexError: if `external` has no row for one of the teams in that year.
    """
    def team_values(team):
        selected = (external['TEAM_ABBR'] == team) & (external['YEAR'] == year)
        return external[selected].values[0][3:]

    home_values = team_values(home)
    away_values = team_values(away)
    return home_values - away_values


In [15]:
# Creating columns for external features.
# "pre" means the stat was obtained before the game, from the last regular season.
# Each column is created from an empty Series, so every row starts out as missing.
pre_columns = [
    'pre_PACE',
    'pre_AST',
    'pre_TO',
    'pre_ORR',
    'pre_DRR',
    'pre_EFF_FG',
    'pre_TS',
    'pre_OFF_EFF',
    'pre_DEF_EFF',
]
for column in pre_columns:
    data_clean[column] = pd.Series(dtype='int32')

In [16]:
# Filling the pre-season (external) features: every record of a game gets the
# home-minus-away rating differences returned by external_ratings for that game.
# The differences are computed once per game (when the game id changes from one record to
# the next) and written to the frame in one go at the end
# (Series.set_value no longer exists in pandas >= 1.0).
PRE_COLUMNS = ['pre_PACE', 'pre_AST', 'pre_TO', 'pre_ORR', 'pre_DRR',
               'pre_EFF_FG', 'pre_TS', 'pre_OFF_EFF', 'pre_DEF_EFF']

game_after = 0
record = 0
pre_rows = []
for game_before, home_team, away_team in zip(data_clean['game_id'], data_clean['home'], data_clean['away']):
    if game_before != game_after:
        # Game ids of the form 215xxxxx use the ratings stored under YEAR 2015, the others 2016.
        year = 2015 if (re.match(r'215\d\d\d\d\d', str(game_before)) is not None) else 2016
        record = external_ratings(home=home_team, away=away_team, year=year)
        game_after = game_before
    # The first 9 entries of the record are used, in the order of PRE_COLUMNS.
    pre_rows.append(record[:len(PRE_COLUMNS)])

pre_values = np.array(pre_rows, dtype='float64').reshape(len(pre_rows), len(PRE_COLUMNS))
for k, column in enumerate(PRE_COLUMNS):
    data_clean[column] = pre_values[:, k]

The information of the three players is collected across all records and a table of unique players is formed.
Two external data files are read into `prating15` and `prating16`, which store the player ratings of seasons 2015 and 2016.

In [17]:
# Grouping all player information.
# For each of the three player slots of a record, count how often every
# (id, name, team) combination appears, then stack the three tables and keep one row per
# player id (the first one found, i.e. the player1 slot takes precedence).
# The columns are given their common names BEFORE stacking, so that the three tables line
# up and the duplicates can be dropped on an existing column; pd.concat is used because
# DataFrame.append no longer exists in pandas >= 2.0.
slot_tables = []
for k in range(1, 4):
    slot_columns = ['player{}_id'.format(k), 'player{}_name'.format(k), 'player{}_team'.format(k)]
    slot_table = data_clean.groupby(slot_columns).size().reset_index(name='APPEARED')
    slot_table.columns = ['ID', 'PLAYER', 'TEAM', 'APPEARED']
    slot_tables.append(slot_table)

player_details = pd.concat(slot_tables, ignore_index=True)
player_details = player_details.drop_duplicates(subset=['ID']).reset_index(drop=True)
player_details.head()


In [19]:
# Reading the external player rating files.
# Bytes that are not valid UTF-8 are dropped (errors='ignore') instead of raising.
with open("player_ratings_2015.csv", "r", encoding='utf-8', errors='ignore') as fdata:
    prating15 = pd.read_csv(fdata)

with open("player_ratings_2016.csv", "r", encoding='utf-8', errors='ignore') as fdata:
    prating16 = pd.read_csv(fdata)

# One merged table per season: the players seen in the play-by-play data joined, by name,
# with that season's ratings (outer join: players present on only one side are kept).
# Each season uses the rating file read above for it.
merged15 = player_details.merge(prating15, on='PLAYER', how='outer')
merged16 = player_details.merge(prating16, on='PLAYER', how='outer')

In [20]:
def get_rank(pname, year):
    """Return the rating rank (RK column) of a player for the given season.

    Parameters:
        pname: player name, matched exactly against the PLAYER column.
        year: 2015 to look the player up in merged15, 2016 to look him up in merged16.

    Returns:
        The RK value of the first matching row, or 0 when the name is not a string, the
        year is not 2015/2016, the player is not in the table, or his RK is missing.
    """
    if not isinstance(pname, str):
        # Missing names (NaN) can never match a player.
        return 0
    if year == 2015:
        ratings = merged15
    elif year == 2016:
        ratings = merged16
    else:
        return 0
    # Mask and lookup use the same table, so the selected row is the player's own row.
    ranks = ratings.loc[ratings.PLAYER == pname, 'RK']
    if ranks.empty:
        return 0
    rank = ranks.values[0]
    return 0 if math.isnan(rank) else rank

# Average rank of the players seen on each side; every row starts out as missing.
data_clean['home_rank'] = pd.Series(dtype='float32')
data_clean['away_rank'] = pd.Series(dtype='float32')

In [None]:
# Player features: for every record (except the first record of each game, which only
# resets the state and keeps missing values), the average rank of the rated players
# currently tracked for the home team and for the away team.
#   * On a substitution, player2 is added to his team's set and player1 is removed from it.
#   * On any other event, every rated player mentioned in the record is added to the set
#     of his team (players whose team is not the home team count as away).
# Results are collected in arrays and written to the frame once at the end
# (Series.set_value no longer exists in pandas >= 1.0).
home_ranks = np.full(data_clean.shape[0], np.nan)
away_ranks = np.full(data_clean.shape[0], np.nan)
rank_cache = dict()   # (player name, year) -> rank, so each player is looked up only once


def _cached_rank(pname, year):
    """Return get_rank(pname, year), memoised; names that are not strings rank 0."""
    if not isinstance(pname, str):
        return 0
    key = (pname, year)
    if key not in rank_cache:
        rank_cache[key] = get_rank(pname, year)
    return rank_cache[key]


rank_columns = ['game_id', 'home', 'away', 'event_type',
                'player1_name', 'player1_team', 'player2_name', 'player2_team',
                'player3_name', 'player3_team']
current_game = None
for i, row in enumerate(zip(*(data_clean[column] for column in rank_columns))):
    game_id, row_home, row_away, event, p1_name, p1_team, p2_name, p2_team, p3_name, p3_team = row
    if game_id != current_game:
        print("Games loaded...{}".format(game_id), end='\r')
        current_game = game_id
        home_team, away_team = row_home, row_away
        # Game ids of the form 215xxxxx use the 2015 ranks, the others the 2016 ranks.
        year = 2015 if (re.match(r'215\d\d\d\d\d', str(game_id)) is not None) else 2016
        home_dict, away_dict = dict(), dict()   # player name -> rank, per side
        continue

    if event == 'Substitution':
        playing, bench, sub_team = p2_name, p1_name, p1_team
        dic = home_dict if sub_team == home_team else away_dict
        playing_rank = _cached_rank(playing, year)
        if playing_rank != 0:
            dic[playing] = playing_rank
        # Only rated players are ever stored, so removing by name is sufficient.
        dic.pop(bench, None)
    else:
        for pname, pteam in ((p1_name, p1_team), (p2_name, p2_team), (p3_name, p3_team)):
            dic = home_dict if pteam == home_team else away_dict
            if pname not in dic:
                rank = _cached_rank(pname, year)
                if rank != 0:
                    dic[pname] = rank

    # Average rank per side; an empty side averages to 0.
    home_ranks[i] = sum(home_dict.values())/max(len(home_dict), 1)
    away_ranks[i] = sum(away_dict.values())/max(len(away_dict), 1)

data_clean['home_rank'] = home_ranks
data_clean['away_rank'] = away_ranks

In [None]:
# Modelling frame: data_clean without the raw play-by-play text, clock, player and team
# columns listed below.
raw_columns = [
    'sequence_id', 'period', 'play_clock',
    'home_description', 'away_description', 'score',
    'player1_id', 'player1_name', 'player1_team',
    'player2_id', 'player2_name', 'player2_team',
    'player3_id', 'player3_name', 'player3_team',
    'event_type', 'home', 'away', 'event_description',
]
data_model = data_clean.drop(columns=raw_columns)
data_model.head()

In [None]:
# Infinite values are first turned into missing values, then every missing value
# becomes 0 in the modelling frame.
data_model = data_model.replace([np.inf, -np.inf], np.nan).fillna(value=0)
# Displayed as the cell output: the column labels of data_clean.
data_clean.columns

In [None]:
# Summary statistics (DataFrame.describe) of the modelling frame, shown as the cell output.
data_model.describe()

In [42]:
# Saving the file
# data_clean is written to "data_clean.csv" with to_csv's default options.
# NOTE(review): it is data_clean that is saved, not data_model, so the inf/NaN replacement
# applied to data_model is not part of the saved file; the data_model export below is
# commented out.
filename = "data_clean.csv"
data_clean.to_csv(filename)
# filename = "data_model.csv"
# data_model.to_csv(filename)
    