# How to Calculate Plus Minus (+/-) using NBA Play-by-Play Data

### Import Packages

In [1]:
import re
import numpy as np
import pandas as pd
# Show floats with two decimals. The fully-qualified key is required: on
# pandas >= 1.4 the bare 'precision' pattern also matches
# 'styler.format.precision' and set_option raises OptionError.
pd.set_option('display.precision', 2)

### Import Data

In [2]:
# File names of the two play-by-play extracts dated 2017-10-17
bos_cle = '[2017-10-17]-0021700001-BOS@CLE.csv'
hou_gsw = '[2017-10-17]-0021700002-HOU@GSW.csv'

# Load the HOU@GSW game (bos_cle is kept so the other game can be swapped in)
pbp_path = '../data/2017-18_pbp/{}'.format(hou_gsw)
PbP_df = pd.read_csv(pbp_path)
PbP_df.head()

Unnamed: 0,game_id,data_set,date,a1,a2,a3,a4,a5,h1,h2,...,reason,result,steal,type,shot_distance,original_x,original_y,converted_x,converted_y,description
0,21700002,2017-2018 Regular Season,10/17/2017,Trevor Ariza,Ryan Anderson,Clint Capela,James Harden,Chris Paul,Kevin Durant,Draymond Green,...,,,,start of period,,,,,,
1,21700002,2017-2018 Regular Season,10/17/2017,Trevor Ariza,Ryan Anderson,Clint Capela,James Harden,Chris Paul,Kevin Durant,Draymond Green,...,,,,jump ball,,,,,,Jump Ball Pachulia vs. Capela: Tip to Anderson
2,21700002,2017-2018 Regular Season,10/17/2017,Trevor Ariza,Ryan Anderson,Clint Capela,James Harden,Chris Paul,Kevin Durant,Draymond Green,...,,made,,unknown,2.0,-10.0,16.0,26.0,6.6,Harden 2' Driving Layup (2 PTS)
3,21700002,2017-2018 Regular Season,10/17/2017,Trevor Ariza,Ryan Anderson,Clint Capela,James Harden,Chris Paul,Kevin Durant,Draymond Green,...,,missed,,Jump Shot,25.0,9.0,254.0,25.9,63.6,MISS Green 25' 3PT Jump Shot
4,21700002,2017-2018 Regular Season,10/17/2017,Trevor Ariza,Ryan Anderson,Clint Capela,James Harden,Chris Paul,Kevin Durant,Draymond Green,...,,,,rebound defensive,,,,,,Capela REBOUND (Off:0 Def:1)


In [3]:
# Dimensions of the play-by-play frame as (rows, columns)
PbP_df.shape

(457, 44)

In [4]:
# Per-column dtypes and non-null counts, to see which columns contain missing values
PbP_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457 entries, 0 to 456
Data columns (total 44 columns):
game_id           457 non-null int64
data_set          457 non-null object
date              457 non-null object
a1                457 non-null object
a2                457 non-null object
a3                457 non-null object
a4                457 non-null object
a5                457 non-null object
h1                457 non-null object
h2                457 non-null object
h3                457 non-null object
h4                457 non-null object
h5                457 non-null object
period            457 non-null int64
away_score        457 non-null int64
home_score        457 non-null int64
remaining_time    457 non-null object
elapsed           457 non-null object
play_length       457 non-null object
play_id           457 non-null int64
team              422 non-null object
event_type        457 non-null object
assist            62 non-null object
away              1 non-nul

### Data Cleaning

##### Fill the Null Values

In [5]:
# +/- is computed from the points column alone, so that is the only column
# whose missing values are replaced (with 0)
PbP_df['points'] = PbP_df['points'].fillna(0)

##### Create the Dummy Player Columns

The players on court are stored in the columns ['a1', 'a2', 'a3', 'a4', 'a5', 'h1', 'h2', 'h3', 'h4', 'h5']. This makes it difficult to tell which players are active, since a given player's name can appear in any one of the 5 slots for his team. Instead, we will create one dummy variable per player from these columns, marking the player as on court with a '1' or on the bench with a '0'.

In [6]:
# Create dummy variables for the ten on-court slot columns. The regex is
# anchored so that only columns named exactly a1-a5 / h1-h5 are selected.
lineup_slots = PbP_df.filter(regex=r'^[ah][1-5]$')
player_dummy_df = pd.get_dummies(lineup_slots, prefix='player')

# Remove the whitespace in the dummy player columns
player_dummy_df.columns = [col.strip().replace(' ', '_') for col in player_dummy_df.columns]

# The same player yields one dummy column per slot he occupied, so several
# columns now share a name. Collapse them by summing. groupby(axis=1) is
# deprecated (pandas >= 2.1), so group the transposed frame by label instead.
player_dummy_df = player_dummy_df.T.groupby(level=0).sum().T

# Bring the dummy columns into the main dataframe. Any player_* columns left
# over from a previous run of this cell are dropped first, so re-running the
# cell does not append duplicate columns.
PbP_df = pd.concat(
    [PbP_df.drop(columns=player_dummy_df.columns, errors='ignore'), player_dummy_df],
    axis=1,
)

##### Example

In [7]:
# Preview the frame with the player_* dummy columns appended on the right
PbP_df.head()

Unnamed: 0,game_id,data_set,date,a1,a2,a3,a4,a5,h1,h2,...,player_Luc_Mbah_a_Moute,player_Nick_Young,player_Omri_Casspi,player_PJ_Tucker,player_Patrick_McCaw,player_Ryan_Anderson,player_Shaun_Livingston,player_Stephen_Curry,player_Trevor_Ariza,player_Zaza_Pachulia
0,21700002,2017-2018 Regular Season,10/17/2017,Trevor Ariza,Ryan Anderson,Clint Capela,James Harden,Chris Paul,Kevin Durant,Draymond Green,...,0,0,0,0,0,1,0,1,1,1
1,21700002,2017-2018 Regular Season,10/17/2017,Trevor Ariza,Ryan Anderson,Clint Capela,James Harden,Chris Paul,Kevin Durant,Draymond Green,...,0,0,0,0,0,1,0,1,1,1
2,21700002,2017-2018 Regular Season,10/17/2017,Trevor Ariza,Ryan Anderson,Clint Capela,James Harden,Chris Paul,Kevin Durant,Draymond Green,...,0,0,0,0,0,1,0,1,1,1
3,21700002,2017-2018 Regular Season,10/17/2017,Trevor Ariza,Ryan Anderson,Clint Capela,James Harden,Chris Paul,Kevin Durant,Draymond Green,...,0,0,0,0,0,1,0,1,1,1
4,21700002,2017-2018 Regular Season,10/17/2017,Trevor Ariza,Ryan Anderson,Clint Capela,James Harden,Chris Paul,Kevin Durant,Draymond Green,...,0,0,0,0,0,1,0,1,1,1


In [8]:
# Get list of players straight from the dummy frame's column labels.
# Matching PbP_df columns against r'player_.+_.+' required at least two name
# tokens, so a single-name player (column 'player_<Name>') was silently left out.
player_list = list(player_dummy_df.columns)
player_list

['player_Chris_Paul',
 'player_Clint_Capela',
 'player_David_West',
 'player_Draymond_Green',
 'player_Eric_Gordon',
 'player_James_Harden',
 'player_Jordan_Bell',
 'player_Kevin_Durant',
 'player_Kevon_Looney',
 'player_Klay_Thompson',
 'player_Luc_Mbah_a_Moute',
 'player_Nick_Young',
 'player_Omri_Casspi',
 'player_PJ_Tucker',
 'player_Patrick_McCaw',
 'player_Ryan_Anderson',
 'player_Shaun_Livingston',
 'player_Stephen_Curry',
 'player_Trevor_Ariza',
 'player_Zaza_Pachulia']

### Calculate Plus Minus (+/-)

**How to:** For a given player, count up points scored by the player’s team and points scored against the player’s team when that player is on the floor. Then simply subtract points against from points for. 

#### Step 1: Initialize a Dataframe with Player Information

In [9]:
# Create the main +/- dataframe: one row per dummy column, holding the
# readable player name (the merge key) next to the dummy-column name
player_df = pd.DataFrame({
    'player': [name.replace('player_','').replace('_',' ') for name in player_list],
    'player_unformatted': list(player_list),
})

# Look up each player's team from the play-by-play rows in which he is the
# recorded 'player', then keep one row per distinct combination
team_lookup = PbP_df[['team', 'player']]
player_df = (
    player_df
    .merge(team_lookup, on='player')
    .drop_duplicates()
    .reset_index(drop=True)
)
player_df.head()

Unnamed: 0,player,player_unformatted,team
0,Chris Paul,player_Chris_Paul,HOU
1,Clint Capela,player_Clint_Capela,HOU
2,David West,player_David_West,GSW
3,Draymond Green,player_Draymond_Green,GSW
4,Eric Gordon,player_Eric_Gordon,HOU


#### Step 2: Create a For Loop to store the points the player's team and the opposing team scored while the player is on the floor

In [10]:
# Parallel lists consumed by the loop below: each player's dummy-column name
# and, at the same position, the team he belongs to
p_list = player_df['player_unformatted'].tolist()
t_list = player_df['team'].tolist()

In [11]:
# For every player, collect the points recorded on plays while he was on the floor:
#   plus[k]  -> points on rows whose 'team' equals the player's team
#   minus[k] -> points on every other row (including rows with no team recorded)
# Boolean masks replace a per-player iterrows() pass over the whole frame;
# the resulting lists hold the same values in the same row order.
plus = []
minus = []

for player, team in zip(p_list, t_list):
    on_court = PbP_df[player] == 1
    own_team = PbP_df['team'] == team  # NaN team compares False, so such rows fall under "minus"

    plus.append(PbP_df.loc[on_court & own_team, 'points'].tolist())
    minus.append(PbP_df.loc[on_court & ~own_team, 'points'].tolist())

##### Example

In [12]:
# plus and minus are lists of lists, one inner list per player:
#   plus[0]  -> points the first player's team scored while he was on the court
#   minus[0] -> points the opposing team scored while he was on the court
# The two lists are printed with a blank line between them.
print(plus[0], minus[0], sep='\n\n')

[0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 3.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 2.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 2.0, 3.0, 0.0, 3.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 3.0, 0.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 0.0, 0.0, 2.0]

[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 3.0, 2.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 3.0, 0.0, 0.0, 2.0, 2.0, 0.0, 3.0, 0.0, 1.0, 1.0, 3.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 2.0, 1.0, 2.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.

#### Step 3: Sum the plus and minus points for each player into separate lists

In [13]:
# Collapse each player's list of per-play points into a single total.
# NOTE: p_list previously held the dummy-column names; from here on it holds
# the "plus" totals (m_list holds the "minus" totals), in the same player order.
p_list = [np.sum(points_for) for points_for in plus]
m_list = [np.sum(points_against) for points_against in minus]

##### Example

In [14]:
# Totals for the first player: points for (plus) and points against (minus),
# printed with a blank line between them
print(p_list[0], m_list[0], sep='\n\n')

81.0

94.0


#### Step 4: Create the Final Dataframe

In [15]:
# Attach the per-player totals, derive +/- as plus minus minus, then keep only
# the presentation columns, ordered by team and player name
player_df = (
    player_df
    .assign(plus=p_list, minus=m_list)
    .assign(**{'+/-': lambda frame: frame['plus'] - frame['minus']})
    .drop(['plus', 'minus', 'player_unformatted'], axis=1)
    .sort_values(by=['team', 'player'], ascending=True)
    .reset_index(drop=True)
)
player_df

Unnamed: 0,player,team,+/-
0,David West,GSW,0.0
1,Draymond Green,GSW,7.0
2,Jordan Bell,GSW,-3.0
3,Kevin Durant,GSW,11.0
4,Kevon Looney,GSW,-7.0
5,Klay Thompson,GSW,1.0
6,Nick Young,GSW,-8.0
7,Omri Casspi,GSW,-4.0
8,Patrick McCaw,GSW,-9.0
9,Shaun Livingston,GSW,-7.0
