In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# pandas options
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)

**Idea**: NBA game data.  See if certain officals have an impact on home team winning or not.

In [78]:
# nba officials data
officials_nba = pd.read_csv('data/nba_data/officials.csv')

# nba game data
games_nba = pd.read_csv('data/nba_data/game.csv')

In [79]:
# games 2010-2020
games_nba = games_nba[games_nba.game_date >= '2009-10-01']
games_nba = games_nba[games_nba.game_date < '2021-04-01']
# encode wl_home to 1 if home team won, 0 if home team lost
games_nba['wl_home'] = games_nba['wl_home'].apply(lambda x: 1 if x == 'W' else 0)

In [80]:
nbateams = ['Atlanta Hawks',
 'Boston Celtics',
 'Brooklyn Nets',
 'Charlotte Bobcats',
 'Chicago Bulls',
 'Cleveland Cavaliers',
 'Dallas Mavericks',
 'Denver Nuggets',
 'Detroit Pistons',
 'Golden State Warriors',
 'Houston Rockets',
 'Indiana Pacers',
 'Los Angeles Clippers',
 'Los Angeles Lakers',
 'Memphis Grizzlies',
 'Miami Heat',
 'Milwaukee Bucks',
 'Minnesota Timberwolves',
 'New Jersey Nets',
 'New Orleans Hornets',
 'New Orleans Pelicans',
 'New York Knicks',
 'Oklahoma City Thunder',
 'Orlando Magic',
 'Philadelphia 76ers',
 'Phoenix Suns',
 'Portland Trail Blazers',
 'Sacramento Kings',
 'San Antonio Spurs',
 'Toronto Raptors',
 'Utah Jazz',
 'Washington Wizards']

In [81]:
# keep only teams in the list
games_nba = games_nba[games_nba.team_name_home.isin(nbateams)]

In [82]:
# merge officials and games
games_officials_nba = officials_nba.merge(games_nba, left_on='game_id', right_on='game_id', how='inner')
games_officials_nba = games_officials_nba[games_officials_nba.season_type == 'Regular Season']

In [92]:
# get the number of games each official has officiated
officials_games = games_officials_nba.groupby('official_id').game_id.nunique().reset_index()
officials_games.columns = ['official_id', 'games']
officals_100_games_min = officials_games[officials_games.games > 100].official_id.values

# keep only officials with more than 100 games
games_officials_nba = games_officials_nba[games_officials_nba.official_id.isin(officals_100_games_min)]

In [93]:
# for each offical_id, create a new dataframe with all the games they officiated
# store in a dict with the official_id as the key
officials_games_dict = {}
for official_id in games_officials_nba.official_id.unique():
    officials_games_dict[official_id] = games_officials_nba[games_officials_nba.official_id == official_id]

In [94]:
# for offical_id 1179, get the ratio of home team wins
official_id = 1179
officials_games_dict[official_id].wl_home.mean()

0.6259842519685039

In [95]:
# we now create a dataframe with the home team win ratio for each official
officials_win_ratio = pd.DataFrame(columns=['official_id', 'win_ratio'])
for official_id in games_officials_nba.official_id.unique():
    win_ratio = officials_games_dict[official_id].wl_home.mean()
    new_row = pd.DataFrame({'official_id': [official_id], 'win_ratio': [win_ratio]})
    officials_win_ratio = pd.concat([officials_win_ratio, new_row], ignore_index=True)

In [98]:
# get the average win ratio for all games
games_officials_nba.wl_home.mean()

0.5838284034769557

In [113]:
# create a new dataframe that can be used to analyze team favoritism
team_favoritism = pd.DataFrame(columns=['official_id', 'team', 'win_ratio'])
for official_id in games_officials_nba.official_id.unique():
    for team in games_officials_nba.team_name_home.unique():
        win_ratio = games_officials_nba[(games_officials_nba.official_id == official_id) & (games_officials_nba.team_name_home == team)].wl_home.mean()
        number_of_games = games_officials_nba[(games_officials_nba.official_id == official_id) & (games_officials_nba.team_name_home == team)].shape[0]
        home_team_win_ratio = games_officials_nba[(games_officials_nba.team_name_home == team)].wl_home.mean()
        new_row = pd.DataFrame({'official_id': [official_id], 'team': [team], 'win_ratio': [win_ratio], 'number_of_games': [number_of_games], 'home_team_win_ratio': [home_team_win_ratio]})
        team_favoritism = pd.concat([team_favoritism, new_row], ignore_index=True)
team_favoritism = team_favoritism[team_favoritism.number_of_games > 10]

In [114]:
team_favoritism[team_favoritism.official_id == 1179]

Unnamed: 0,official_id,team,win_ratio,number_of_games,home_team_win_ratio
0,1179,Portland Trail Blazers,0.8,25.0,0.696237
1,1179,Cleveland Cavaliers,0.642857,14.0,0.566728
2,1179,Los Angeles Lakers,0.653846,26.0,0.531362
3,1179,Dallas Mavericks,0.533333,15.0,0.587922
4,1179,Atlanta Hawks,0.545455,22.0,0.591689
5,1179,Miami Heat,0.5,16.0,0.654265
6,1179,Golden State Warriors,0.681818,22.0,0.679396
7,1179,Denver Nuggets,0.708333,24.0,0.654955
8,1179,Los Angeles Clippers,0.846154,13.0,0.679775
9,1179,San Antonio Spurs,0.909091,11.0,0.767401
