In [None]:
# %load ./imports.py
# %load /Users/bartev/dev/github-bv/sporty/notebooks/imports.py

## Where am I
!echo $VIRTUAL_ENV

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# magics
%load_ext blackcellmagic
# start cell with `%%black` to format using `black`

%load_ext autoreload
# start cell with `%autoreload` to reload module
# https://ipython.org/ipython-doc/stable/config/extensions/autoreload.html

# reload all modules when running
%autoreload 2

In [29]:
# imports

import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import seaborn as sns

from importlib import reload
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# https://plotnine.readthedocs.io/en/stable/

import plotnine as p9
from plotnine import ggplot, aes, facet_wrap

from src.utils import lower_case_col_names, drop_suffix
import src.data.load_data as ld
from src.data.load_data import get_nba_game_team_points, load_nba, load_nba_games_dataset

In [100]:
# Load the games dataset through the project helper imported from
# `src.data.load_data`, then preview the first five rows.
nba_games = load_nba_games_dataset()
nba_games.head()



Unnamed: 0,game_date_est,game_id,season,home_team_wins,team_id,pts,fg_pct,ft_pct,fg3_pct,ast,reb,ha,wl,nickname,city,fgm,fga,fg3m,fg3a,ftm,fta
811,2018-09-28,11800002,2018,1,1610612738,97.0,0.346,0.727,0.191,12.0,56.0,away,L,Celtics,Boston,36.0,104.0,9.0,47.0,16.0,22.0
1671,2018-09-28,11800002,2018,1,1610612766,104.0,0.382,0.641,0.306,17.0,65.0,home,W,Hornets,Charlotte,34.0,89.0,11.0,36.0,25.0,39.0
164,2018-09-29,11800003,2018,1,1610612761,122.0,0.415,0.824,0.343,15.0,45.0,home,W,Raptors,Toronto,34.0,82.0,12.0,35.0,42.0,51.0
422,2018-09-29,11800003,2018,1,1610612757,104.0,0.42,0.72,0.387,25.0,46.0,away,L,Trail Blazers,Portland,37.0,88.0,12.0,31.0,18.0,25.0
55,2018-09-29,11800005,2018,0,1610612744,110.0,0.473,0.769,0.353,33.0,48.0,home,L,Warriors,Golden State,44.0,93.0,12.0,34.0,10.0,13.0


# Explore the dataset

## Qualitative vs Quantitative Data

* `object`: qualitative (categorical) variable
* `int64`: quantitative and discrete (64-bit integer, ranging from $-2^{63}$ to $2^{63} - 1$)
* `float64`: quantitative and continuous (64-bit floating-point representation of real numbers)

In [81]:
# List each column's dtype to tell qualitative (object) columns apart from
# quantitative (int64 / float64) ones.
nba_games.dtypes

game_date_est      object
game_id             int64
season              int64
home_team_wins      int64
team_id             int64
pts               float64
fg_pct            float64
ft_pct            float64
fg3_pct           float64
ast               float64
reb               float64
ha                 object
wl                 object
nickname           object
city               object
fgm               float64
fga               float64
fg3m              float64
fg3a              float64
ftm               float64
fta               float64
dtype: object

## Convert a categorical variable to a dummy variable

`pd.get_dummies` creates one indicator (0/1) column per category of the encoded column (here `wl_L` and `wl_W`) and drops the original column (`wl`) from the result.

In [84]:
# Preview the frame before encoding; the `wl` column holds the 'W' / 'L' labels.
nba_games.head()

Unnamed: 0,game_date_est,game_id,season,home_team_wins,team_id,pts,fg_pct,ft_pct,fg3_pct,ast,reb,ha,wl,nickname,city,fgm,fga,fg3m,fg3a,ftm,fta
811,2018-09-28,11800002,2018,1,1610612738,97.0,0.346,0.727,0.191,12.0,56.0,away,L,Celtics,Boston,36.0,104.0,9.0,47.0,16.0,22.0
1671,2018-09-28,11800002,2018,1,1610612766,104.0,0.382,0.641,0.306,17.0,65.0,home,W,Hornets,Charlotte,34.0,89.0,11.0,36.0,25.0,39.0
164,2018-09-29,11800003,2018,1,1610612761,122.0,0.415,0.824,0.343,15.0,45.0,home,W,Raptors,Toronto,34.0,82.0,12.0,35.0,42.0,51.0
422,2018-09-29,11800003,2018,1,1610612757,104.0,0.42,0.72,0.387,25.0,46.0,away,L,Trail Blazers,Portland,37.0,88.0,12.0,31.0,18.0,25.0
55,2018-09-29,11800005,2018,0,1610612744,110.0,0.473,0.769,0.353,33.0,48.0,home,L,Warriors,Golden State,44.0,93.0,12.0,34.0,10.0,13.0


In [88]:
# One-hot encode the win/loss label: `wl` is replaced by the indicator
# columns `wl_L` and `wl_W`; all other columns are passed through unchanged.
# The indicator dtype is pinned so the columns stay 0/1 integers regardless of
# the installed pandas version (pandas >= 2.0 would otherwise default to bool).
dummy = pd.get_dummies(nba_games, columns=['wl'], dtype='uint8')
dummy.columns

Index(['game_date_est', 'game_id', 'season', 'home_team_wins', 'team_id', 'pts', 'fg_pct', 'ft_pct', 'fg3_pct', 'ast', 'reb', 'ha', 'nickname', 'city', 'fgm', 'fga', 'fg3m', 'fg3a', 'ftm', 'fta', 'wl_L', 'wl_W'], dtype='object')

### 3 ways to merge `wl_W`

In [91]:
# Way 1: column-bind the `wl_W` indicator onto the original frame
# (aligned on the index), rename it to `win`, and preview the result.
(
    pd.concat([nba_games, dummy['wl_W']], axis=1)
    .rename(columns={'wl_W': 'win'})
    .head()
)

Unnamed: 0,game_date_est,game_id,season,home_team_wins,team_id,pts,fg_pct,ft_pct,fg3_pct,ast,reb,ha,wl,nickname,city,fgm,fga,fg3m,fg3a,ftm,fta,win
811,2018-09-28,11800002,2018,1,1610612738,97.0,0.346,0.727,0.191,12.0,56.0,away,L,Celtics,Boston,36.0,104.0,9.0,47.0,16.0,22.0,0
1671,2018-09-28,11800002,2018,1,1610612766,104.0,0.382,0.641,0.306,17.0,65.0,home,W,Hornets,Charlotte,34.0,89.0,11.0,36.0,25.0,39.0,1
164,2018-09-29,11800003,2018,1,1610612761,122.0,0.415,0.824,0.343,15.0,45.0,home,W,Raptors,Toronto,34.0,82.0,12.0,35.0,42.0,51.0,1
422,2018-09-29,11800003,2018,1,1610612757,104.0,0.42,0.72,0.387,25.0,46.0,away,L,Trail Blazers,Portland,37.0,88.0,12.0,31.0,18.0,25.0,0
55,2018-09-29,11800005,2018,0,1610612744,110.0,0.473,0.769,0.353,33.0,48.0,home,L,Warriors,Golden State,44.0,93.0,12.0,34.0,10.0,13.0,0


In [93]:
# Way 2: the same column-bind written as a `.pipe` step, so it can sit
# inside a longer method chain. The indicator keeps its `wl_W` name here.
nba_games.pipe(
    lambda games: pd.concat([games, dummy['wl_W']], axis=1)
)

Unnamed: 0,game_date_est,game_id,season,home_team_wins,team_id,pts,fg_pct,ft_pct,fg3_pct,ast,reb,ha,wl,nickname,city,fgm,fga,fg3m,fg3a,ftm,fta,wl_W
811,2018-09-28,11800002,2018,1,1610612738,97.000,0.346,0.727,0.191,12.000,56.000,away,L,Celtics,Boston,36.000,104.000,9.000,47.000,16.000,22.000,0
1671,2018-09-28,11800002,2018,1,1610612766,104.000,0.382,0.641,0.306,17.000,65.000,home,W,Hornets,Charlotte,34.000,89.000,11.000,36.000,25.000,39.000,1
164,2018-09-29,11800003,2018,1,1610612761,122.000,0.415,0.824,0.343,15.000,45.000,home,W,Raptors,Toronto,34.000,82.000,12.000,35.000,42.000,51.000,1
422,2018-09-29,11800003,2018,1,1610612757,104.000,0.420,0.720,0.387,25.000,46.000,away,L,Trail Blazers,Portland,37.000,88.000,12.000,31.000,18.000,25.000,0
55,2018-09-29,11800005,2018,0,1610612744,110.000,0.473,0.769,0.353,33.000,48.000,home,L,Warriors,Golden State,44.000,93.000,12.000,34.000,10.000,13.000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,2019-06-07,41800404,2018,0,1610612761,105.000,0.419,0.958,0.313,22.000,39.000,away,W,Raptors,Toronto,36.000,86.000,10.000,32.000,23.000,24.000,1
56,2019-06-10,41800405,2018,0,1610612744,106.000,0.463,0.714,0.476,27.000,37.000,away,W,Warriors,Golden State,38.000,82.000,20.000,42.000,10.000,14.000,1
109,2019-06-10,41800405,2018,0,1610612761,105.000,0.447,0.778,0.250,19.000,43.000,home,L,Raptors,Toronto,38.000,85.000,8.000,32.000,21.000,27.000,0
0,2019-06-13,41800406,2018,0,1610612744,110.000,0.488,0.700,0.355,28.000,42.000,home,L,Warriors,Golden State,39.000,80.000,11.000,31.000,21.000,30.000,0


In [101]:
# Way 3: merge the original frame with the dummy frame. With no `on=`
# argument, `merge` joins on every column the two frames share and returns a
# fresh 0..n-1 index. Afterwards drop the redundant `wl_L` indicator and
# rename `wl_W` to `win`.
# NOTE(review): rows that are identical across all shared columns would be
# multiplied by this join — confirm the rows are unique.
nba_games_w = (
    nba_games
    .merge(dummy)
    .drop(columns='wl_L')
    .rename(columns={'wl_W': 'win'})
)
nba_games_w

Unnamed: 0,game_date_est,game_id,season,home_team_wins,team_id,pts,fg_pct,ft_pct,fg3_pct,ast,reb,ha,wl,nickname,city,fgm,fga,fg3m,fg3a,ftm,fta,win
0,2018-09-28,11800002,2018,1,1610612738,97.000,0.346,0.727,0.191,12.000,56.000,away,L,Celtics,Boston,36.000,104.000,9.000,47.000,16.000,22.000,0
1,2018-09-28,11800002,2018,1,1610612766,104.000,0.382,0.641,0.306,17.000,65.000,home,W,Hornets,Charlotte,34.000,89.000,11.000,36.000,25.000,39.000,1
2,2018-09-29,11800003,2018,1,1610612761,122.000,0.415,0.824,0.343,15.000,45.000,home,W,Raptors,Toronto,34.000,82.000,12.000,35.000,42.000,51.000,1
3,2018-09-29,11800003,2018,1,1610612757,104.000,0.420,0.720,0.387,25.000,46.000,away,L,Trail Blazers,Portland,37.000,88.000,12.000,31.000,18.000,25.000,0
4,2018-09-29,11800005,2018,0,1610612744,110.000,0.473,0.769,0.353,33.000,48.000,home,L,Warriors,Golden State,44.000,93.000,12.000,34.000,10.000,13.000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2751,2019-06-07,41800404,2018,0,1610612761,105.000,0.419,0.958,0.313,22.000,39.000,away,W,Raptors,Toronto,36.000,86.000,10.000,32.000,23.000,24.000,1
2752,2019-06-10,41800405,2018,0,1610612744,106.000,0.463,0.714,0.476,27.000,37.000,away,W,Warriors,Golden State,38.000,82.000,20.000,42.000,10.000,14.000,1
2753,2019-06-10,41800405,2018,0,1610612761,105.000,0.447,0.778,0.250,19.000,43.000,home,L,Raptors,Toronto,38.000,85.000,8.000,32.000,21.000,27.000,0
2754,2019-06-13,41800406,2018,0,1610612744,110.000,0.488,0.700,0.355,28.000,42.000,home,L,Warriors,Golden State,39.000,80.000,11.000,31.000,21.000,30.000,0


In [102]:
# Check how the game-date column is currently stored.
nba_games['game_date_est'].dtype

dtype('O')

Currently, the dates are stored as `object` (plain strings), so pandas treats them as arbitrary text rather than points in time: no date-aware ordering or date arithmetic is available.

Use `pd.to_datetime()` to convert the column to a proper date variable (`datetime64[ns]`).

In [104]:
import datetime

# Replace the string column `game_date_est` with a parsed datetime column
# `game_date` (appended as the last column), then confirm the new dtypes.
nba_games_d = nba_games_w.drop(columns='game_date_est').assign(
    game_date=pd.to_datetime(nba_games_w['game_date_est'])
)
nba_games_d.dtypes

game_id                    int64
season                     int64
home_team_wins             int64
team_id                    int64
pts                      float64
fg_pct                   float64
ft_pct                   float64
fg3_pct                  float64
ast                      float64
reb                      float64
ha                        object
wl                        object
nickname                  object
city                      object
fgm                      float64
fga                      float64
fg3m                     float64
fg3a                     float64
ftm                      float64
fta                      float64
win                        uint8
game_date         datetime64[ns]
dtype: object

In [105]:
# Preview the converted frame: `game_date` now takes the place of `game_date_est`.
nba_games_d.head()

Unnamed: 0,game_id,season,home_team_wins,team_id,pts,fg_pct,ft_pct,fg3_pct,ast,reb,ha,wl,nickname,city,fgm,fga,fg3m,fg3a,ftm,fta,win,game_date
0,11800002,2018,1,1610612738,97.0,0.346,0.727,0.191,12.0,56.0,away,L,Celtics,Boston,36.0,104.0,9.0,47.0,16.0,22.0,0,2018-09-28
1,11800002,2018,1,1610612766,104.0,0.382,0.641,0.306,17.0,65.0,home,W,Hornets,Charlotte,34.0,89.0,11.0,36.0,25.0,39.0,1,2018-09-28
2,11800003,2018,1,1610612761,122.0,0.415,0.824,0.343,15.0,45.0,home,W,Raptors,Toronto,34.0,82.0,12.0,35.0,42.0,51.0,1,2018-09-29
3,11800003,2018,1,1610612757,104.0,0.42,0.72,0.387,25.0,46.0,away,L,Trail Blazers,Portland,37.0,88.0,12.0,31.0,18.0,25.0,0,2018-09-29
4,11800005,2018,0,1610612744,110.0,0.473,0.769,0.353,33.0,48.0,home,L,Warriors,Golden State,44.0,93.0,12.0,34.0,10.0,13.0,0,2018-09-29
