In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import pymc3 as pm
from IPython.display import HTML
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
from IPython.core.pylabtools import figsize
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

$$P(\text{model} \mid \text{data}) = \frac{P(\text{data} \mid \text{model})}{P(\text{data})}\, P(\text{model})$$

prior = P(model)

The prior is our belief in the model given no additional information. 

likelihood = P(data | model)

The likelihood is the probability of the data we observed occurring given the model.

marginal probability of data = P(data)

The marginal probability of the data is the probability that our data are observed regardless of what model we choose or believe in.

posterior = P(model | data)

The posterior is our updated belief in the model given the new data we have observed. Bayesian statistics is all about updating a prior belief about the world with new data: we transform our prior belief into a new posterior belief.


In [None]:
def occurrences(year, kd=True, data_dir='./data/ab'):
    '''Return one season's game outcomes as a list of 1 (win) / 0 (not a win).

    Reads the comma-separated game log ``{data_dir}/{year}.txt`` (header row
    plus exactly 30 columns) and keeps only the games selected by ``kd``.

    Parameters
    ----------
    year : int or str
        Season identifier; used to build the file name ``{year}.txt``.
    kd : bool, default True
        True  -> keep games whose GS entry is neither 'Inactive' nor
                 'Did Not Dress' (i.e. with KD healthy).
        False -> keep only games whose GS entry is 'Inactive' or
                 'Did Not Dress'.
    data_dir : str or path-like, default './data/ab'
        Directory that holds the per-season game-log files.

    Returns
    -------
    list of int
        One entry per kept game, in file order: 1 when the text before the
        first space of the Result field is exactly 'W', otherwise 0.

    Raises
    ------
    ValueError
        Raised by pandas when the file does not have exactly 30 columns.

    Example
    -------
    occurrences(2019, kd=True)
    '''
    game_log = pd.read_csv(f'{data_dir}/{year}.txt', sep=',')

    # The raw header is replaced wholesale, by position, with these names.
    game_log.columns = [
        'Rk', 'G', 'Date', 'Age', 'Tm', 'Away', 'Opp', 'Result', 'GS',
        'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB',
        'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc', '+/-',
    ]

    # 'Did Not Dress' is treated exactly like 'Inactive'. A boolean mask is
    # used so the loaded frame itself is never modified.
    sat_out = game_log['GS'].isin(['Inactive', 'Did Not Dress'])
    keep = ~sat_out if kd else sat_out

    # Result looks like 'W (+10)'; only the token before the first space matters.
    return [
        1 if result.split(' ')[0] == 'W' else 0
        for result in game_log.loc[keep, 'Result']
    ]

In [None]:
# Pool three regular seasons (newest first), split by whether Kevin Durant played.
seasons = (2019, 2018, 2017)
regular_season_with_kd = [
    outcome for season in seasons for outcome in occurrences(season, kd=True)
]
regular_season_no_kd = [
    outcome for season in seasons for outcome in occurrences(season, kd=False)
]

# The mean of the 1/0 outcomes is the observed fraction of games won.
win_rate_with_kd = round(np.mean(regular_season_with_kd), 4)
win_rate_no_kd = round(np.mean(regular_season_no_kd), 4)
print(f'Observed win % when Kevin Durant plays: {win_rate_with_kd}')
print(f'Observed win % when Kevin Durant does not play: {win_rate_no_kd}')

In [None]:
# Observed 1/0 game outcomes for the two conditions being compared:
# A = games Kevin Durant played, B = games he did not play.
observations_A = regular_season_with_kd
observations_B = regular_season_no_kd

# Sampler settings. Fixing the seed makes a fresh Restart & Run All
# reproduce the same trace (and therefore the same summary numbers).
N_DRAWS = 20000
N_BURN_IN = 1000
RANDOM_SEED = 42

with pm.Model() as model:
    # Uniform priors on [0.5, 0.9] for both win probabilities.
    p_A = pm.Uniform("p_A", 0.5, 0.9)
    p_B = pm.Uniform("p_B", 0.5, 0.9)

    # delta is our unknown of interest. It is deterministic: it carries no
    # uncertainty beyond that of p_A and p_B.
    delta = pm.Deterministic("delta", p_A - p_B)

    # Likelihood: every observed game is modelled as a Bernoulli trial with
    # success probability p_A (dataset A) or p_B (dataset B).
    obs_A = pm.Bernoulli("obs_A", p_A, observed=observations_A)
    obs_B = pm.Bernoulli("obs_B", p_B, observed=observations_B)

    # Draw samples from the posterior, then drop the first N_BURN_IN draws.
    trace = pm.sample(N_DRAWS, random_seed=RANDOM_SEED)
    burned_trace = trace[N_BURN_IN:]

In [None]:
# Posterior summary restricted to the mean, standard deviation and the two
# HPD bounds, rounded to two decimals and rendered as a styled HTML table.
summary_columns = ['mean', 'sd', 'hpd_2.5', 'hpd_97.5']
df = pm.summary(burned_trace).round(2).loc[:, summary_columns]
HTML(df.to_html(classes="table table-responsive table-striped table-bordered"))

In [None]:
# Unlike a frequentist confidence interval, a credible interval is a direct
# probability statement about the parameter.
# There is a 95% probability that the true win rate with KD is in the interval (0.68, 0.79).
# There is a 95% probability that the true win rate with no KD is in the interval (0.59, 0.85).
# NOTE(review): the two intervals above are hard-coded in this comment;
# re-check them against the summary table whenever the model is re-sampled.

# Posterior draws of delta = p_A - p_B, taken from the burned-in trace.
# This name was previously used without ever being defined in the notebook.
delta_samples = burned_trace["delta"]

# The fraction of posterior draws below (above) 0 estimates the posterior
# probability that delta is negative (positive).
print("Probability that GSW is worse with Kevin Durant in the regular season: %.2f"
      % np.mean(delta_samples < 0))

print("Probability that GSW is better with Kevin Durant in the regular season: %.2f"
      % np.mean(delta_samples > 0))