<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Real-Plus/Minus" data-toc-modified-id="Real-Plus/Minus-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Real Plus/Minus</a></span></li></ul></div>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import zscore

from regression_helper import multiple_regression_big, multiple_regression_big_with_penalty

## Real Plus/Minus

In [None]:
# Read the binary stint table from disk and preview its first rows.
stints_path = 'nba_stints_2015_binary.csv.gz'
stints = pd.read_csv(stints_path)
stints.head(n=5)

In [None]:
# Regressors are every column from position 2 onward; the player columns are
# the same list minus its first entry (i.e. columns from position 3 onward).
ind_vars = stints.columns[2:].tolist()
players = ind_vars[1:]

# presumably a possession-weighted regression of net rating on the indicator
# columns -- confirm against regression_helper.multiple_regression_big
apm = multiple_regression_big('net_rtg', ind_vars, stints, weights='net_poss')
apm.head(n=10)

In [None]:
# Distribution of the fitted APM coefficients (trailing ';' hides the Axes repr).
apm.plot(kind='hist', bins=50);

In [None]:
# Text report: the home-court coefficient followed by the 20 largest and
# 20 smallest player coefficients.
apm_HCA = apm['HCA']
apm_rule = "=" * 40
apm_players = apm[players]

print("Home Court Advantage for Net Rating: {:.2f}".format(apm_HCA))
print()
print("Top 20 by APM\n" + apm_rule)
print(apm_players.sort_values(ascending=False).iloc[:20].to_string())
print()
print("Bottom 20 by APM\n" + apm_rule)
print(apm_players.sort_values(ascending=True).iloc[:20].to_string())

In [None]:
# Same regression as above but through the penalised helper.
# presumably a ridge-style penalty -- confirm against
# regression_helper.multiple_regression_big_with_penalty
rapm_penalty = 1000.
rapm = multiple_regression_big_with_penalty(
    'net_rtg',
    ind_vars,
    stints,
    weights='net_poss',
    penalty=rapm_penalty,
)
rapm.head(n=10)

In [None]:
# Histogram of the penalised coefficients, then the same text report as for
# APM: home-court coefficient plus the 20 largest / smallest player values.
rapm.plot(kind='hist', bins=50)

rapm_HCA = rapm['HCA']
rapm_rule = "=" * 40
rapm_players = rapm[players]

print("Home Court Advantage for Net Rating: {:.2f}".format(rapm_HCA))
print()
print("Top 20 by RAPM\n" + rapm_rule)
print(rapm_players.sort_values(ascending=False).iloc[:20].to_string())
print()
print("Bottom 20 by RAPM\n" + rapm_rule)
print(rapm_players.sort_values(ascending=True).iloc[:20].to_string())

In [None]:
# Preview the stint table again before the collinearity analysis.
stints.head(n=5)

In [None]:
# Player columns are everything from column position 3 onward.
players = stints.columns[3:].tolist()

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
import tqdm
from sklearn.metrics import recall_score, precision_score

In [None]:
from scipy.sparse import csc_matrix

def _sdf_to_csc(sdf, dtype=np.float64):
    cols, rows, datas = [], [], []
    for col, name in enumerate(sdf):
        s = sdf[name]
        row = s.sp_index.to_int_index().indices
        cols.append(np.repeat(col, len(row)))
        rows.append(row)
        datas.append(s.sp_values.astype(dtype, copy=False))

    cols = np.concatenate(cols)
    rows = np.concatenate(rows)
    datas = np.concatenate(datas)
    return csc_matrix((datas, (rows, cols)), shape=sdf.shape)


In [None]:
from scipy.sparse import hstack

In [None]:
# Collinearity check: for each player column, fit the remaining columns
# against that player's (absolute-valued) on-court indicator and record how
# well the indicator can be reconstructed.
c = 1e6  # large C => the logistic model is fit with very little regularisation
model_logit = LogisticRegression(C=c, fit_intercept=True)
model = LinearRegression()
w = stints['net_poss']

# Build the sparse matrix directly from the dense values: DataFrame.to_sparse
# no longer exists in pandas >= 1.0, and csc_matrix drops the zeros itself.
tmp_stints = stints.abs()
X_full = csc_matrix(tmp_stints.values.astype(np.float64))
scores = []
for i in tqdm.tqdm_notebook(range(len(players))):
    player_i = players[i]
    j = i + 3  # players start at column position 3 of `stints`
    y = np.asarray(X_full[:, j].todense()).ravel()
    # NOTE(review): X keeps the first three (non-player) columns as
    # predictors -- confirm that is intended.
    X = hstack([X_full[:, :j], X_full[:, j+1:]])
    model.fit(X, y, sample_weight=w)
    model_logit.fit(X, y, sample_weight=w)

    # Weighted R^2. The total sum of squares must be centred on the *weighted*
    # mean of y to be consistent with the weighted fit; the common 1/sum(w)
    # factor cancels in the ratio and is therefore omitted.
    ss_res = np.sum(w * (y - model.predict(X))**2)
    y_bar = np.average(y, weights=w)
    ss_tot = np.sum(w * (y - y_bar)**2)
    r_sq = 1 - ss_res / ss_tot if ss_tot > 0 else np.nan

    y_pred = model_logit.predict(X)
    recall = recall_score(y, y_pred, sample_weight=w)
    prec = precision_score(y, y_pred, sample_weight=w)
    player_poss = (y * w).sum()
    scores.append({'player': player_i, 'r_sq': r_sq, 'recall': recall, 'prec': prec, 'poss': player_poss})

In [None]:
# Turn the list of per-player score dicts into a table keyed by player name.
score_records = pd.DataFrame(scores)
scores = score_records.set_index('player')

In [None]:
# Lowest precision first.
scores.sort_values(by='prec')

In [None]:
# Permutation baseline: independently shuffle the player columns within every
# row, so each row keeps its values but they land on random players.
c = 1e6
model_logit = LogisticRegression(C=c, fit_intercept=True)
model = LinearRegression()
w = stints['net_poss']

# Work on an explicit copy: `.values` may hand back a view of the frame's
# data, in which case the in-place shuffle below would scramble `stints`
# itself and make this cell non-repeatable.
tmp_stints = stints.values.copy()
nrows = stints.shape[0]
# Fixed seed so a fresh Restart & Run All reproduces the same permutation.
shuffle_rng = np.random.RandomState(0)
for row in range(nrows):
    # The slice is a view into tmp_stints, so the shuffle happens in place.
    shuffle_rng.shuffle(tmp_stints[row, 3:])
    

In [None]:
# Repeat the per-player collinearity scoring on the row-shuffled array.
# Build the sparse matrix directly from the dense values: DataFrame.to_sparse
# no longer exists in pandas >= 1.0, and csc_matrix drops the zeros itself.
tmp_stints = pd.DataFrame(tmp_stints, columns=stints.columns).abs()
X_full = csc_matrix(tmp_stints.values.astype(np.float64))
scores = []
for i in tqdm.tqdm_notebook(range(len(players))):
    player_i = players[i]
    j = i + 3  # players start at column position 3 of `stints`
    y = np.asarray(X_full[:, j].todense()).ravel()
    # NOTE(review): X keeps the first three (non-player) columns as
    # predictors -- confirm that is intended.
    X = hstack([X_full[:, :j], X_full[:, j+1:]])
    model.fit(X, y, sample_weight=w)
    model_logit.fit(X, y, sample_weight=w)

    # Weighted R^2. The total sum of squares must be centred on the *weighted*
    # mean of y to be consistent with the weighted fit; the common 1/sum(w)
    # factor cancels in the ratio and is therefore omitted.
    ss_res = np.sum(w * (y - model.predict(X))**2)
    y_bar = np.average(y, weights=w)
    ss_tot = np.sum(w * (y - y_bar)**2)
    r_sq = 1 - ss_res / ss_tot if ss_tot > 0 else np.nan

    y_pred = model_logit.predict(X)
    recall = recall_score(y, y_pred, sample_weight=w)
    prec = precision_score(y, y_pred, sample_weight=w)
    player_poss = (y * w).sum()
    scores.append({'player': player_i, 'r_sq': r_sq, 'recall': recall, 'prec': prec, 'poss': player_poss})

In [None]:
# Tabulate the scores by player and list them from lowest precision upward.
score_records = pd.DataFrame(scores)
scores = score_records.set_index('player')
scores.sort_values(by='prec')

In [None]:
# Read the full stint table from disk and preview its first rows.
full_stints_path = 'nba_stints_2015_full.csv.gz'
df = pd.read_csv(full_stints_path)
df.head(n=5)

In [None]:
# Row masks: does the away / home unit string mention Chris Paul?
cp3_away_mask, cp3_home_mask = (
    df[unit_col].str.contains("Chris Paul")
    for unit_col in ('away_unit', 'home_unit')
)

In [None]:
# Distinct unit strings containing Chris Paul, whichever side he played on.
cp3_away_units = set(df.loc[cp3_away_mask, 'away_unit'].unique())
cp3_home_units = set(df.loc[cp3_home_mask, 'home_unit'].unique())
cp3_lineups = cp3_away_units | cp3_home_units

In [None]:
# Show the set of distinct unit strings that contain "Chris Paul".
cp3_lineups

In [None]:
# Single-player drill-down: fit a logistic model for one player's indicator
# column using all remaining columns, then report the total absolute
# prediction error together with the weighted score.
# NOTE(review): X_full is whatever the most recently executed cell assigned;
# run top to bottom, that is the row-shuffled matrix -- confirm it is the
# intended input.
model = LogisticRegression(C=c, fit_intercept=True)

player_i = "Manu Ginobili"
i = players.index(player_i)
j = i + 3  # players start at column position 3

y = X_full[:, j].toarray().ravel()
left_block, right_block = X_full[:, :j], X_full[:, j+1:]
X = hstack([left_block, right_block])

model.fit(X, y, sample_weight=w)
y_pred = model.predict(X)
s = model.score(X, y, sample_weight=w)
(np.abs(y - y_pred).sum(), s)

In [None]:
# Restrict to stints involving the Clippers, then flag the rows where
# DeAndre Jordan / Spencer Hawes appear in either unit string.
clippers_mask = df['away'].eq('Clippers') | df['home'].eq('Clippers')
clippers_df = df.loc[clippers_mask]

dj_away_mask, dj_home_mask = (
    clippers_df[unit_col].str.contains("DeAndre Jordan")
    for unit_col in ('away_unit', 'home_unit')
)
dj_mask = dj_away_mask | dj_home_mask

hawes_away_mask, hawes_home_mask = (
    clippers_df[unit_col].str.contains("Spencer Hawes")
    for unit_col in ('away_unit', 'home_unit')
)
hawes_mask = hawes_away_mask | hawes_home_mask

In [None]:
# Share of all possessions in Clippers stints that have Jordan on the floor.
dj_poss = clippers_df.loc[dj_mask, 'away_poss'].sum() + clippers_df.loc[dj_mask, 'home_poss'].sum()
clippers_poss = clippers_df['away_poss'].sum() + clippers_df['home_poss'].sum()
dj_poss / clippers_poss

In [None]:
# Share of all possessions in Clippers stints that have Hawes on the floor.
poss_cols = ['away_poss', 'home_poss']
hawes_poss = clippers_df.loc[hawes_mask, poss_cols].sum().sum()
clippers_poss = clippers_df[poss_cols].sum().sum()
hawes_poss / clippers_poss

In [None]:
# Share of all possessions in Clippers stints with both Hawes and Jordan on the floor.
poss_cols = ['away_poss', 'home_poss']
both_poss = clippers_df.loc[hawes_mask & dj_mask, poss_cols].sum().sum()
clippers_poss = clippers_df[poss_cols].sum().sum()
both_poss / clippers_poss

In [None]:
# Of the possessions with Hawes on the floor, the fraction that also include Jordan.
poss_cols = ['away_poss', 'home_poss']
both_poss = clippers_df.loc[hawes_mask & dj_mask, poss_cols].sum().sum()
hawes_poss = clippers_df.loc[hawes_mask, poss_cols].sum().sum()
both_poss / hawes_poss

In [None]:
# Restrict to stints involving the Grizzlies, then flag the rows where
# Marc Gasol / Kosta Koufos appear in either unit string.
grizz_mask = df['away'].eq('Grizzlies') | df['home'].eq('Grizzlies')
grizz_df = df.loc[grizz_mask]

gasol_away_mask, gasol_home_mask = (
    grizz_df[unit_col].str.contains("Marc Gasol")
    for unit_col in ('away_unit', 'home_unit')
)
gasol_mask = gasol_away_mask | gasol_home_mask

koufos_away_mask, koufos_home_mask = (
    grizz_df[unit_col].str.contains("Kosta Koufos")
    for unit_col in ('away_unit', 'home_unit')
)
koufos_mask = koufos_away_mask | koufos_home_mask

In [None]:
# Share of all possessions in Grizzlies stints that have Gasol on the floor.
poss_cols = ['away_poss', 'home_poss']
gasol_poss = grizz_df.loc[gasol_mask, poss_cols].sum().sum()
grizz_poss = grizz_df[poss_cols].sum().sum()
gasol_poss / grizz_poss

In [None]:
# Share of all possessions in Grizzlies stints that have Koufos on the floor.
poss_cols = ['away_poss', 'home_poss']
koufos_poss = grizz_df.loc[koufos_mask, poss_cols].sum().sum()
grizz_poss = grizz_df[poss_cols].sum().sum()
koufos_poss / grizz_poss

In [None]:
# Share of all possessions in Grizzlies stints with both Koufos and Gasol on the floor.
poss_cols = ['away_poss', 'home_poss']
both_poss = grizz_df.loc[koufos_mask & gasol_mask, poss_cols].sum().sum()
grizz_poss = grizz_df[poss_cols].sum().sum()
both_poss / grizz_poss

In [None]:
# Of the possessions with Koufos on the floor, the fraction that also include Gasol.
poss_cols = ['away_poss', 'home_poss']
both_poss = grizz_df.loc[koufos_mask & gasol_mask, poss_cols].sum().sum()
koufos_poss = grizz_df.loc[koufos_mask, poss_cols].sum().sum()
both_poss / koufos_poss

In [None]:
# Of the possessions with Gasol on the floor, the fraction that also include Koufos.
poss_cols = ['away_poss', 'home_poss']
both_poss = grizz_df.loc[koufos_mask & gasol_mask, poss_cols].sum().sum()
gasol_poss = grizz_df.loc[gasol_mask, poss_cols].sum().sum()
both_poss / gasol_poss