In [None]:
import sys
sys.path.append("..")

import bpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pystan
import seaborn as sns

from framework.utils import *

# Seed NumPy's global random state so the np.random draws used for the
# simulated data in this notebook are repeatable from a fresh kernel.
np.random.seed(42)

In [None]:
%matplotlib inline

# Testing player level code

In [None]:
# Build the player-level Stan model from the program stored in
# player_forecasts.stan (path is relative to the notebook's working directory).
model = pystan.StanModel(file="player_forecasts.stan")

In [None]:
# numbers of players and matches
n_player = 500
n_match = 30

# length of a full match, in minutes (scales each player's exposure)
full_match_minutes = 90.0

# goals scored by the player's team, one draw per (player, match)
goals = np.random.poisson(lam=2.0, size=(n_player, n_match))

# player level parameters: one probability triple per player (rows sum to 1)
theta = np.random.dirichlet(np.ones(3), size=n_player)

# minutes played, clipped to the feasible range [0, full match]
minutes = np.clip(
    np.random.normal(loc=60., scale=10., size=(n_player, n_match)),
    0.0,
    full_match_minutes
)

# augmented scoring probabilities, shape (n_player, n_match, 3):
# every component of theta is scaled by the fraction of the match played, and
# the fraction NOT played is added to the third ("neither") component so each
# triple still sums to one. Broadcasting replaces the per-element Python loop.
frac_played = minutes / full_match_minutes
theta_aug = theta[:, np.newaxis, :] * frac_played[:, :, np.newaxis]
theta_aug[:, :, 2] += (full_match_minutes - minutes) / full_match_minutes

# results for the player: split each match's team goals across the three
# outcomes. The legacy np.random.multinomial takes a single (n, pvals) pair per
# call, so this stays a loop; iterating players outermost and matches innermost
# keeps the random draws in the same order as before.
y = np.array([
    [np.random.multinomial(goals[i, j], theta_aug[i, j]) for j in range(n_match)]
    for i in range(n_player)
])

In [None]:
# Inputs handed to the Stan model for the simulated data set.
# NOTE(review): alpha is presumably the prior concentration for theta;
# confirm against player_forecasts.stan.
stan_data = {
    "nplayer": n_player,
    "nmatch": n_match,
    "minutes": minutes,
    "y": y,
    "alpha": [1, 1, 1],
}

In [None]:
# Run the Stan model's optimizer on the simulated inputs; `fit` holds whatever
# model.optimizing returns (indexed by parameter name, e.g. fit["theta"], later on).
fit = model.optimizing(stan_data)

## Test on real data

### 1. Fit the team level model

In [None]:
# Collect one (date, home, away, home score, away score) tuple per stored match.
# The frame is built straight from the tuples: routing the rows through
# np.array would coerce every column to a single common dtype and fails on an
# empty query result.
match_rows = [
    (s.date, s.home_team, s.away_team, s.home_score, s.away_score)
    for s in session.query(Match).all()
]
df_past = pd.DataFrame(
    match_rows,
    columns=["date", "home_team", "away_team", "home_goals", "away_goals"]
)

# normalise dtypes: integer goal counts and proper timestamps
df_past["home_goals"] = df_past["home_goals"].astype(int)
df_past["away_goals"] = df_past["away_goals"].astype(int)
df_past["date"] = pd.to_datetime(df_past["date"])

# keep only matches dated after the cutoff
df_past = df_past[df_past["date"] > "2016-08-01"]

In [None]:
# Construct the team-level model from the filtered historical results frame
# and fit it.
model_team = bpl.BPLModel(df_past)
model_team.fit()

### 2. Fit the player level model

In [None]:
def get_empirical_bayes_estimates(df_emp):
    """Derive the three alpha values from pooled player-history totals.

    Rows with ``match_id == 0`` are excluded. The remaining rows are pooled
    across all players: the share of team goals that were scored / assisted /
    neither is rescaled by the ratio of available minutes (90 per row) to
    minutes actually played, and multiplied by the mean number of rows per
    player.

    Parameters
    ----------
    df_emp : pandas.DataFrame
        Must contain the columns ``match_id``, ``player_name``, ``goals``,
        ``assists``, ``neither``, ``minutes`` and ``team_goals``. It is not
        modified.

    Returns
    -------
    numpy.ndarray
        ``[alpha_score, alpha_assist, alpha_neither]``.
    """
    # still not sure about this...
    # boolean-mask selection yields a new frame, so the input stays untouched
    played = df_emp.loc[df_emp["match_id"] != 0]

    n_goals = played["goals"].sum()
    n_assists = played["assists"].sum()
    n_neither = played["neither"].sum()
    minutes_played = played["minutes"].sum()
    n_team_goals = played["team_goals"].sum()

    # minutes that would have been played had every row been a full 90
    full_minutes = 90 * len(played)

    # mean number of (non-null goals) rows per player
    rows_per_player = played.groupby("player_name")["goals"].count().mean()

    exposure = full_minutes / minutes_played
    share_not_played = (full_minutes - minutes_played) / full_minutes

    alpha_score = rows_per_player * (n_goals / n_team_goals) * exposure
    alpha_assist = rows_per_player * (n_assists / n_team_goals) * exposure
    alpha_neither = (
        rows_per_player
        * ((n_neither / n_team_goals) - share_not_played)
        * exposure
    )
    return np.array([alpha_score, alpha_assist, alpha_neither])

def process_player_data(prefix):
    """Load one position's player history and shape it into Stan inputs.

    Reads ``../data/player_history_{prefix}.csv``, adds a ``neither`` column
    (team goals minus the player's goals and assists), and arranges the
    per-row counts and minutes into arrays with one row per player, players
    ordered by name.

    Parameters
    ----------
    prefix : str
        Substituted into the CSV file name (the notebook passes "FWD", "MID"
        and "DEF").

    Returns
    -------
    tuple of (dict, numpy.ndarray)
        The dict has keys ``nplayer``, ``nmatch``, ``minutes`` (shape
        ``(nplayer, nmatch)``), ``y`` (shape ``(nplayer, nmatch, 3)`` holding
        goals / assists / neither) and ``alpha``. The array holds the sorted
        unique player names, matching the first axis of ``y`` and ``minutes``.

    Raises
    ------
    ValueError
        If the file has no player rows, or if players do not all have the same
        number of rows (the data could then not be laid out as a rectangular
        array without mixing rows between players).
    """
    df = pd.read_csv("../data/player_history_{}.csv".format(prefix))
    df["neither"] = df["team_goals"] - df["goals"] - df["assists"]

    # Work out the array dimensions once, and check the layout up front. Taking
    # the count from the first player alone lets a ragged file slip through the
    # reshape whenever the total row count happens to equal nplayer * nmatch.
    rows_per_player = df.groupby("player_name").size()
    if rows_per_player.empty:
        raise ValueError(
            "no player rows found in player history for {}".format(prefix)
        )
    if rows_per_player.nunique() != 1:
        raise ValueError(
            "players in {} history have differing numbers of rows "
            "(min {}, max {}); cannot build a rectangular array".format(
                prefix, rows_per_player.min(), rows_per_player.max()
            )
        )
    nplayer = len(rows_per_player)
    nmatch = int(rows_per_player.iloc[0])

    alpha = get_empirical_bayes_estimates(df)

    # Sort once, with a stable algorithm, so y and minutes are guaranteed to
    # share the same row order and each player's rows keep their file order.
    df_sorted = df.sort_values("player_name", kind="mergesort")
    y = df_sorted[["goals", "assists", "neither"]].values.reshape(
        (nplayer, nmatch, 3)
    )
    minutes = df_sorted["minutes"].values.reshape((nplayer, nmatch))

    player_names = np.sort(df["player_name"].unique())
    return dict(
        nplayer=nplayer,
        nmatch=nmatch,
        minutes=minutes,
        y=y,
        alpha=alpha
    ), player_names

def fit_data(prefix, model):
    """Fit the player model for one position and tabulate the estimates.

    Parameters
    ----------
    prefix : str
        Position label; selects the input file via ``process_player_data`` and
        is written to the ``pos`` column of the result.
    model : object
        Provides ``optimizing(data)`` whose result is indexable by ``"theta"``.

    Returns
    -------
    pandas.DataFrame
        One row per player with columns ``index`` (player name),
        ``pr_score``, ``pr_assist``, ``pr_neither`` and ``pos``.
    """
    stan_inputs, player_names = process_player_data(prefix)
    estimates = model.optimizing(stan_inputs)

    probs = pd.DataFrame(
        estimates["theta"],
        columns=["pr_score", "pr_assist", "pr_neither"]
    )
    # index by player name, then move the names back out into an "index" column
    probs = probs.set_index(player_names)
    probs = probs.reset_index()
    probs["pos"] = prefix
    return probs

def fit_all_data(model, positions=("FWD", "MID", "DEF")):
    """Fit the player model for every position and combine the results.

    Parameters
    ----------
    model : object
        Passed through to ``fit_data`` for each position.
    positions : iterable of str, optional
        Position labels to fit, in order. Defaults to forwards, midfielders
        and defenders.

    Returns
    -------
    pandas.DataFrame
        The per-position frames from ``fit_data`` concatenated, sorted by and
        indexed on ``player_name``.
    """
    per_position = [fit_data(prefix, model) for prefix in positions]
    return (
        pd.concat(per_position)
        # fit_data leaves the player names in a column called "index"
        .rename(columns={"index": "player_name"})
        .sort_values("player_name")
        .set_index("player_name")
    )

In [None]:
# Fit every position, then scatter each player's assist share against their
# scoring share, one series per position.
df_stat = fit_all_data(model)
for pos in ("FWD", "MID", "DEF"):
    # rows belonging to the current position
    in_position = df_stat["pos"] == pos
    plt.scatter(
        df_stat.loc[in_position, "pr_assist"],
        df_stat.loc[in_position, "pr_score"],
        label=pos,
    )
plt.xlabel("P(assist)")
plt.ylabel("P(score)")
plt.legend();