In [1]:

import sys
import os

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import tqdm

import torch
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn

from IPython.display import clear_output
from IPython import display

from rl_env import (
    DraftEnv, ACTION_SPACE_DIM,
    STATE_SCHEMA, STARTER_COMPOSITION,
    STATE_SPACE_DIM, NUM_DRAFT_ROUNDS,
    NUM_MGRS
)

%matplotlib inline


https://medium.com/@ym1942/policy-gradient-methods-from-reinforce-to-actor-critic-d56ff0f0af0a

https://medium.com/@ym1942/proximal-policy-optimization-tutorial-f722f23beb83

# Init players

In [2]:
df_sleeper = pd.read_csv("data/sleeper/all_players.csv")
# df_sleeper = df_sleeper.loc[df_sleeper["rank_int"] < 50]
print(df_sleeper.shape)
df_sleeper.columns

(3670, 56)


Index(['sleeper_id', 'injury_notes', 'oddsjam_id', 'search_last_name',
       'college', 'team_abbr', 'birth_state', 'yahoo_id', 'search_first_name',
       'pandascore_id', 'years_exp', 'fantasy_positions', 'injury_status',
       'active', 'team', 'competitions', 'high_school', 'number', 'birth_city',
       'search_full_name', 'last_name', 'birth_country',
       'practice_description', 'depth_chart_order', 'fantasy_data_id',
       'status', 'injury_start_date', 'news_updated', 'age', 'search_rank',
       'practice_participation', 'opta_id', 'full_name', 'swish_id',
       'birth_date', 'rotowire_id', 'weight', 'height', 'stats_id', 'espn_id',
       'metadata', 'player_id', 'first_name', 'sport', 'injury_body_part',
       'position', 'gsis_id', 'sportradar_id', 'hashtag',
       'depth_chart_position', 'rotoworld_id', 'Player Id', 'positional_rank',
       'adp', 'adp_date', 'rank_int'],
      dtype='object')

In [3]:
df_qb_proj = pd.read_csv("data/projections/QB_projections.csv")
df_rb_proj = pd.read_csv("data/projections/RB_projections.csv")
df_wr_proj = pd.read_csv("data/projections/WR_projections.csv")
df_te_proj = pd.read_csv("data/projections/TE_projections.csv")
df_k_proj = pd.read_csv("data/projections/K_projections.csv")
df_def_proj = pd.read_csv("data/projections/DEF_projections.csv")

df_qb_proj = df_qb_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)
df_rb_proj = df_rb_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)
df_wr_proj = df_wr_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)
df_te_proj = df_te_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)
df_k_proj = df_k_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)
df_def_proj = df_def_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)

df_proj = pd.concat([df_qb_proj, df_rb_proj, df_wr_proj, df_te_proj, df_k_proj, df_def_proj])
df_proj = pd.concat([df_qb_proj, df_rb_proj, df_wr_proj, df_te_proj, df_k_proj, df_def_proj])
df_proj.columns

df_proj_agg = df_proj.groupby('sleeper_id')['fpts'].agg(['mean', 'std']).reset_index()
df_proj_agg['sleeper_id'] = df_proj_agg['sleeper_id'].astype(str)


df_players = df_proj_agg.merge(df_sleeper.loc[:, ['sleeper_id', 'full_name', 'position', 'team']], 
                                on='sleeper_id', 
                                how='left')
print(df_players.shape)
df_players = df_players.dropna(subset=['mean', 'std'])
print(df_players.shape)




(796, 6)
(647, 6)


# Constants

In [4]:
SCORING = {
    'pass_yd': .04,
    'pass_td': 4,
    'pass_2pt': 2,
    'pass_int': -1,
    'rush_yd': .1,
    'rush_td': 6,
    'rush_2pt': 2,
    'rec_rcpt': .5,
    'rec_yd': .1,
    'rec_td': 6,
    'rec_2pt': 2,
    'fg_1_39_made': 3,
    'fg_40_49_made': 4,
    'fg_50_made': 5,
    'xp_missed': -1,
    'xp_made': 1,
    'def_td': 6,
    'def_int': 2,
    'def_sck': 1,
    'def_fum_rec': 2,
    'def_fum_forced': 1,
    'def_sfty': 2,
    'remainder': 1
}

POSITIONS = ['QB', 'RB', 'WR', 'TE', 'K', 'DEF']
STARTER_COMPOSITION = {'QB': 1, 'RB': 2, 'WR': 2, 'FLEX': 2, 'TE': 1, 'K': 1, 'DEF': 1}
TEAM_SIZE = 15

# Init plotting and compute

In [5]:

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display
plt.ion()

# if GPU is to be used
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)

# Init network

In [36]:


class RNN(nn.Module):
    """
    Basic RNN block. This represents a single layer of RNN
    https://solardevs.com/blog/rnn-from-scratch-pytorch/
    """
    def __init__(self, input_size: int, hidden_size: int, output_size: int, batch_size: int) -> None:
        """
        input_size: Number of features of your input vector
        hidden_size: Number of hidden neurons
        output_size: Number of features of your output vector
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.i2h = nn.Linear(input_size, hidden_size, bias=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden_state) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Returns computed output and tanh(i2h + h2h)
        Inputs
        ------
        x: Input vector
        hidden_state: Previous hidden state
        Outputs
        -------
        out: Linear output (without activation because of how pytorch works)
        hidden_state: New hidden state matrix
        """
        x = self.i2h(x)
        hidden_state = self.h2h(hidden_state)
        hidden_state = torch.tanh(x + hidden_state)
        out = self.h2o(hidden_state)
        return out, hidden_state
        
    def init_zero_hidden(self, batch_size=1) -> torch.Tensor:
        """
				Helper function.
        Returns a hidden state with specified batch size. Defaults to 1
        """
        return torch.zeros(batch_size, self.hidden_size, requires_grad=False)

In [37]:
class NeuralNet(torch.nn.Module):
    def __init__(self, input_size, output_size, activation, layers=[32,32,16]):
        super().__init__()

        # Define layers with ReLU activation
        self.linear1 = torch.nn.Linear(input_size, layers[0])
        self.activation1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(layers[0], layers[1])
        self.activation2 = torch.nn.ReLU()
        self.linear3 = torch.nn.Linear(layers[1], layers[2])
        self.activation3 = torch.nn.ReLU()

        self.output_layer = torch.nn.Linear(layers[2], output_size)
        self.output_activation = activation

        # Initialization using Xavier normal (a popular technique for initializing weights in NNs)
        torch.nn.init.xavier_normal_(self.linear1.weight)
        torch.nn.init.xavier_normal_(self.linear2.weight)
        torch.nn.init.xavier_normal_(self.linear3.weight)
        torch.nn.init.xavier_normal_(self.output_layer.weight)

    def forward(self, inputs):
        # Forward pass through the layers
        x = self.activation1(self.linear1(inputs))
        x = self.activation2(self.linear2(x))
        x = self.activation3(self.linear3(x))
        x = self.output_activation(self.output_layer(x))
        return x


# Generate episode function

In [38]:

def generate_single_episode(env, policy_net_dict):
    """
    Generates an episode by executing the current policy in the given env
    """
    mgrs = {}
    for i in range(NUM_MGRS):
        mgrs[i] = {}
        mgrs[i]['states'] = []
        mgrs[i]['actions'] = []
        mgrs[i]['rewards'] = []
        mgrs[i]['log_probs'] = []
        
    state, _ = env.reset()
    
    mgr_turns = [d["mgr"] for d in env.turns]
    for turn_i, mgr_num in enumerate(mgr_turns):
        policy_net = policy_net_dict[mgr_num]
        state = torch.from_numpy(state).float().unsqueeze(0)
        state = state.to(device)
        probs = policy_net.forward(Variable(state)) # get each action choice probability with the current policy network
        action = np.random.choice(env.action_space.n, p=np.squeeze(probs.detach().cpu().numpy())) # probablistic
        # action = np.argmax(probs.detach().numpy()) # greedy
        
        # compute the log_prob to use this in parameter update
        log_prob = torch.log(probs.squeeze(0)[action])
        
        # append values
        mgrs[mgr_num]['states'].append(state)
        mgrs[mgr_num]['actions'].append(action)
        mgrs[mgr_num]['log_probs'].append(log_prob)
        
        # take a selected action
        state, reward, terminated, _ = env.step(action)
        mgrs[mgr_num]['rewards'].append(reward)

        # Never terminate early because we always have full draft
        # if terminated | truncated:
        #     break
            
    # return states, actions, rewards, log_probs
    return mgrs  # NEED TO UPDATE ELSEWHERE TO ACCOMMODATE THIS CHANGE

# Evaluate policy function

In [39]:

# def evaluate_policy(env, policy_net):
#     """
#     Compute accumulative trajectory reward
#     """
#     states, actions, rewards, log_probs = generate_single_episode(env, policy_net)
#     return np.sum(rewards)

def evaluate_policy(env, policy_net_dict):
    """
    Compute accumulative trajectory reward
    """
    episode = generate_single_episode(env, policy_net_dict)
    rewards = {k: np.sum(v['rewards']) for k, v in episode.items()}
    return rewards

# Train function

In [48]:
def train_PPO(env, episode, policy_net_dict, policy_optimizer_dict, value_net_dict, value_optimizer_dict, num_epochs, clip_val=0.2, gamma=0.99):
    """
    Trains the policy network using PPO
    """

    # Generate an episode with the current policy network
    # states, actions, rewards, log_probs = generate_single_episode(env, policy_net)
    # episode = generate_single_episode(env, policy_net) 
    for mgr_num, d in episode.items():
        states = d['states']
        actions = d['actions']
        rewards = d['rewards']
        log_probs = d['log_probs']
        
        policy_net = policy_net_dict[mgr_num]
        policy_optimizer = policy_optimizer_dict[mgr_num]
        value_net = value_net_dict[mgr_num]
        value_optimizer = value_optimizer_dict[mgr_num]
        
        T = len(states)
        
        # Create tensors
        # states = np.vstack(states).astype(float)
        states = np.vstack([s.cpu().numpy() for s in states]).astype(float)
        states = torch.FloatTensor(states).to(device)
        actions = torch.LongTensor(actions).to(device).view(-1,1)
        rewards = torch.FloatTensor(rewards).to(device).view(-1,1)
        log_probs = torch.FloatTensor(log_probs).to(device).view(-1,1)

        # Compute total discounted return at each time step
        Gs = []
        G = 0
        for t in range(T-1,-1,-1): # iterate in backward order to make the computation easier
            G = rewards[t] + gamma*G
            Gs.insert(0,G)
        Gs = torch.tensor(Gs).view(-1,1)
        Gs = Gs.to(device)
        
        # Compute the advantage
        state_vals = value_net(states).to(device)
        with torch.no_grad():
            A_k = Gs - state_vals
            
        for _ in range(num_epochs):
            V = value_net(states).to(device)
            
            # Calculate probability of each action under the updated policy
            probs = policy_net.forward(states).to(device)
                    
            # compute the log_prob to use it in parameter update
            curr_log_probs = torch.log(torch.gather(probs, 1, actions)) # Use torch.gather(A,1,B) to select columns from A based on indices in B
            
            # Calculate ratios r(theta)
            ratios = torch.exp(curr_log_probs - log_probs)
            
            # Calculate two surrogate loss terms in cliped loss
            surr1 = ratios * A_k
            surr2 = torch.clamp(ratios, 1-clip_val, 1+clip_val) * A_k
            
            # Calculate clipped loss value
            actor_loss = (-torch.min(surr1, surr2)).mean() # Need negative sign to run Gradient Ascent
            
            # Update policy network
            policy_optimizer.zero_grad()
            actor_loss.backward(retain_graph=True)
            policy_optimizer.step()
            
            # Update value net
            critic_loss = nn.MSELoss()(V, Gs)
            value_optimizer.zero_grad()
            critic_loss.backward()
            value_optimizer.step()
        
    return policy_net_dict, value_net_dict

# Run training

In [49]:
# Define parameter values
env_name = 'DraftEnv'
num_train_ite = 1000
num_seeds = 5 # fit model with 5 different seeds and plot average performance of 5 seeds
num_epochs = 30 # how many times we iterate the entire training dataset passing through the training
eval_freq = 50 # run evaluation of policy at each eval_freq trials
eval_epi_index = num_train_ite//eval_freq # use to create x label for plot
returns = np.zeros((num_seeds, eval_epi_index))
gamma = 0.99 # discount factor
clip_val = 0.2 # hyperparameter epsilon in clip objective

# Create the environment.
env = DraftEnv(df_players)
nA = env.action_space.n
nS = env.observation_space.shape[0]

policy_lr = 5e-4 # policy network's learning rate 
baseline_lr = 1e-4 # value network's learning rate
 
for i in tqdm.tqdm(range(num_seeds)):
    reward_means = []

    # Define policy and value networks
    policy_net_dict = {i: NeuralNet(nS, nA, torch.nn.Softmax(dim=1)).to(device) for i in range(NUM_MGRS)}
    policy_optimizer_dict = {i: optim.Adam(policy_net_dict[i].parameters(), lr=policy_lr) for i in range(NUM_MGRS)}
    value_net_dict = {i: NeuralNet(nS, 1, torch.nn.ReLU()).to(device) for i in range(NUM_MGRS)}
    value_optimizer_dict = {i: optim.Adam(value_net_dict[i].parameters(), lr=baseline_lr) for i in range(NUM_MGRS)}
    
    
    
    for m in range(num_train_ite):
        # Train networks with PPO
        episode = generate_single_episode(env, policy_net_dict) 
        for i in range(NUM_MGRS):
            policy_net_dict, value_net_dict = train_PPO(env, episode, policy_net_dict, policy_optimizer_dict, value_net_dict, value_optimizer_dict, num_epochs, clip_val=clip_val, gamma=gamma)
            
            
            # policy_net, value_net = train_PPO(env, episode, policy_net, policy_net_optimizer, value_net, value_net_optimizer, num_epochs, clip_val=clip_val, gamma=gamma)
        if m % eval_freq == 0:
            print("Episode: {}".format(m))
            G = np.zeros(20)
            for k in range(20): # run 20 evaluations
                g = evaluate_policy(env, policy_net_dict) # dict of rewards instead of scalar reward
                g = np.array(list(g.values()))
                g = np.mean(g)
                G[k] = g 

            reward_mean = G.mean()
            reward_sd = G.std()
            print("The avg. test reward for episode {0} is {1} with std of {2}.".format(m, reward_mean, reward_sd))
            reward_means.append(reward_mean)
    returns[i] = np.array(reward_means)

# Plot the performance over iterations
x = np.arange(eval_epi_index)*eval_freq
avg_returns = np.mean(returns, axis=0)
max_returns = np.max(returns, axis=0)
min_returns = np.min(returns, axis=0)

plt.fill_between(x, min_returns, max_returns, alpha=0.1)
plt.plot(x, avg_returns, '-o', markersize=1)

plt.xlabel('Episode', fontsize = 15)
plt.ylabel('Return', fontsize = 15)

plt.title("PPO Learning Curve", fontsize = 24)

Scaling mean points by 389.0108028723334


  0%|          | 0/5 [00:00<?, ?it/s]

Episode: 0
The avg. test reward for episode 0 is 1.91360541080298 with std of 0.4252809133571055.
Episode: 50
The avg. test reward for episode 50 is 1.4437118660244286 with std of 0.24333554689319453.
Episode: 100
The avg. test reward for episode 100 is 1.207625554923374 with std of 0.14334796262918725.
Episode: 150
The avg. test reward for episode 150 is 1.216222624227521 with std of 0.13470169114293917.
Episode: 200
The avg. test reward for episode 200 is 1.216222624227521 with std of 0.13470169114293917.
Episode: 250
The avg. test reward for episode 250 is 1.216222624227521 with std of 0.13470169114293917.
Episode: 300
The avg. test reward for episode 300 is 1.3175952575312468 with std of 0.12311904460892614.
Episode: 350
The avg. test reward for episode 350 is 1.270699263073589 with std of 0.13703147792844803.
Episode: 400
The avg. test reward for episode 400 is 1.4732059064714842 with std of 0.1492427848483806.


  0%|          | 0/5 [2:43:38<?, ?it/s]


ValueError: probabilities contain NaN

In [51]:
env.draft


Unnamed: 0,round,mgr,sleeper_id,full_name,team,position,team_pos,fp_mean,fp_std
0,0,0,3678,Wil Lutz,DEN,K,K,0.308629,5.089597
1,0,1,6786,CeeDee Lamb,DAL,WR,WR,0.780495,43.238199
2,0,2,11539,Jake Bates,DET,K,K,0.307618,4.041452
3,0,3,HOU,Houston Texans,HOU,DEF,DEF,0.381082,35.696201
4,0,4,2133,Davante Adams,LV,WR,WR,,
...,...,...,...,...,...,...,...,...,...
175,14,7,,,,,,,
176,14,8,,,,,,,
177,14,9,11604,Brock Bowers,LV,TE,TE,,
178,14,10,9508,Tyjae Spears,TEN,RB,RB,,


In [None]:
episode = generate_single_episode(env, policy_net_dict) 
episode

{0: {'states': [tensor([[1.0000, 0.9844, 0.9274, 0.9037, 0.8586, 0.8530, 0.8528, 0.8524, 0.8345,
            0.8215, 0.8625, 0.7142, 0.6961, 0.6692, 0.6482, 0.6251, 0.6169, 0.5975,
            0.5917, 0.5848, 0.7805, 0.7493, 0.7015, 0.6882, 0.6809, 0.6473, 0.6441,
            0.6075, 0.5852, 0.5668, 0.5168, 0.5098, 0.4764, 0.4509, 0.4355, 0.4319,
            0.4218, 0.4213, 0.3993, 0.3949, 0.3752, 0.3704, 0.3684, 0.3683, 0.3582,
            0.3542, 0.3527, 0.3517, 0.3432, 0.3405, 0.4764, 0.4286, 0.4237, 0.4155,
            0.4106, 0.4071, 0.4060, 0.3965, 0.3874, 0.3811, 0.0000, 1.0000, 0.0000,
            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
            0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0