In [5]:

import sys
import os

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import tqdm

import torch
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import torch.nn as nn

from IPython.display import clear_output
from IPython import display

from rl_env import (
    DraftEnv, ACTION_SPACE_DIM,
    STATE_SCHEMA, STARTER_COMPOSITION,
    STATE_SPACE_DIM, NUM_DRAFT_ROUNDS,
    NUM_MGRS
)

%matplotlib inline


https://medium.com/@ym1942/policy-gradient-methods-from-reinforce-to-actor-critic-d56ff0f0af0a

https://medium.com/@ym1942/proximal-policy-optimization-tutorial-f722f23beb83

# Init players

In [10]:
df_sleeper = pd.read_csv("data/sleeper/all_players.csv")
# df_sleeper = df_sleeper.loc[df_sleeper["rank_int"] < 50]
print(df_sleeper.shape)
df_sleeper.columns

(3670, 56)


Index(['sleeper_id', 'injury_notes', 'oddsjam_id', 'search_last_name',
       'college', 'team_abbr', 'birth_state', 'yahoo_id', 'search_first_name',
       'pandascore_id', 'years_exp', 'fantasy_positions', 'injury_status',
       'active', 'team', 'competitions', 'high_school', 'number', 'birth_city',
       'search_full_name', 'last_name', 'birth_country',
       'practice_description', 'depth_chart_order', 'fantasy_data_id',
       'status', 'injury_start_date', 'news_updated', 'age', 'search_rank',
       'practice_participation', 'opta_id', 'full_name', 'swish_id',
       'birth_date', 'rotowire_id', 'weight', 'height', 'stats_id', 'espn_id',
       'metadata', 'player_id', 'first_name', 'sport', 'injury_body_part',
       'position', 'gsis_id', 'sportradar_id', 'hashtag',
       'depth_chart_position', 'rotoworld_id', 'Player Id', 'positional_rank',
       'adp', 'adp_date', 'rank_int'],
      dtype='object')

In [11]:
df_qb_proj = pd.read_csv("data/projections/QB_projections.csv")
df_rb_proj = pd.read_csv("data/projections/RB_projections.csv")
df_wr_proj = pd.read_csv("data/projections/WR_projections.csv")
df_te_proj = pd.read_csv("data/projections/TE_projections.csv")
df_k_proj = pd.read_csv("data/projections/K_projections.csv")
df_def_proj = pd.read_csv("data/projections/DEF_projections.csv")

df_qb_proj = df_qb_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)
df_rb_proj = df_rb_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)
df_wr_proj = df_wr_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)
df_te_proj = df_te_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)
df_k_proj = df_k_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)
df_def_proj = df_def_proj.loc[:, ["sleeper_id", "full_name", "team", "position", "source", "fpts"]].sort_values(by="fpts", ascending=False)

df_proj = pd.concat([df_qb_proj, df_rb_proj, df_wr_proj, df_te_proj, df_k_proj, df_def_proj])
df_proj = pd.concat([df_qb_proj, df_rb_proj, df_wr_proj, df_te_proj, df_k_proj, df_def_proj])
df_proj.columns

df_proj_agg = df_proj.groupby('sleeper_id')['fpts'].agg(['mean', 'std']).reset_index()
df_proj_agg['sleeper_id'] = df_proj_agg['sleeper_id'].astype(str)


df_players = df_proj_agg.merge(df_sleeper.loc[:, ['sleeper_id', 'full_name', 'position', 'team']], 
                                on='sleeper_id', 
                                how='left')
print(df_players.shape)
df_players = df_players.dropna(subset=['mean', 'std'])
print(df_players.shape)




(796, 6)
(647, 6)


# Constants

In [4]:
SCORING = {
    'pass_yd': .04,
    'pass_td': 4,
    'pass_2pt': 2,
    'pass_int': -1,
    'rush_yd': .1,
    'rush_td': 6,
    'rush_2pt': 2,
    'rec_rcpt': .5,
    'rec_yd': .1,
    'rec_td': 6,
    'rec_2pt': 2,
    'fg_1_39_made': 3,
    'fg_40_49_made': 4,
    'fg_50_made': 5,
    'xp_missed': -1,
    'xp_made': 1,
    'def_td': 6,
    'def_int': 2,
    'def_sck': 1,
    'def_fum_rec': 2,
    'def_fum_forced': 1,
    'def_sfty': 2,
    'remainder': 1
}

POSITIONS = ['QB', 'RB', 'WR', 'TE', 'K', 'DEF']
STARTER_COMPOSITION = {'QB': 1, 'RB': 2, 'WR': 2, 'FLEX': 2, 'TE': 1, 'K': 1, 'DEF': 1}
TEAM_SIZE = 15

# Init plotting and compute

In [5]:

# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display
plt.ion()

# if GPU is to be used
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)

# Init network

In [23]:


# class RNN(nn.Module):
#     """
#     Basic RNN block. This represents a single layer of RNN
#     https://solardevs.com/blog/rnn-from-scratch-pytorch/
#     """
#     def __init__(self, input_size: int, hidden_size: int, output_size: int, batch_size: int) -> None:
#         """
#         input_size: Number of features of your input vector
#         hidden_size: Number of hidden neurons
#         output_size: Number of features of your output vector
#         """
#         super().__init__()
#         self.input_size = input_size
#         self.hidden_size = hidden_size
#         self.output_size = output_size
#         self.batch_size = batch_size
#         self.i2h = nn.Linear(input_size, hidden_size, bias=False)
#         self.h2h = nn.Linear(hidden_size, hidden_size)
#         self.h2o = nn.Linear(hidden_size, output_size)
    
#     def forward(self, x, hidden_state) -> tuple[torch.Tensor, torch.Tensor]:
#         """
#         Returns computed output and tanh(i2h + h2h)
#         Inputs
#         ------
#         x: Input vector
#         hidden_state: Previous hidden state
#         Outputs
#         -------
#         out: Linear output (without activation because of how pytorch works)
#         hidden_state: New hidden state matrix
#         """
#         x = self.i2h(x)
#         hidden_state = self.h2h(hidden_state)
#         hidden_state = torch.tanh(x + hidden_state)
#         out = self.h2o(hidden_state)
#         return out, hidden_state
        
#     def init_zero_hidden(self, batch_size=1) -> torch.Tensor:
#         """
# 				Helper function.
#         Returns a hidden state with specified batch size. Defaults to 1
#         """
#         return torch.zeros(batch_size, self.hidden_size, requires_grad=False)

In [24]:
hidden_size = 32
class PolicyNet(nn.Module):
    def __init__(self):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(STATE_SPACE_DIM, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, ACTION_SPACE_DIM)

    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = self.fc3(x)
        x = nn.functional.softmax(x, dim=1)
        return x

# Generate episode function

In [25]:

def generate_single_episode(env, policy_net, rl_mgr: int=1):
    """
    Generates an episode by executing the current policy in the given env
    """
    states = []
    actions = []
    rewards = []
    log_probs = []
        
    state, _ = env.reset()
    
    mgr_turns = [d["mgr"] for d in env.turns]
    for turn_i, mgr_num in enumerate(mgr_turns):
        
        if mgr_num == rl_mgr:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            probs = policy_net.forward(state) # get each action choice probability with the current policy network
            action = np.random.choice(env.action_space.n, p=np.squeeze(probs.detach().cpu().numpy())) # probablistic
            # action = np.argmax(probs.detach().numpy()) # greedy
            
            # compute the log_prob to use this in parameter update
            log_prob = torch.log(probs.squeeze(0)[action])
            
            # append values
            states.append(state)
            actions.append(action)
            log_probs.append(log_prob)
            
            # take a selected action
            state, reward, terminated, _ = env.step(action)
            rewards.append(reward)
        else:
            env.step(env.reasonable_option())


    return states, actions, rewards, log_probs


In [26]:
# policy_net = PolicyNet()
# policy_net.to(device)

# num_episodes = 2500
# gamma = 0.99
# lr_policy_net = 2**-13
# optimizer = torch.optim.Adam(policy_net.parameters(), lr=lr_policy_net)

# for episode_num in range(num_episodes):
#     env = DraftEnv(df_players)
#     states, actions, rewards, log_probs = generate_single_episode(env, policy_net)
#     print(f"Episode {episode_num} completed with reward: {rewards[-1]}")
#     discounted_rewards = []
#     for t in range(len(rewards)):
#         Gt = 0  # Gt is the total discounted reward from time t
#         pw = 0  # pw is the power of gamma
#         for r in rewards[t:]:
#             # at each time step, the total discounted reward is calculated 
#             # by summing the weighted rewards from that time step to the end
#             Gt = Gt + gamma**pw * r
#             pw = pw + 1
#         discounted_rewards.append(Gt)
#     discounted_rewards = torch.tensor(discounted_rewards)
#     discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9) # normalize discounted rewards
#     policy_gradient = []
#     for log_prob, Gt in zip(log_probs, discounted_rewards):
#         Gt = Gt.to(dtype=torch.float32).to(device)
#         policy_gradient.append(-log_prob * Gt)
    
#     optimizer.zero_grad()
#     policy_gradient = torch.stack(policy_gradient).sum()
#     policy_gradient.backward()
#     optimizer.step()

Episode 0 completed with reward: 0.24765935426298302
Episode 1 completed with reward: 0.2923474022271939
Episode 2 completed with reward: 0.2671241060341797
Episode 3 completed with reward: 0.5231723382267398
Episode 4 completed with reward: 0.31389737676788687
Episode 5 completed with reward: 0.24610688646542458
Episode 6 completed with reward: 0.3559025659626467
Episode 7 completed with reward: 0.31629535284194304
Episode 8 completed with reward: 0.49949971180482866
Episode 9 completed with reward: 0.26663125537214605
Episode 10 completed with reward: 0.2733747727072241
Episode 11 completed with reward: 0.33061614965393044
Episode 12 completed with reward: 0.2949345029116824
Episode 13 completed with reward: 0.24265667936929516
Episode 14 completed with reward: 0.2867401780391171
Episode 15 completed with reward: 0.30094874320240206
Episode 16 completed with reward: 0.2670030677620757
Episode 17 completed with reward: 0.28871887174529554
Episode 18 completed with reward: 0.3369269808

KeyboardInterrupt: 

# Try 2

In [6]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt

# Constants
GAMMA = 0.9

class PolicyNetwork(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(PolicyNetwork, self).__init__()

        self.num_actions = num_actions
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.softmax(self.linear2(x), dim=1)
        return x 
    
    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.forward(state)
        highest_prob_action = np.random.choice(self.num_actions, p=np.squeeze(probs.detach().numpy()))
        log_prob = torch.log(probs.squeeze(0)[highest_prob_action])
        return highest_prob_action, log_prob

In [7]:
def update_policy(policy_network, rewards, log_probs):
    discounted_rewards = []

    for t in range(len(rewards)):
        Gt = 0 
        pw = 0
        for r in rewards[t:]:
            Gt = Gt + GAMMA**pw * r
            pw = pw + 1
        discounted_rewards.append(Gt)
        
    discounted_rewards = torch.tensor(discounted_rewards)
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9) # normalize discounted rewards

    policy_gradient = []
    for log_prob, Gt in zip(log_probs, discounted_rewards):
        policy_gradient.append(-log_prob * Gt)
    
    policy_network.optimizer.zero_grad()
    policy_gradient = torch.stack(policy_gradient).sum()
    policy_gradient.backward()
    policy_network.optimizer.step()

In [12]:
env = DraftEnv(df_players)
RL_MGR = 1

policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, 128)

max_episode_num = 50000
all_rewards = []

for episode in range(max_episode_num):
    state, _ = env.reset()
    log_probs = []
    rewards = []

    for _, row in env.draft.iterrows():

        if row['mgr'] == RL_MGR:
            action, log_prob = policy_net.get_action(state)
            state, reward, done, _ = env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)
        else:
            action = env.reasonable_option()
            # print(f'Action: {action}')
            env.step(action)
            
    
    update_policy(policy_net, rewards, log_probs)
    all_rewards.append(np.sum(rewards))
    if episode % 1 == 0:
        print("episode: {}, total reward: {}, average_reward: {}".format(episode, np.round(np.sum(rewards), decimals = 3), np.mean(all_rewards)))

    

episode: 0, total reward: -0.771, average_reward: -0.7714529277172797
episode: 1, total reward: -0.805, average_reward: -0.7883612665349773
episode: 2, total reward: -0.777, average_reward: -0.7847351419974874
episode: 3, total reward: -0.811, average_reward: -0.7911925969173608
episode: 4, total reward: -0.766, average_reward: -0.7862504329158746
episode: 5, total reward: -0.769, average_reward: -0.7833536270892467
episode: 6, total reward: -0.755, average_reward: -0.7793224489504046
episode: 7, total reward: -0.73, average_reward: -0.7732038143140634
episode: 8, total reward: -0.744, average_reward: -0.769924384251142
episode: 9, total reward: -0.757, average_reward: -0.768664153656015
episode: 10, total reward: -0.817, average_reward: -0.7730348846015165
episode: 11, total reward: -0.734, average_reward: -0.76976425234714
episode: 12, total reward: -0.784, average_reward: -0.770891145448483
episode: 13, total reward: -0.859, average_reward: -0.7772068613710366
episode: 14, total rew

KeyboardInterrupt: 

In [35]:
# show all rows of the dataframe
pd.set_option('display.max_rows', None)
env.draft

Unnamed: 0,round,mgr,sleeper_id,full_name,team,position,team_pos,fp_mean,fp_std
0,0,0,9224,Chase Brown,CIN,RB,FLEX,0.33311,25.849831
1,0,1,11058,Blake Grupe,NO,K,K,0.335265,14.885174
2,0,2,11563,Bo Nix,DEN,QB,QB,0.597119,34.232958
3,0,3,8121,Romeo Doubs,GB,WR,FLEX,0.332166,19.742277
4,0,4,6806,J.K. Dobbins,LAC,RB,FLEX,0.306463,30.171801
5,0,5,9228,Bryce Young,CAR,QB,QB,0.587099,26.063127
6,0,6,9753,Zach Charbonnet,SEA,RB,FLEX,0.298373,25.826435
7,0,7,8676,Rashid Shaheed,NO,WR,FLEX,0.332103,14.852612
8,0,8,11565,J.J. McCarthy,MIN,QB,QB,0.564614,49.52782
9,0,9,11604,Brock Bowers,LV,TE,FLEX,0.327779,36.361816


In [None]:
import torch

policy_net.eval()  # Set the model to evaluation mode

with torch.no_grad():
    predictions = model(input_data)
    
policy_net.train()