In [None]:
# Install TorchRL; the import cell below takes ReplayBuffer and ListStorage from torchrl.data.
!pip install torchrl



# Mining Environment

In [None]:
import numpy as np
import enum
import random
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple
from torchrl.data import ListStorage, ReplayBuffer
from torch.nn.utils import clip_grad_norm_
import matplotlib.pyplot as plt

# Seed the global NumPy and Python RNGs; the simulator draws from both
# (np.random for mining powers / block discovery, random.sample for miner groups).
# torch is seeded separately inside Network.__init__.
np.random.seed(42)
random.seed(42)

In [None]:
# Integer code -> fork label; inverse of State.match_dict, used by to_state_obj
# to decode the last element of a numeric state vector.
match_dict_inv = {0: 'irrelevant', 1: 'relevant', 2: 'active'}

In [None]:
class State:
    """Value object for one simulator state.

    Attributes:
        length_a: first chain-length component (passed as ``l_a``).
        length_h: second chain-length component (passed as ``l_h``).
        blocks_e: block counter passed as ``b_e``.
        match: fork label, one of the keys of ``match_dict``.

    Instances compare and hash by the 4-tuple
    ``(length_a, length_h, blocks_e, match)``, so they can be used as
    dictionary keys.
    """

    # Fork label -> integer code used by to_numpy(). Shared by all instances
    # and still reachable as `self.match_dict`.
    match_dict = {'irrelevant': 0, 'relevant': 1, 'active': 2}

    def __init__(self, l_a, l_h, b_e, match="relevant"):
        self.length_a = l_a
        self.length_h = l_h
        self.blocks_e = b_e
        self.match = match

    def _key(self):
        """Return the tuple that defines identity for hashing and equality."""
        return (self.length_a, self.length_h, self.blocks_e, self.match)

    def __hash__(self):
        return hash(self._key())

    def __eq__(self, other):
        # Duck-typed comparison: any object exposing the same four attributes
        # can compare equal. Only a *missing attribute* means "not a state";
        # other errors are real bugs and must not be swallowed.
        try:
            other_key = (other.length_a, other.length_h, other.blocks_e, other.match)
        except AttributeError:
            return False
        return self._key() == other_key

    def __ne__(self, other):
        return not (self == other)

    def __repr__(self):
        return "(%d, %d, %d, %s)" % (self.length_a, self.length_h, self.blocks_e, self.match)

    def to_numpy(self):
        """Return the state as a 4-element array with ``match`` integer-encoded."""
        return np.array([self.length_a, self.length_h, self.blocks_e, self.match_dict[self.match]])


In [None]:
class Fork(enum.IntEnum):
  """Integer codes for the three fork labels (same mapping as match_dict_inv)."""
  irrelevant = 0
  relevant = 1
  active = 2

class Action(enum.IntEnum):
  """Integer codes of the four actions the agent can choose from."""
  wait = 0
  adopt = 1
  override = 2
  match = 3

class Miner():
  """Record describing one miner in the simulation.

  Two miners are considered equal (and hash alike) when their ``id`` matches,
  regardless of the other fields.
  """

  def __init__(self, id, type, mines_on, mining_power, is_stale=False):
    self.id, self.type = id, type
    self.mines_on, self.mining_power = mines_on, mining_power
    self.is_stale = is_stale

  def __repr__(self):
    fields = (self.id, self.type, self.mines_on, self.mining_power, self.is_stale)
    return "(%d, %s, %s, %.2f, %d)" % fields

  def __eq__(self, other):
    # Anything that is not a Miner is never equal.
    if not isinstance(other, Miner):
      return False
    return self.id == other.id

  def __hash__(self):
    # Consistent with __eq__: identity is the id alone.
    return hash(self.id)

# Network output width (one value per Action member) and input width
# (length of the vector returned by State.to_numpy()).
action_size = 4
state_size = 4

def to_state_obj(state):
  """Rebuild a State from its 4-element numeric encoding.

  The first three entries are passed through unchanged; the fourth is an
  integer code that is translated back to a fork label via match_dict_inv.
  """
  length_a, length_h, blocks_e, match_code = state[0], state[1], state[2], state[3]
  return State(length_a, length_h, blocks_e, match_dict_inv[match_code])

In [None]:
def get_allowed_actions(state, cutoff):
  """List the actions that are permitted in `state`.

  Args:
    state: tensor holding one encoded state; it is moved to CPU, flattened and
      decoded with to_state_obj.
    cutoff: chain length at which `wait` stops being available.

  Returns:
    A list of distinct Action members. `adopt` is always included.
  """
  decoded = to_state_obj(state.cpu().detach().view(-1).numpy())
  l_a, l_h = decoded.length_a, decoded.length_h

  # `wait` is dropped once either chain has reached the cutoff.
  at_cutoff = l_a == cutoff or l_h == cutoff
  candidates = [Action.adopt] if at_cutoff else [Action.adopt, Action.wait]

  # `override` needs a strictly longer first chain.
  if l_a > l_h:
    candidates.append(Action.override)

  # `match` needs a non-empty second chain that is not longer than the first,
  # and the fork label must be 'relevant'.
  if l_a >= l_h and l_h > 0 and decoded.match == 'relevant':
    candidates.append(Action.match)

  return list(set(candidates))

In [None]:
class BitcoinSimulator():
  """Step-wise simulator of an adversary mining against a population of honest miners.

  Each call to `step` first picks the miner that finds the next block
  (`mine_next_block`) and then applies the chosen action. The identity of the
  winner fully determines the reward pair and the next State.

  Miner types used throughout: 'A' (adversary), 'H' (honest) and 'E' (an extra
  miner class that only exists when `lam != 0`).
  """

  def __init__(self, p, stale, gamma, cost=0, cutoff=20, lam=0):
    """Store the scenario parameters and enumerate the state table.

    Args:
      p: mining power given to the adversary ('A') miner.
      stale: fraction of honest miners flagged as stale by `create_miners`.
      gamma: fraction of non-stale honest miners that `create_miners` sets to
        mine on the adversary's chain.
      cost: per-step cost; charged as `cost * p` to the adversary reward and
        `cost * (1 - p)` to the honest reward.
      cutoff: largest chain length included in the enumerated state table.
      lam: mining power given to the 'E' miner (0 means no such miner).
    """
    self.p = p
    self.stale = stale
    self.gamma = gamma
    self.cost = cost
    self.cutoff = cutoff
    self.lam = lam
    self.match_cases = ["irrelevant", "relevant", "active"]
    self.a_cost = cost * p
    self.h_cost = cost * (1 - p)
    self.q = 1 - p - lam

    # Enumerate every (l_a, l_h, b_e, match) combination up to the cutoff and
    # keep index -> state and state -> index lookups.
    state_count = 0
    self.states = {}
    self.states_inverted = {}
    for l_a in range(self.cutoff + 1):
      for l_h in range(self.cutoff + 1):
        for b_e in range(l_a + 1):
          # Without an 'E' miner the b_e counter can only ever be 0.
          if self.lam == 0 and b_e > 0:
            continue
          for match in self.match_cases:
            state = State(l_a, l_h, b_e, match)
            self.states[state_count] = state
            self.states_inverted[state] = state_count
            state_count += 1
    self.states_counter = state_count

    self.S = state_count
    self.A = action_size

  def get_curr_state(self):
    """Return the current state as a numeric array (see State.to_numpy)."""
    return self.current_state.to_numpy()

  def set_blockchain_state(self, state):
    """Make `state` current and mirror its fields onto l_a / l_h / b_e / match."""
    self.current_state = state
    self.l_a = state.length_a
    self.l_h = state.length_h
    self.b_e = state.blocks_e
    self.match = state.match

  def reset(self):
    """Return to the all-zero 'irrelevant' state and return its numeric encoding."""
    self.set_blockchain_state(State(0, 0, 0, 'irrelevant'))
    return self.current_state.to_numpy()

  def create_miners(self, num_miners):
    """Create the miner population and store it in `self.miners` / `self.mining_powers`.

    Args:
      num_miners: total number of miners, including the adversary and, when
        `lam != 0`, the 'E' miner. The remaining miners are honest and share
        the leftover mining power in random proportions.

    `self.miners` is sorted by id so that `self.miners[i]` owns
    `self.mining_powers[i]`: honest miners first, then 'A', then 'E'.
    """
    if self.lam != 0:
      mining_powers = np.random.random(num_miners-2)
      mining_powers = mining_powers / mining_powers.sum() * (1-self.p-self.lam)
      mining_powers = np.append(mining_powers, self.p)
      mining_powers = np.append(mining_powers, self.lam)
      h_mining_powers = mining_powers[:-2]
      a_mining_powers = mining_powers[-2]
      e_mining_powers = mining_powers[-1]
    else:
      mining_powers = np.random.random(num_miners-1)
      mining_powers = mining_powers / mining_powers.sum() * (1-self.p)
      mining_powers = np.append(mining_powers, self.p)
      h_mining_powers = mining_powers[:-1]
      a_mining_powers = mining_powers[-1]

    # Honest miners get ids 0..n-1 in the order of their mining powers.
    n_h_miners = len(h_mining_powers)
    h_miners = [Miner(idx, 'H', 'H', mi) for idx, mi in enumerate(h_mining_powers)]
    if self.stale != 0:
      # Number of stale / non-stale honest miners.
      n_h_s_miners = int(n_h_miners * self.stale)
      n_h_ns_miners = n_h_miners - n_h_s_miners

      # Randomly chosen stale miners, re-created with is_stale=True.
      h_s_miners = random.sample(h_miners, n_h_s_miners)
      h_s_miners = list(map(lambda miner: Miner(miner.id, miner.type, miner.mines_on, miner.mining_power, True), h_s_miners))
      # Everyone else is non-stale (Miner equality is by id, so the set
      # difference removes the stale ones).
      h_ns_miners = list(set(h_miners) - set(h_s_miners))
    else:
      n_h_ns_miners = n_h_miners
      h_ns_miners = h_miners

    # A gamma fraction of the non-stale honest miners mines on the adversary chain.
    n_h_a_miners = int(n_h_ns_miners * self.gamma)

    # Number of honest miners that stay on the public chain (kept for reference).
    n_h_h_miners = n_h_ns_miners - n_h_a_miners

    h_a_miners = random.sample(h_ns_miners, n_h_a_miners)
    h_a_miners = list(map(lambda miner: Miner(miner.id, miner.type, 'A', miner.mining_power, miner.is_stale), h_a_miners))

    # Remaining non-stale honest miners keep mining on the honest chain.
    h_h_miners = list(set(h_ns_miners) - set(h_a_miners))

    # All honest miners.
    if self.stale != 0:
      miners = h_s_miners + h_a_miners + h_h_miners
    else:
      miners = h_a_miners + h_h_miners

    # The adversary (and the 'E' miner) take the next free ids so that sorting
    # by id lines every miner up with its entry in `mining_powers`.
    a_miners = [Miner(len(miners), 'A', None, a_mining_powers, False)]
    if self.lam != 0:
      e_miners = [Miner(len(miners)+1, 'E', None, e_mining_powers, False)]
      miners += a_miners + e_miners
    else:
      miners += a_miners
    miners = sorted(miners, key=lambda miner: miner.id)
    self.miners = miners
    self.mining_powers = mining_powers

  def mine_next_block(self):
    """Sample which miner finds the next block.

    Every miner draws an exponential discovery time with scale
    `1 / mining_power`; the miner with the smallest time wins.
    """
    discovery_times = np.random.exponential(scale=1/self.mining_powers)
    first_miner = self.miners[np.argmin(discovery_times)]
    return first_miner

  def _outcome_index(self, winner, split_honest):
    """Translate the block winner into an index of an outcome list.

    Index 0 is the adversary and 1 the 'E' miner. With `split_honest` False the
    list has four entries (2: non-stale honest, 3: stale honest). With
    `split_honest` True it has five (2: non-stale honest mining on 'A',
    3: non-stale honest mining on 'H', 4: stale honest).

    Raises:
      ValueError: if the winner does not fit any of these cases.
    """
    if winner.type == 'A':
      return 0
    if winner.type == 'E':
      return 1
    if winner.type == 'H':
      if winner.is_stale:
        return 4 if split_honest else 3
      if not split_honest:
        return 2
      if winner.mines_on == 'A':
        return 2
      if winner.mines_on == 'H':
        return 3
    raise ValueError("cannot map winning miner %r to an outcome" % (winner,))

  def _idle_rewards(self):
    """Reward pair (adversary, honest) of a step in which nobody is paid out."""
    return (-self.a_cost, -self.h_cost)

  def _wait_outcomes(self):
    """Outcomes of `wait` when no fork race is active (4 entries)."""
    idle = self._idle_rewards()
    return [
      (idle, State(self.l_a + 1, self.l_h, self.b_e, "irrelevant")),
      (idle, State(self.l_a + 1, self.l_h, self.b_e + 1, "irrelevant")),
      (idle, State(self.l_a, self.l_h + 1, self.b_e, "relevant")),
      (idle, State(self.l_a, self.l_h, self.b_e, "irrelevant")),
    ]

  def _fork_outcomes(self):
    """Outcomes of `match`, and of `wait` while the fork is 'active' (5 entries).

    Only a non-stale honest miner extending the adversary's chain (index 2)
    triggers a payout; every other winner just changes the state.
    """
    payout = (self.l_h)*(self.l_a - self.b_e)//self.l_a
    new_b_e = self.b_e - (self.l_h - payout)
    idle = self._idle_rewards()
    paid = (payout - self.a_cost, self.b_e - new_b_e - self.h_cost)
    return [
      (idle, State(self.l_a + 1, self.l_h, self.b_e, "active")),
      (idle, State(self.l_a + 1, self.l_h, self.b_e + 1, "active")),
      (paid, State(self.l_a - self.l_h, 1, new_b_e, "relevant")),
      (idle, State(self.l_a, self.l_h + 1, self.b_e, "relevant")),
      (idle, State(self.l_a, self.l_h, self.b_e, "active")),
    ]

  def _adopt_outcomes(self):
    """Outcomes of `adopt` (4 entries); the honest side is credited l_h in every case."""
    rewards = (-self.a_cost, self.l_h - self.h_cost)
    return [
      (rewards, State(1, 0, 0, "irrelevant")),
      (rewards, State(1, 0, 1, "irrelevant")),
      (rewards, State(0, 1, 0, "relevant")),
      (rewards, State(0, 0, 0, "irrelevant")),
    ]

  def _override_outcomes(self):
    """Outcomes of `override` (4 entries); the same payout applies in every case."""
    payout = (self.l_h+1)*(self.l_a - self.b_e)//self.l_a
    new_b_e = self.b_e - (self.l_h+1 - payout)
    rewards = (payout - self.a_cost, self.b_e - new_b_e - self.h_cost)
    return [
      (rewards, State(self.l_a - self.l_h, 0, new_b_e, "irrelevant")),
      (rewards, State(self.l_a - self.l_h, 0, new_b_e + 1, "irrelevant")),
      (rewards, State(self.l_a-self.l_h-1, 1, new_b_e, "relevant")),
      (rewards, State(self.l_a-self.l_h-1, 0, new_b_e, "irrelevant")),
    ]

  def step(self, action):
    """Mine one block, apply `action`, and advance the simulator.

    Args:
      action: 0 (wait), 1 (adopt), 2 (override) or 3 (match); Action members
        work as well. The caller is expected to pass only actions permitted in
        the current state (override/match divide by l_a).

    Returns:
      Tuple `(next_state_array, reward_a, reward_h)`.

    Raises:
      ValueError: for an unknown action or a winner that fits no outcome.
    """
    winner = self.mine_next_block()

    if action == 0:
      # Waiting during an active fork behaves exactly like `match`.
      split_honest = self.match == 'active' and self.l_a >= self.l_h and self.l_h > 0
      outcomes = self._fork_outcomes() if split_honest else self._wait_outcomes()
    elif action == 1:
      split_honest = False
      outcomes = self._adopt_outcomes()
    elif action == 2:
      split_honest = False
      outcomes = self._override_outcomes()
    elif action == 3:
      split_honest = True
      outcomes = self._fork_outcomes()
    else:
      raise ValueError("unknown action %r" % (action,))

    # The winner determines the outcome deterministically, so it is looked up
    # directly; no additional random draw is needed.
    (reward_a, reward_h), next_state = outcomes[self._outcome_index(winner, split_honest)]
    self.set_blockchain_state(next_state)
    return (self.current_state.to_numpy(), reward_a, reward_h)


In [None]:
# Creating the architecture of the network
class Network(nn.Module):
  """Fully connected Q-network: state vector in, one value per action out.

  Layer widths are state_size -> 2048 -> 1024 -> 512 -> 256 -> action_size,
  with ReLU after every layer except the last.
  """

  def __init__(self, state_size, action_size, seed = 42):
    super().__init__()
    # Re-seeds torch's global RNG, so two networks built with the same seed
    # start from identical weights.
    self.seed = torch.manual_seed(seed)
    # Hidden stack (layers are created in forward order).
    self.fc1 = nn.Linear(state_size, 2048)
    self.fc2 = nn.Linear(2048, 1024)
    self.fc3 = nn.Linear(1024, 512)
    self.fc4 = nn.Linear(512, 256)
    # Output head: one value per action, no activation.
    self.fc5 = nn.Linear(256, action_size)

  def forward(self, state):
    """Propagate `state` through the four ReLU layers and the linear head."""
    hidden = F.relu(self.fc1(state))
    hidden = F.relu(self.fc2(hidden))
    hidden = F.relu(self.fc3(hidden))
    hidden = F.relu(self.fc4(hidden))
    return self.fc5(hidden)

In [None]:
# --- Training hyperparameters (read as globals by Agent) ---
# Step size handed to both AdamW optimizers.
learning_rate = 5e-5
# Number of transitions drawn from replay memory per learning update.
minibatch_size = 400
# Discount applied to the bootstrapped next-state value.
discount_factor = 0.999
# NOTE(review): not referenced anywhere in the code visible here; Agent sizes
# its replay storage with its own literal.
replay_buffer_size = 10 ** 12
# Weight of the local network in each soft update of the target network.
interpolation_parameter = 1e-2

In [None]:
class Agent():
  """DQN-style agent that learns two value functions for the same policy.

  One local/target network pair estimates the adversary's return, a second
  pair estimates the honest return. Actions are scored by the ratio
  `Q_a / (Q_a + Q_h)`; disallowed actions (per get_allowed_actions) are masked
  out both when acting and when selecting the bootstrap action.

  Relies on the module-level hyperparameters `learning_rate`,
  `minibatch_size`, `discount_factor` and `interpolation_parameter`.
  """

  # A learning update is attempted once every LEARN_EVERY stored transitions.
  LEARN_EVERY = 20
  # Maximum number of transitions held by the replay storage.
  MEMORY_CAPACITY = 900000

  def __init__(self, state_size, action_size, cutoff=20):
    """Build the four networks, their optimizers and the replay memory.

    Args:
      state_size: length of the state vector fed to the networks.
      action_size: number of actions (network output width).
      cutoff: chain-length cutoff forwarded to get_allowed_actions; should
        match the simulator's cutoff.
    """
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    self.state_size = state_size
    self.action_size = action_size
    self.cutoff = cutoff
    # Local / target Q-networks for the adversary's reward stream.
    self.a_local_qnetwork = Network(state_size, action_size).to(self.device)
    self.a_target_qnetwork = Network(state_size, action_size).to(self.device)
    # Local / target Q-networks for the honest reward stream.
    self.h_local_qnetwork = Network(state_size, action_size).to(self.device)
    self.h_target_qnetwork = Network(state_size, action_size).to(self.device)

    # Only the local networks are optimized; targets follow via soft_update.
    self.a_optimizer = optim.AdamW(self.a_local_qnetwork.parameters(), lr=learning_rate)
    self.h_optimizer = optim.AdamW(self.h_local_qnetwork.parameters(), lr=learning_rate)

    # Replay memory of (state, next_state, action, reward_a, reward_h) tuples.
    self.memory = ReplayBuffer(storage=ListStorage(self.MEMORY_CAPACITY))
    # Counts stored transitions modulo LEARN_EVERY.
    self.t_step = 0
    # Small constant added to the ratio's denominator.
    self.temp = 1e-9

  def _disallowed_actions(self, state):
    """Return the action indices that get_allowed_actions rejects for `state`."""
    return list(set(range(self.action_size)) - set(get_allowed_actions(state, self.cutoff)))

  def step(self, state, action, reward_a, reward_h, next_state):
    """Store one transition and, every LEARN_EVERY calls, run a learning update.

    The update is skipped until the memory holds more than `minibatch_size`
    transitions.
    """
    self.memory.add((state, next_state, action, reward_a, reward_h))
    self.t_step = (self.t_step + 1) % self.LEARN_EVERY
    if self.t_step == 0 and len(self.memory) > minibatch_size:
      experiences = self.memory.sample(minibatch_size)
      self.learn(experiences, discount_factor)

  def act(self, state, eps_t):
    """Sample an action for `state` from a temperature-scaled softmax.

    Args:
      state: numpy state vector (as returned by the simulator).
      eps_t: softmax temperature; smaller values make the choice greedier.

    Returns:
      The sampled action index as a Python int.
    """
    # Add the leading batch dimension expected by the networks.
    state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

    # Pure inference: evaluation mode, no gradient tracking.
    self.a_local_qnetwork.eval()
    self.h_local_qnetwork.eval()
    with torch.no_grad():
      a_action_values = self.a_local_qnetwork(state).cpu().data.squeeze(0)
      h_action_values = self.h_local_qnetwork(state).cpu().data.squeeze(0)
    self.a_local_qnetwork.train()
    self.h_local_qnetwork.train()

    # Score every action by the adversary's share of the combined value.
    # NOTE(review): nothing bounds this ratio when Q_a + Q_h is close to zero.
    rel_actions = a_action_values / (a_action_values + h_action_values + self.temp)

    # Disallowed actions get -inf so their softmax probability is exactly 0.
    diff = self._disallowed_actions(state)
    if diff:
      rel_actions[torch.tensor(diff)] = float("-inf")

    logits = rel_actions / eps_t
    # Shift by the maximum for numerical stability before the softmax.
    logits -= torch.max(logits)
    probabilities = F.softmax(logits, dim=-1)
    return torch.multinomial(probabilities, 1).item()

  def learn(self, experiences, discount_factor):
    """Run one gradient step on both local networks from a sampled minibatch.

    Args:
      experiences: batch `(states, next_states, actions, rewards_a, rewards_h)`
        as returned by the replay memory.
      discount_factor: weight of the bootstrapped next-state value.
    """
    states, next_states, actions, rewards_a, rewards_h = experiences

    states = states.to(dtype=torch.float32, device=self.device)
    next_states = next_states.to(dtype=torch.float32, device=self.device)
    actions = actions.unsqueeze(1).to(dtype=torch.long, device=self.device)
    rewards_a = rewards_a.unsqueeze(1).to(dtype=torch.float32, device=self.device)
    rewards_h = rewards_h.unsqueeze(1).to(dtype=torch.float32, device=self.device)

    with torch.no_grad():
      # Target-network values of every action in the next state.
      next_a_q_targets = self.a_target_qnetwork(next_states)
      next_h_q_targets = self.h_target_qnetwork(next_states)

      # Same ratio score as in act(), computed for the whole batch at once.
      next_ratio = next_a_q_targets / (next_a_q_targets + next_h_q_targets + self.temp)

      # Mask actions that are not allowed in each next state.
      for idx, next_state in enumerate(next_states):
        diff = self._disallowed_actions(next_state)
        if diff:
          next_ratio[idx][torch.tensor(diff)] = float("-inf")

      # Both value streams bootstrap from the same ratio-greedy action.
      best_next = next_ratio.max(1)[1].unsqueeze(1)
      q_a_targets = rewards_a + (discount_factor * next_a_q_targets.gather(1, best_next))
      q_h_targets = rewards_h + (discount_factor * next_h_q_targets.gather(1, best_next))

    # Local-network predictions for the actions that were actually taken.
    q_a_expected = self.a_local_qnetwork(states).gather(1, actions)
    q_h_expected = self.h_local_qnetwork(states).gather(1, actions)

    loss_a = F.mse_loss(q_a_expected, q_a_targets)
    loss_h = F.mse_loss(q_h_expected, q_h_targets)

    self.a_optimizer.zero_grad()
    self.h_optimizer.zero_grad()

    loss_a.backward()
    loss_h.backward()

    self.a_optimizer.step()
    self.h_optimizer.step()

    # Let the target networks drift towards the updated local networks.
    self.soft_update(self.a_local_qnetwork, self.a_target_qnetwork, interpolation_parameter)
    self.soft_update(self.h_local_qnetwork, self.h_target_qnetwork, interpolation_parameter)

  def soft_update(self, local_model, target_model, interpolation_parameter):
    """Blend local weights into the target model.

    Each target parameter becomes
    `interpolation_parameter * local + (1 - interpolation_parameter) * target`.
    """
    for target_param, local_param in zip(target_model.parameters(),local_model.parameters()):
      target_param.data.copy_(interpolation_parameter * local_param.data + (1.0 - interpolation_parameter) * target_param.data)

In [None]:
# Scenario parameters (see BitcoinSimulator.__init__ for their meaning).
p, stale, gamma, cost, lam = 0.45, 0.2, 0.5, 0, 0.2

# Build the environment and populate it with 1000 miners in total.
env = BitcoinSimulator(p=p, stale=stale, gamma=gamma, cost=cost, cutoff=20, lam=lam)
env.create_miners(1000)

"""
[D]p = 0.45 stale = 0.1 gamma = 0.5 cost = 0 lam = 0.2
p = 0.45 stale = 0.01 gamma = 0.5 cost = 0 lam = 0.3
"""

'\n[D]p = 0.45 stale = 0.1 gamma = 0.5 cost = 0 lam = 0.2\np = 0.45 stale = 0.01 gamma = 0.5 cost = 0 lam = 0.3\n'

In [None]:
# Build the agent: local/target Q-networks for the adversary and honest reward
# streams, their optimizers, and an empty replay memory.
agent = Agent(state_size, action_size)

In [None]:
state = env.reset()
a_score = 0
h_score = 0
temperature = 1
temperature_decay = 0.9998
min_temperature = 1e-5
number_episodes = 600000
window_size = 10000
log_interval = 1000          # steps between progress reports
checkpoint_interval = 10000  # steps between reward-trace dumps
r_a = []
r_h = []

# NOTE: the environment is never reset inside the loop, so each "episode" here
# is a single environment step of one continuing trajectory.
for episode in range(1, number_episodes + 1):
    action = agent.act(state, temperature)
    next_state, reward_a, reward_h = env.step(action)
    agent.step(torch.tensor(state), torch.tensor(action), torch.tensor(reward_a), torch.tensor(reward_h), torch.tensor(next_state))
    state = next_state
    a_score += reward_a
    h_score += reward_h
    r_a.append(reward_a)
    r_h.append(reward_h)

    # Report progress only every `log_interval` steps: the windowed sums cost
    # O(window_size) each and per-step printing floods the notebook output.
    if episode % log_interval == 0:
        print(a_score, h_score)
        total_score = a_score + h_score
        rel = a_score / total_score if total_score != 0 else 0
        print('\rEpisode {}\tGain: {:.4f}'.format(episode, rel))

        if episode >= window_size:
            # Adversary's share of the rewards over the last `window_size` steps.
            window_a = np.sum(r_a[-window_size:])
            window_h = np.sum(r_h[-window_size:])
            rel_gain = window_a / (window_a + window_h) if (window_a + window_h) != 0 else 0
            print('Windowed gain (last {} steps): {:.4f}'.format(window_size, rel_gain))

    # Persist the reward traces so the plotting cell can be run separately.
    if episode % checkpoint_interval == 0:
        np.save('r_a.npy', r_a)
        np.save('r_h.npy', r_h)

    # Exponentially anneal the softmax temperature down to its floor.
    temperature = max(min_temperature, temperature * temperature_decay)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0.7920326864147089
0.7919901920719248
0.7919901920719248
0.7919901920719248
0.7918709150326797
0.7918283963227783
0.7918283963227783
0.7918283963227783
0.7917858602370249
0.7919134163773739
0.7919134163773739
0.7918283963227783
0.7918283963227783
0.7918283963227783
0.7915050030630999
0.791547570436913
0.7915901204327414
0.7916326530612245
0.791955900367497
0.791955900367497
0.791955900367497
0.7920408163265306
0.7920832483166701
0.7920832483166701
0.7920832483166701
0.7920832483166701
0.7920832483166701
0.79221044045677
0.79221044045677
0.79221044045677
0.7922951487973909
0.7922951487973909
0.7923374770735684
0.7925336597307222
0.7925336597307222
0.7925759738935346
0.7925759738935346
0.7925759738935346
0.7926182707993474
0.7926182707993474
0.7926182707993474
0.7926182707993474
0.7926605504587156
0.7925759738935346
0.7925759738935346
0.7925759738935346
0.7924913283003469
0.7924913283003469
0.7925336597307222
0.792575973893

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

# Load the per-step reward traces written by the training loop.
r_a = np.load('r_a.npy')  # Adversary rewards
r_h = np.load('r_h.npy')  # Honest miner rewards

optimal = 0.8118  # reference level drawn as the 'Optimal Policy' line
T_w = 100000  # Window size
k = 10  # Sampling interval
T = len(r_a)

# RMG at step t = adversary's share of all rewards in [t, t + T_w).
# Prefix sums turn every window sum into one subtraction instead of re-adding
# T_w values per sample. For integer-valued rewards this is exact; with
# fractional costs it can differ from a direct sum by rounding error only.
timesteps = np.arange(0, T - T_w + 1, k)
cum_a = np.concatenate(([0.0], np.cumsum(r_a, dtype=float)))
cum_h = np.concatenate(([0.0], np.cumsum(r_h, dtype=float)))
window_a = cum_a[timesteps + T_w] - cum_a[timesteps]
window_h = cum_h[timesteps + T_w] - cum_h[timesteps]
window_total = window_a + window_h
# Windows without any reward have no defined share; mark them as NaN.
rmg_values = np.divide(
    window_a,
    window_total,
    out=np.full(window_total.shape, np.nan),
    where=window_total != 0,
)

# Apply Gaussian filter for appearance only (skipped when there is no window,
# i.e. when fewer than T_w steps were recorded).
smoothed_rmg_values = gaussian_filter1d(rmg_values, sigma=2) if rmg_values.size else rmg_values

# Plot the results
fig, ax = plt.subplots(figsize=(10, 3))
ax.plot(timesteps, rmg_values, linestyle='-', lw=3, label="Raw RMG")
# ax.plot(timesteps, smoothed_rmg_values, linestyle='-', lw=2, label="Smoothed RMG", color='blue')

# Highlight the optimal line
ax.axhline(y=optimal, color='#ffcc00', linestyle='--', linewidth=2, label='Optimal Policy')

ax.set_xlabel("Timesteps")
ax.set_ylabel("RMG")
ax.set_ylim([0.4, 0.9])
ax.legend()
ax.grid(True, linestyle='--', alpha=0.6)
ax.set_title("RMG vs Timesteps")
fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 1 ---- 0.7750167846679688
# 2 ---- 0.7699813842773438
def plot_convergence(gamma, stale=0, lam=0):
  """Plot a saved revenue trace against a fixed reference line and save it as PNG.

  The trace is read from 'rl_revenues_<p>hashrate<gamma>gamma<stale>stale<lam>lam.npy'
  (p is fixed at 0.45) and the figure is written next to it with a '.png'
  extension. The first 6 points are plotted as-is (window of 1); the rest is
  smoothed with a 10-point moving average.

  Args:
    gamma, stale, lam: scenario parameters; used only to build the file names.
  """
  p = 0.45
  switch_point = 6  # index at which the smoothing window changes
  early_width, late_width = 1, 10
  reference_revenue = 0.77

  def moving_average(values, width):
    # 'valid' mode: only positions where the window fits completely.
    return np.convolve(values, np.ones(width)/width, mode='valid')

  fig, ax = plt.subplots()

  rl_revenues = np.load('rl_revenues_%.3fhashrate%.2fgamma%.3fstale%.2flam.npy' % (p, gamma, stale, lam))
  rl_initial_moving_avg = moving_average(rl_revenues[:switch_point], early_width)
  rl_later_moving_avg = moving_average(rl_revenues[switch_point:], late_width)
  rl_moving_averages = np.concatenate((rl_initial_moving_avg, rl_later_moving_avg))
  print(rl_later_moving_avg)

  ax.plot(rl_moving_averages, '-',linewidth='2', label='DRL Mining')
  ax.axhline(y=reference_revenue, color='r', linestyle='--', label='optimal revenue')
  ax.set_ylim([0.2, 0.9])
  ax.set_xlim([0, len(rl_moving_averages)])
  ax.set_xlabel(r'Timesteps $\times 10^4$')
  ax.set_ylabel('Mining Reward')
  ax.legend(loc='lower right')
  fig.savefig('rl_revenues_%.3fhashrate%.2fgamma%.3fstale%.2flam.png' % (p, gamma, stale, lam))

# plot_convergence(gamma=0.5, stale=0.1, lam=0.2)
# Reads rl_revenues_0.450hashrate0.50gamma0.100stale0.30lam.npy from the working
# directory; no cell visible in this notebook writes that file.
plot_convergence(gamma=0.5, stale=0.1, lam=0.3)

# plot_convergence(gamma=0.5)
# plot_convergence(gamma=0)
