In [17]:
!pip install gym gym-legacy-toytext

Collecting gym-legacy-toytext
  Downloading gym_legacy_toytext-0.0.5-py3-none-any.whl.metadata (938 bytes)
Downloading gym_legacy_toytext-0.0.5-py3-none-any.whl (9.9 kB)
Installing collected packages: gym-legacy-toytext
Successfully installed gym-legacy-toytext-0.0.5


In [23]:
import gym
import gym_toytext  
import numpy as np

env = gym.make("Roulette-v0")

num_actions = env.action_space.n
q_table = np.zeros(num_actions)

learning_rate = 0.1
discount = 0.9
epochs = 50_000
epsilon = 1.0
epsilon_decay = 0.9999
epsilon_min = 0.01

for epoch in range(epochs):
    env.reset()  

    # Epsilon-greedy action selection
    if np.random.rand() < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table)

    new_state, reward, done, info = env.step(action)

    # Update Q-table
    current_q = q_table[action]
    new_q = current_q + learning_rate * (reward - current_q)
    q_table[action] = new_q

    # Decay epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

env.close()

print("\n Training finished!")
print("Final Q-values (expected reward for each bet):")
print(q_table.round(4))

best_action = np.argmax(q_table)
print(f"\nBest bet to make: Action {best_action} (Expected reward: {q_table[best_action]:.2f})")



 Training finished!
Final Q-values (expected reward for each bet):
[-0.9995 -0.3148 -0.0763 -0.0368 -0.1907 -0.005  -0.2966 -0.2057 -0.0795
 -0.0979 -0.5954 -0.5181 -0.1284 -0.1099 -0.0922 -0.2147 -0.2561 -0.2633
 -0.0645 -0.3251 -0.5733 -0.1829 -0.3116 -0.1107 -0.2354 -0.3633 -0.4979
 -0.1652 -0.0952 -0.1793 -0.178  -0.3516 -0.1185 -0.1108 -0.0329 -0.1754
 -0.1043  0.    ]

Best bet to make: Action 37 (Expected reward: 0.00)


In [26]:
import gym
import gym_toytext
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

env = gym.make("Roulette-v0")
state_size = 1  
action_size = env.action_space.n

# Q-network
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
q_network = QNetwork(state_size, action_size).to(device)
optimizer = optim.Adam(q_network.parameters(), lr=0.001)
criterion = nn.MSELoss()

# Replay buffer
memory = deque(maxlen=5000)
batch_size = 64

episodes = 50000
gamma = 0.9
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.9999

# Training loop
for ep in range(episodes):
    env.reset()  
    state = np.array([0.0], dtype=np.float32)  
    done = False

    # Epsilon-greedy 
    if np.random.rand() < epsilon:
        action = env.action_space.sample()
    else:
        with torch.no_grad():
            state_tensor = torch.tensor(state).unsqueeze(0).to(device)
            action = torch.argmax(q_network(state_tensor)).item()

    # Take action
    next_state, reward, done, info = env.step(action)
    next_state = np.array([0.0], dtype=np.float32)  

    # Store in replay memory
    memory.append((state, action, reward, next_state, done))

    if len(memory) >= batch_size:
        batch = random.sample(memory, batch_size)
        states_b, actions_b, rewards_b, next_states_b, dones_b = zip(*batch)

        states_b = torch.tensor(np.array(states_b), dtype=torch.float32).to(device)
        next_states_b = torch.tensor(np.array(next_states_b), dtype=torch.float32).to(device)
        actions_b = torch.tensor(np.array(actions_b)).unsqueeze(1).to(device)
        rewards_b = torch.tensor(np.array(rewards_b), dtype=torch.float32).unsqueeze(1).to(device)
        dones_b = torch.tensor(np.array(dones_b), dtype=torch.float32).unsqueeze(1).to(device)

        # Compute Q targets
        q_values = q_network(states_b).gather(1, actions_b)
        with torch.no_grad():
            q_next = q_network(next_states_b).max(1)[0].unsqueeze(1)
            q_target = rewards_b + gamma * q_next * (1 - dones_b)

        # Update network
        loss = criterion(q_values, q_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Decay epsilon
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

env.close()

with torch.no_grad():
    state_tensor = torch.tensor([[0.0]], dtype=torch.float32).to(device)
    final_qs = q_network(state_tensor).cpu().numpy()[0]

print("\n DRL Training finished!")
print("Final Q-values (expected reward for each bet):")
print(final_qs.round(4))
best_action = np.argmax(final_qs)
print(f"\n Best bet to make: Action {best_action} (Expected reward: {final_qs[best_action]:.2f})")



 DRL Training finished!
Final Q-values (expected reward for each bet):
[-0.592   0.3222 -0.6101  0.4095 -0.1731  0.2514  0.4283  0.0149  0.3761
  0.2129  0.3196  0.2982  0.3185  0.2362  0.3807  0.4156  0.2812  0.2258
  0.1438  0.1237  0.2043  0.4185  0.3294  0.2778  0.295  -0.1847  0.0484
 -0.08    0.1471 -0.4359  0.2092  0.4015 -0.1454  0.3097  0.3672  0.2217
 -0.3565  0.    ]

 Best bet to make: Action 6 (Expected reward: 0.43)
