### Basic Game

In [1]:
import gym
from gym import spaces
import numpy as np

In [198]:
import random
import numpy as np
from collections import defaultdict
import xxhash

from pettingzoo.classic import tictactoe_v3

In [4]:
# Build the PettingZoo tic-tac-toe environment that the game loops below are run on.
env = tictactoe_v3.env()

In [5]:
env.reset()

# env.last() is unpacked as a 4-tuple here; only `observation` is inspected in this cell.
observation, reward, done, info = env.last()

# Displayed so the dict layout (board array under 'observation', plus 'action_mask') is visible.
observation

{'observation': array([[[0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0]]], dtype=int8),
 'action_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int8)}

In [29]:
import hashlib


def encode_state(board):
    """Return a hex digest identifying a board array by its raw bytes.

    NOTE(review): the cell that originally defined this helper is no longer in
    the notebook, so the call below raised NameError on a fresh kernel. The
    32-hex-digit value recorded as this cell's output is consistent with an
    MD5 digest, but the exact input encoding used originally is an assumption
    -- confirm against the recorded output.
    """
    return hashlib.md5(board.tobytes()).hexdigest()


encode_state(observation['observation'])

'65ea394aefe804468cc42b20ecc8b606'

In [275]:
class RandomAgent():
    """Baseline player that chooses uniformly among the currently legal moves.

    It offers the same methods as the learning agent so both can be driven by
    the same game loops; the learning hooks do nothing.
    """

    def __init__(self):
        """Nothing to set up: the agent keeps no state."""

    def select_policy(self, observation):
        """Return one index drawn at random from the nonzero entries of
        ``observation['action_mask']``."""
        legal_moves = np.flatnonzero(observation['action_mask'])
        return random.choice(legal_moves)

    def compute_score(self, observation, action):
        """Return 1/k, where k is the number of legal moves; ``action`` is ignored."""
        legal_moves = np.flatnonzero(observation['action_mask'])
        return 1 / legal_moves.size

    def reset_game(self):
        """No per-game memory to clear."""
        return None

    def propagate_reward(self, reward):
        """The random agent does not learn, so the reward is discarded."""
        return None

In [288]:
class MCTSAgent():
    """Table-based agent that scores (position, action) pairs with a UCB-style bonus.

    Each (observation, action) pair is hashed to a key in ``policy_tree``; the
    entry stores a running mean reward and two visit counters. Moves chosen
    during a game are remembered and credited with the final reward by
    ``propagate_reward``.
    """

    def __init__(self, cost_param):
        """Create an agent with an empty table.

        Args:
            cost_param: Multiplier applied to the exploration bonus in
                ``compute_score``.
        """
        self.c = cost_param
        self.default_node_dict = {'avg_reward': 0, 'parent_visits': 0, 'self_visits': 0}
        # Every unseen key gets its own copy so nodes never share counters.
        self.policy_tree = defaultdict(lambda: self.default_node_dict.copy())
        self.forward_observations = []
        self.forward_actions = []

    def reset_game(self):
        """Forget the moves recorded for the current game without learning from them."""
        self.forward_observations = []
        self.forward_actions = []

    def encode_state(self, observation, action):
        """Return the table key (hex digest) for taking ``action`` in ``observation``.

        The key hashes the raw bytes of the board array, the action mask and
        the action index. The action is first normalised to ``np.intp`` (the
        dtype of the indices produced by ``np.nonzero``), so a plain Python int
        and a NumPy index for the same move map to the same key.
        """
        action_bytes = np.asarray(action, dtype=np.intp).tobytes()
        obs_bytes = observation['observation'].tobytes() + observation['action_mask'].tobytes() + action_bytes
        return xxhash.xxh64(obs_bytes).hexdigest()

    def select_policy(self, observation, use_ucb = True):
        """Pick the highest-scoring legal action, record it, and return it.

        Args:
            observation: Dict with 'observation' and 'action_mask' arrays.
            use_ucb: Forwarded to ``compute_score``; when false the exploration
                bonus is omitted.

        Returns:
            The chosen action. Ties are broken uniformly at random. If the mask
            has no nonzero entry, the fallback action 0 is returned.
        """
        # -inf (rather than a fixed number) stays below every real score
        # regardless of how large cost_param makes the bonus.
        best_score = float('-inf')
        best_actions = [0]
        for action in np.nonzero(observation['action_mask'])[0]:
            score = self.compute_score(observation, action, use_ucb)
            if score > best_score:
                best_score = score
                best_actions = [action]
            elif score == best_score:
                best_actions.append(action)
        action = random.choice(best_actions)
        # Remember the decision so propagate_reward can credit it later.
        self.forward_observations.append(observation)
        self.forward_actions.append(action)
        return action

    def compute_score(self, observation, action, use_ucb = True):
        """Return the selection score of ``action`` in ``observation``.

        A pair that has never been chosen scores ``inf`` so that every option
        is tried at least once, whatever the value of ``cost_param``. Otherwise
        the score is the mean reward plus, when ``use_ucb`` is true,
        ``c * sqrt(ln(parent_visits) / self_visits)``.
        """
        state_hash = self.encode_state(observation, action)
        node_dict = self.policy_tree[state_hash]

        if node_dict['self_visits'] <= 0:
            return float('inf')

        if use_ucb:
            ucb = self.c * (np.log(node_dict['parent_visits']) / node_dict['self_visits']) ** .5
        else:
            ucb = 0
        return node_dict['avg_reward'] + ucb

    def propagate_reward(self, reward):
        """Credit every move recorded this game with ``reward`` and clear the record.

        For each recorded (observation, action): the chosen pair's running mean
        and ``self_visits`` are updated, and ``parent_visits`` is incremented
        for the chosen pair and for every other legal action of that
        observation.
        """
        for observation, action in zip(self.forward_observations, self.forward_actions):
            chosen = self.policy_tree[self.encode_state(observation, action)]
            visits = chosen['self_visits']
            chosen['avg_reward'] = (chosen['avg_reward'] * visits + reward) / (visits + 1)
            chosen['parent_visits'] += 1
            chosen['self_visits'] += 1
            for possible_action in np.nonzero(observation['action_mask'])[0]:
                if possible_action != action:
                    unchosen_hash = self.encode_state(observation, possible_action)
                    self.policy_tree[unchosen_hash]['parent_visits'] += 1

        self.forward_observations = []
        self.forward_actions = []
        
    

In [209]:
# Scratch agent used for the single-move smoke test in the next cell.
agent = MCTSAgent(cost_param = 1)

In [210]:
# Smoke test: pick one move for the observation captured above, then credit it with reward 1.
agent.select_policy(observation)
agent.propagate_reward(1)

In [211]:
# Two learners that differ only in cost_param, the multiplier on the exploration bonus.
agent1 = MCTSAgent(1)
agent2 = MCTSAgent(2)

In [295]:
# A second pair with a smaller exploration multiplier.
# NOTE(review): agent3/agent4 are not referenced by any other cell shown in this notebook.
agent3 = MCTSAgent(.5)
agent4 = MCTSAgent(.5)

In [285]:
def play_game(env, agents, render=False):
    """Play one full game and return each player's accumulated reward.

    Args:
        env: Environment driven through ``reset``, ``agent_iter``, ``last``,
            ``step`` and, when rendering, ``render``.
        agents: Mapping from the names yielded by ``env.agent_iter()`` to
            objects providing ``select_policy(observation)``.
        render: When true, draw the board and print a blank line after every step.

    Returns:
        A ``defaultdict(int)`` mapping agent name to the sum of rewards seen.
    """
    env.reset()
    totals = defaultdict(int)
    for name in env.agent_iter():
        observation, reward, done, _info = env.last()
        totals[name] += reward
        if done:
            # Agents flagged as done are stepped with None instead of a chosen move.
            move = None
        else:
            move = agents[name].select_policy(observation)
        env.step(move)
        if not render:
            continue
        env.render()  # this visualizes a single game
        print()
    return totals

In [286]:
def play_demonstration_game(env, agents):
    """Play one game while showing the board and the acting agent's move scores.

    On every turn the board is rendered before the move is applied, and the
    score the acting agent assigns to each legal action is printed after it.

    Args:
        env: Environment driven through ``reset``, ``agent_iter``, ``last``,
            ``render`` and ``step``.
        agents: Mapping from agent name to an object providing
            ``select_policy(observation)`` and ``compute_score(observation, action)``.

    Returns:
        A ``defaultdict(int)`` mapping agent name to the sum of rewards seen.
    """
    env.reset()
    totals = defaultdict(int)
    for name in env.agent_iter():
        observation, reward, done, _info = env.last()
        totals[name] += reward
        move = None if done else agents[name].select_policy(observation)
        env.render()  # this visualizes a single game
        # Scores are collected before stepping, i.e. for the position just rendered.
        scores = {
            candidate: agents[name].compute_score(observation, candidate)
            for candidate in np.nonzero(observation['action_mask'])[0]
        }
        env.step(move)
        print(scores)
    return totals

In [319]:
# Self-play pairing: both seats are MCTSAgent instances (agent1 and agent2).
agents = {'player_1':agent1, 'player_2':agent2}

In [320]:
%%time 

# Start from a clean slate: drop any moves recorded by earlier cells that were
# never credited, so the first propagate_reward only sees this run's games.
for agent in agents.values():
    agent.reset_game()

p1_rewards = {1: 0, 0: 0, -1:0}
for i in range(10001):
    # Report and restart the tally every 10000 games (this also fires before game 0).
    if i%10000 == 0:
        print(f'After {i} games: {p1_rewards}')
        p1_rewards = {1: 0, 0: 0, -1:0}
    
    rewards = play_game(env, agents, render = False)
    p1_rewards[rewards['player_1']] += 1    
    
    # Both players learn from their own final reward.
    for key, agent in agents.items():
        agent.propagate_reward(rewards[key])

After 0 games: {1: 0, 0: 0, -1: 0}
After 10000 games: {1: 11, 0: 9989, -1: 0}
Wall time: 8.53 s


In [321]:
# Clear per-game memory, play one annotated game, then let both agents learn from it.
for player in agents.values():
    player.reset_game()
rewards = play_demonstration_game(env, agents)
for seat in agents:
    agents[seat].propagate_reward(rewards[seat])

     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
{0: 0.06399164781115207, 1: 0.05686147380825404, 2: 0.06399092192277939, 3: 0.06406613739732926, 4: 0.23634821292035865, 5: 0.06407086576378757, 6: 0.06395604425044674, 7: 0.0641280745077211, 8: 0.059810272992660375}
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
{0: -0.29374155208673647, 1: -0.26694373804601124, 2: 0.0038782760508702125, 3: -0.2790117308205139, 5: -0.27222192943269496, 6: -0.288898151717828, 7: -0.26774533132200246, 8: -0.29198554260945875}
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  O  |  -  |  -  
     |     |     
{0: 0.036680331453687424, 1: 0.03665929404751418, 3: 0.03668096988668256, 5: 0.03665945386756432, 

In [274]:
for agent in agents.values():
    agent.reset_game()
# NOTE(review): this game's reward is not propagated, and the moves recorded by
# select_policy stay stored on the agents until their next reset_game() or
# propagate_reward() call.
rewards = play_demonstration_game(env, agents)

     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
{0: 0.06387277343600734, 1: 0.056725266960116635, 2: 0.06387208184673905, 3: 0.06393260703861535, 4: 0.11021616338320817, 5: 0.06393644605461649, 6: 0.0638442565712733, 7: 0.06398222535551815, 8: 0.059675509904091414}
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
{0: -0.29474901256293157, 1: -0.2684447460422753, 2: 0.0036520817173287586, 3: -0.28028903300307684, 5: -0.273608331956229, 6: -0.290000678304427, 7: -0.2692143993238724, 8: -0.2930225129025134}
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  O  |  -  |  -  
     |     |     
{0: 0.03657528215427307, 1: 0.03657555699744343, 3: 0.036575159384553704, 5: 0.03657606076592355, 6: 

In [264]:
# Clear any moves left over from a previous cell before training.
for agent in agents.values():
    agent.reset_game()

# Tally keyed by player_1's total reward for a game (1, 0 or -1).
p1_rewards = dict.fromkeys((1, 0, -1), 0)

for i in range(100000):
    rewards = play_game(env, agents, render=False)
    p1_rewards[rewards['player_1']] += 1
    # Each agent learns from its own final reward.
    for key in agents:
        agents[key].propagate_reward(rewards[key])

p1_rewards

{1: 211, 0: 99748, -1: 41}

In [276]:
# Uniform-random baseline player.
agentRand = RandomAgent()

In [324]:
# Rebinds `agents`: player_1 is now the random baseline, player_2 is agent2.
agents = {'player_1': agentRand, 'player_2': agent2}

In [339]:
for agent in agents.values():
    agent.reset_game()
    
p1_rewards = {1: 0, 0: 0, -1:0}

for i in range(10000):
    rewards = play_game(env, agents, render = False)
    p1_rewards[rewards['player_1']] += 1 
    if rewards['player_2'] != 0:
        agents['player_2'].propagate_reward(rewards['player_2'])
    else:
        # Zero-reward games are not learned from, but the moves recorded during
        # the game must still be discarded; otherwise they pile up and are
        # credited with the next non-zero reward.
        agents['player_2'].reset_game()

p1_rewards

{1: 221, 0: 2885, -1: 6894}

In [355]:
# parent_visits of every entry in agent1's table, largest first; the cell shows how many entries exist.
foo = sorted(
    (node['parent_visits'] for node in agent1.policy_tree.values()),
    reverse=True,
)
len(foo)

8646

In [337]:
for agent in agents.values():
    agent.reset_game()
# NOTE(review): this game's reward is not propagated, and the moves recorded by
# select_policy stay stored on the agents until their next reset_game() or
# propagate_reward() call.
rewards = play_demonstration_game(env, agents)

     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
{0: 0.1111111111111111, 1: 0.1111111111111111, 2: 0.1111111111111111, 3: 0.1111111111111111, 4: 0.1111111111111111, 5: 0.1111111111111111, 6: 0.1111111111111111, 7: 0.1111111111111111, 8: 0.1111111111111111}
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
{0: -0.10480987832004501, 1: -0.1076373028362797, 2: -0.10625116893442443, 4: -0.103614874622094, 5: -0.10493219569049378, 6: -0.10809714292139583, 7: -0.10448300329355154, 8: -0.10605935888468987}
     |     |     
  -  |  X  |  -  
_____|_____|_____
     |     |     
  -  |  O  |  -  
_____|_____|_____
     |     |     
  -  |  -  |  -  
     |     |     
{0: 0.14285714285714285, 1: 0.14285714285714285, 2: 0.14285714285714285, 5: 0.14285714285714285, 6: 0.142857

# Testing PyTorch with GPU

In [1]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, which one.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA supported by this system? {cuda_available}")
print(f"CUDA version: {torch.version.cuda}")

if cuda_available:
    # Storing ID of current CUDA device
    cuda_id = torch.cuda.current_device()
    print(f"ID of current CUDA device:{cuda_id}")
    print(f"Name of current CUDA device:{torch.cuda.get_device_name(cuda_id)}")
else:
    # The device queries raise when no CUDA device is usable, so skip them
    # instead of aborting the cell.
    cuda_id = None
    print("No CUDA device available; skipping device queries.")

Is CUDA supported by this system? True
CUDA version: 11.3
ID of current CUDA device:0
Name of current CUDA device:NVIDIA GeForce GTX 1050 Ti with Max-Q Design
