In [132]:
# Imports
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from copy import deepcopy
import scipy.spatial.distance
import gensim.models.keyedvectors as word2vec
from game import Game

In [133]:
#master_vectors = Game.load_glove_vecs("players/glove.6B/glove.6B.300d.txt")
#guesser_vectors = Game.load_w2v("players/GoogleNews-vectors-negative300.bin")

In [134]:
# Discount factor
GAMMA = 0.99

# Batch size
BATCH_SIZE = 256
# Capacity of the replay buffer
BUFFER_CAPACITY = 1000 # 10000
# Update target net every ... episodes
UPDATE_TARGET_EVERY = 20 # 20

# Initial value of epsilon
EPSILON_START = 1.0
# Parameter to decrease epsilon
DECREASE_EPSILON = 10000
# Minimum value of epislon
EPSILON_MIN = 0.05

# Number of training episodes
N_EPISODES = 20000
N_TRAINING = 500

# Learning rate
LEARNING_RATE = 0.5

In [135]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, state, action, reward, next_state):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = (state, action, reward, next_state)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.choices(self.memory, k=batch_size)

    def __len__(self):
        return len(self.memory)

# create instance of replay buffer
replay_buffer = ReplayBuffer(BUFFER_CAPACITY)

In [136]:
class Net(nn.Module):
    """
    Basic neural net.
    """
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)

In [137]:
total_wordlist = []
with open('players/cm_wordlist.txt') as infile:
    for line in infile:
        total_wordlist.append(line.rstrip())

total_wordlist = np.array(total_wordlist)
#wordlist = random.choices(total_wordlist, k=500)
mask = np.zeros(len(total_wordlist), dtype=bool)
choice = np.random.choice(len(total_wordlist), 500, replace=False)
mask[choice] = 1
clues = np.extract(mask, total_wordlist)

total_board = []
with open('game_wordpool.txt') as infile:
    for line in infile:
        total_board.append(line.rstrip().lower())

total_board = np.array(total_board)
print(len(total_board))
print(len(clues))

395
500


In [138]:
# create network and target network
hidden_size = 128
master_size = len(master_vectors["word"])
guesser_size = len(guesser_vectors["word"])
n_actions = len(clues)

q_net_codemaster = Net(master_size, hidden_size, n_actions)
if torch.cuda.is_available(): 
    q_net_codemaster.cuda()
target_net_codemaster = Net(master_size, hidden_size, n_actions)
if torch.cuda.is_available(): 
    target_net_codemaster.cuda()

# objective and optimizer
objective = nn.MSELoss()
optimizer = optim.SGD(params=q_net_codemaster.parameters(), lr=LEARNING_RATE)

""" q_net_guesser = Net(guesser_size, hidden_size, n_actions)
if torch.cuda.is_available(): 
    q_net_guesser.cuda()

# objective and optimizer
objective_guesser = nn.MSELoss()
optimizer_guesser = optim.Adam(params=q_net_guesser.parameters(), lr=LEARNING_RATE) """

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

False


In [139]:
def get_q(states):
    """
    Compute Q function for a list of states
    """
    with torch.no_grad():
        states_v = torch.FloatTensor([states])
        states_v = states_v.to(device)
        output = q_net_codemaster.forward(states_v).detach().cpu().numpy()  # shape (1, len(states), n_actions)
    return output[0, :, :]  # shape (len(states), n_actions)

In [140]:
def choose_clue(state, epsilon):
    """
    Return action according to an epsilon-greedy exploration policy
    """
    if np.random.uniform()<epsilon:
        #print("EXPLORATION")
        index = np.random.randint(0, len(clues))
        return index
    else:
        q=get_q([state])[0]
        return q.argmax()

In [141]:
def choose_word(board, word_vectors, clue):
    w2v = []

    for word in board:
        try:
            w2v.append((scipy.spatial.distance.cosine(word_vectors[clue], word_vectors[word.lower()]), word))
        except KeyError:
            print(">>> error")
            continue

    w2v = list(sorted(w2v))
    return w2v[1]

In [142]:
def eval_dqn(n_sim=5, test=False, eval_board=None, training=False):
    """
    Monte Carlo evaluation of DQN agent.

    Repeat n_sim times:
        * Run the DQN policy until the environment reaches a terminal state (= one episode)
        * Compute the sum of rewards in this episode
        * Store the sum of rewards in the episode_rewards array.
    """
    episode_rewards = np.zeros(n_sim)

    if eval_board == None :
        eval_board = random.choices(total_board, k=10)

    if training :
        for sim in range(n_sim):
            word_to_guess = eval_board[np.random.randint(0, len(eval_board))]
            #print(">>> To guess : ", word_to_guess)
            state=master_vectors[word_to_guess]
            action=choose_clue(state, 0.0)
            #print(">>> Clue : ", clues[action])
            _, chosen_word = choose_word(eval_board, guesser_vectors, clues[action])
            #print(">>> Chosen : ", chosen_word)
            next_state = None
            reward = 1 if chosen_word == word_to_guess else 0
            episode_rewards[sim]+=reward
            if test :
                print(">>> To guess : ", word_to_guess)
                print(">>> Clue : ", clues[action])
                print(">>> Chosen : ", chosen_word)
            loss = update(state, action, reward, next_state, True)
            print(">>> Loss : ", loss)
            state=next_state
    
    else :
        for sim in range(n_sim):
            word_to_guess = eval_board[np.random.randint(0, len(eval_board))]
            #print(">>> To guess : ", word_to_guess)
            state=master_vectors[word_to_guess]
            action=choose_clue(state, 0.0)
            #print(">>> Clue : ", clues[action])
            _, chosen_word = choose_word(eval_board, guesser_vectors, clues[action])
            #print(">>> Chosen : ", chosen_word)
            next_state = None
            reward = 1 if chosen_word == word_to_guess else 0
            episode_rewards[sim]+=reward
            state=next_state
            if test :
                print(">>> To guess : ", word_to_guess)
                print(">>> Clue : ", clues[action])
                print(">>> Chosen : ", chosen_word)
            
    return episode_rewards

In [143]:
def update(state, action, reward, next_state, done):
    """
    ** TO BE COMPLETED **
    """

    # add data to replay buffer
    if done:
        next_state = None
    replay_buffer.push(state, action, reward, next_state)
    
    if len(replay_buffer) < BATCH_SIZE:
        return np.inf
    
    # get batch
    transitions = replay_buffer.sample(BATCH_SIZE)
    
    states=[transitions[ii][0] for ii in range(BATCH_SIZE)]
    #print(f'STATE : {states}')
    actions=[transitions[ii][1] for ii in range(BATCH_SIZE)]
    #print(f'ACTIONS : {actions}')
    rewards=[transitions[ii][2] for ii in range(BATCH_SIZE)]
    #print(f'REWARDS : {rewards}')
    next_states=[transitions[ii][3] for ii in range(BATCH_SIZE) if transitions[ii][3] is not None]
    #print(f'NEXT_STATES : {next_states}')
    mask=[transitions[ii][3] is not None for ii in range(BATCH_SIZE)]
    #print(f'MASK : {mask}')
    
    #convert to tensor
    states_torch=torch.FloatTensor(states)
    states_torch=states_torch.to(device)
    actions_torch=torch.LongTensor(actions).view(-1,1)
    actions_torch=actions_torch.to(device)
    rewards_torch=torch.FloatTensor(rewards).view(-1,1)
    rewards_torch=rewards_torch.to(device)
    next_states_torch=torch.FloatTensor(next_states)
    next_states_torch=next_states_torch.to(device)
    mask_torch=torch.BoolTensor(mask)
    mask_torch=mask_torch.to(device)
    
    #Q(s_i, a_i)
    values=q_net_codemaster(states_torch)
    values=torch.gather(values, dim=1, index=actions_torch)
    
    # max_a Q(s_{i+1}, a)
    values_next_states=torch.zeros(BATCH_SIZE)
    values_next_states=values_next_states.to(device)
    values_next_states[mask]=0
    values_next_states=values_next_states.view(-1,1)
    
    #targets y_i
    targets=rewards_torch+GAMMA*values_next_states
    
    #print(f'>>> TARGETS : {targets}')
    #print(f'>>> VALUES : {values}')
    
    loss = objective(values, targets)
     
    # Optimize the model - UNCOMMENT!
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


    loss=loss.cpu()
    return loss.detach().numpy()

In [144]:
EVAL_EVERY = 50
REWARD_THRESHOLD = 10

def train():
    epsilon = EPSILON_START
    ep = 0
    total_time = 0
    time_start = 0
    losses = []
    while ep < N_EPISODES:
        rewards = [0]
        board = random.choices(total_board, k=10)
        print("Time = ", ep - time_start)
        time_start = ep
        print("on recommence")
        while np.mean(rewards) < 1 :
            word_to_guess = board[np.random.randint(0, len(board))]
            state = master_vectors[word_to_guess]
            action = choose_clue(state, epsilon)
            chosen_distance, chosen_word = choose_word(board, guesser_vectors, clues[action])
            #print(f'Minimal distance = {chosen_distance}')

            # take action and update replay buffer and networks
            next_state = None
            reward = 1 if chosen_word == word_to_guess else 0
            done = True
            loss = update(state, action, reward, next_state, done)
            if loss < np.inf :
                losses.append(loss)

            # update state
            #state = next_state

            # end episode if done
            if done:
                ep += 1
                if ( (ep+1)% EVAL_EVERY == 0):
                    rewards = eval_dqn(eval_board=board)
                    print("episode =", ep+1, ", rewards = ", rewards)
                    if np.mean(rewards) >= REWARD_THRESHOLD:
                        break

                # update target network
                if ep % UPDATE_TARGET_EVERY == 0:
                    target_net_codemaster.load_state_dict(q_net_codemaster.state_dict())
                # decrease epsilon
                epsilon = EPSILON_MIN + (EPSILON_START - EPSILON_MIN) * \
                                np.exp(-1. * ep / DECREASE_EPSILON )    

        total_time += 1

# Run the training loop
train()

# Evaluate the final policy
rewards = eval_dqn(20)
print("")
print("mean reward after training = ", np.mean(rewards))
rewards = eval_dqn(20, training=True)
print("")
print("mean reward with training = ", np.mean(rewards))

Time =  0
on recommence
episode = 50 , rewards =  [0. 0. 1. 0. 1.]
episode = 100 , rewards =  [0. 0. 0. 0. 0.]
episode = 150 , rewards =  [0. 1. 0. 0. 0.]
episode = 200 , rewards =  [0. 1. 0. 0. 0.]
episode = 250 , rewards =  [0. 0. 1. 0. 1.]
episode = 300 , rewards =  [1. 0. 1. 1. 1.]
episode = 350 , rewards =  [1. 1. 1. 1. 1.]
Time =  349
on recommence
episode = 400 , rewards =  [0. 1. 1. 0. 0.]
episode = 450 , rewards =  [0. 1. 0. 0. 0.]
episode = 500 , rewards =  [1. 0. 0. 0. 0.]
episode = 550 , rewards =  [0. 1. 0. 0. 0.]
episode = 600 , rewards =  [1. 0. 1. 1. 1.]
episode = 650 , rewards =  [0. 1. 1. 0. 0.]
episode = 700 , rewards =  [0. 1. 1. 1. 1.]
episode = 750 , rewards =  [0. 1. 1. 1. 1.]
episode = 800 , rewards =  [1. 1. 1. 1. 1.]
Time =  450
on recommence
episode = 850 , rewards =  [0. 1. 0. 1. 1.]
episode = 900 , rewards =  [1. 0. 1. 0. 1.]
episode = 950 , rewards =  [1. 1. 0. 1. 1.]
episode = 1000 , rewards =  [1. 1. 1. 1. 1.]
Time =  200
on recommence
episode = 1050 , r