In [40]:
import torch
from torch.nn import Sequential, Linear, ReLU, Softmax
import numpy as np
import json
import socket
from time import time
import os

In [61]:
"""
Simple Neural Net
Initially, input is features, output is action
"""
FEATURE_NUM = 14
ACTION_NUM = 10

class SimpleNet:
    def __init__(self):                 
        hidden_size = 64

        self.device = torch.device("cuda")

        self.neural_net = Sequential(
            # feature size by hidden size
            Linear(FEATURE_NUM, hidden_size),
            ReLU(),
            Linear(hidden_size, hidden_size),
            ReLU(),
            # hidden size by action size
            Linear(hidden_size, ACTION_NUM),
            Softmax(dim=-1)
        ).to(self.device)
        self.optimizer = torch.optim.Adam(self.neural_net.parameters(), lr=1e-3)
        self.gamma = 0.995
        self.num_backward = 0
        self.total_steps = 0

        # If weights already exist, load them
        if os.path.exists("pretrained_weights.pt"):
            self.neural_net.load_state_dict(torch.load("pretrained_weights.pt", map_location="cuda:0"))

        self.reset()

    def reset(self):
        self.ep_feats = []
        self.ep_rewards = []
        self.ep_actions = []

    def forward(self, input_feats):
        # Take as input a featurized state, output action probs
        probs = self.neural_net(input_feats.to(self.device))
        # Randomly select action according to probs
        return probs
    
    def get_action(self, input_feats):
        self.total_steps += 1
        probs = self.forward(input_feats)
        action = np.random.choice(probs.shape[0], p=probs.cpu().detach().numpy())
        # action = int(np.argmax(probs.cpu().detach().numpy())) # Deterministic
        self.ep_feats.append(input_feats)
        self.ep_actions.append(action)
        return action

    def backward(self):
        # Ignore if websocket returned for no reason
        self.num_backward += 1
        log_probs = []
        
        # Get log probs for features
        for feats in self.ep_feats[1:]:
            log_probs.append(torch.log(self.forward(feats)))

        # convert log probs and actions to tensors
        log_probs = torch.stack(log_probs).to(self.device)
        actions = torch.IntTensor(self.ep_actions).to(self.device)

        # Compute discounted returns
        disc_rets = torch.zeros(actions.shape[0])
        for i in range(1, len(self.ep_rewards)):
            disc_rets[i-1] = self.discounted_return(self.ep_rewards[i:])
        disc_rets = disc_rets.to(self.device)

        # Compute loss
        # Page 328: http://www.incompleteideas.net/book/RLbook2020.pdf
        sel_log_probs = disc_rets * log_probs[np.arange(len(self.ep_actions)), self.ep_actions]
        loss = -sel_log_probs.mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.reset()

        # Every 10 backward passes, save the model
        if self.num_backward % 5 == 0:
            print(f"Saving model. Num total steps={self.total_steps}. Num backward passes={self.num_backward}")
            start = time()
            torch.save(self.neural_net.state_dict(), "pretrained_weights.pt")
        return 0

    def discounted_return(self, rewards):
        return np.dot(self.gamma ** np.arange(len(rewards)), rewards)

In [62]:
# Handle json

def convert_to_feats(response_dict):
    """Takes a response dictionary and returns tensor of features"""
    return torch.Tensor(
        (
            response_dict['feat1'],
            response_dict['feat2'],
            response_dict['feat3'],
            response_dict['feat4'],
            response_dict['feat5'],
            response_dict['feat6'],
            response_dict['feat7'],
            response_dict['feat8'],
            response_dict['feat9'],
            response_dict['feat10'],
            response_dict['feat11'],
            response_dict['feat12'],
            response_dict['feat13'],
            response_dict['feat14'],
        )
    )

def handle_response(response, net: SimpleNet):
    """Take as input a websocket response from C#, return serialized data to send back"""
    # Initialize response
    instruction = ''
    action = 0
    if response is None:
        print("Response none")
        return {}
    response_dict = json.loads(response)

    # Store the reward for the previous action. 
    # Don't worry about the first one in the array, it will be ignored
    net.ep_rewards.append(response_dict['reward'])

    # Handle forward pass
    if response_dict['instruction'] == 'forward':
        features = convert_to_feats(response_dict)
        action = net.get_action(features)
        instruction = 'step'
    elif response_dict['instruction'] == 'update_weights':
        instruction = 'reset'
        last_feats = convert_to_feats(response_dict)
        net.ep_feats.append(last_feats)
        start = time()
        net.backward()
        print(f"Took {time() - start:.4f} seconds to compute backward pass")
        

    json_return_dict = {
        'instruction': instruction,
        'action': action,
    }
    
    return json_return_dict

In [60]:
host, port = "127.0.0.1", 25001

# SOCK_STREAM means TCP socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

neural_net = SimpleNet()
request = {
    'instruction': 'init',
    'action': 0,
}

def on_receive(message):
    print(message)

try:
    # Connect to the server and send the data
    sock.connect((host, port))

    start = 0

    while(True):
        #print(request)
        request = json.dumps(request)
        sock.sendall(request.encode("utf-8"))
        response = sock.recv(1024).decode("utf-8")
        #print(f"Took {time() - start:.4f} between, that's {1/(time() - start):.2f} Hz")
        start = time()
        request = handle_response(response, neural_net)
finally:
    sock.close()

{"instruction":"forward","reward":0,"feat1":0.0199999996,"feat2":1.27781999,"feat3":-0.49000001,"feat4":0,"feat5":0,"feat6":0,"feat7":0.0260000005,"feat8":0.569999993,"feat9":-0.00400000019,"feat10":0.704848647,"feat11":0,"feat12":0,"feat13":0,"feat14":0}
{"instruction":"forward","reward":-2,"feat1":0.0199999996,"feat2":1.27781999,"feat3":-0.49000001,"feat4":0,"feat5":0,"feat6":0,"feat7":0.0260000005,"feat8":0.569999993,"feat9":-0.00400000019,"feat10":0.704848647,"feat11":0,"feat12":0,"feat13":0,"feat14":0}
{"instruction":"forward","reward":-2,"feat1":0.0199999996,"feat2":1.27781999,"feat3":-0.49000001,"feat4":0,"feat5":0,"feat6":0,"feat7":0.0260000005,"feat8":0.569999993,"feat9":-0.00400000019,"feat10":0.704848647,"feat11":0,"feat12":0,"feat13":0,"feat14":0}
{"instruction":"forward","reward":-2,"feat1":0.0199999996,"feat2":1.27781999,"feat3":-0.49000001,"feat4":0,"feat5":0,"feat6":0,"feat7":0.0260000005,"feat8":0.569999993,"feat9":-0.00400000019,"feat10":0.704848647,"feat11":0,"feat12

JSONDecodeError: Expecting value: line 1 column 1 (char 0)