In [1]:
import numpy as np
import json
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

### Prepare Data

In [2]:
with open("value_function.json", 'r') as f:
    value_function = json.load(f)

In [3]:
vf = pd.Series(value_function, name='value')
vf.head(10)

S-------O    0.426296
-S------O    0.382835
--S-----O    0.413625
---S----O    0.440745
-------SO    0.471797
----S---O    0.500000
-----S--O    0.497570
------S-O    0.309953
S---O--SO    0.101673
-S--O--SO    0.000000
Name: value, dtype: float64

In [4]:
vf.index[0]

'S-------O'

In [5]:
def convert_key(key):
    return np.stack((
        (np.array(tuple(key)) == 'S').reshape(3, 3), 
        (np.array(tuple(key)) == 'O').reshape(3, 3)
    ), axis=0)

In [6]:
x = convert_key('S-------O')
x.shape

(2, 3, 3)

In [7]:
x

array([[[ True, False, False],
        [False, False, False],
        [False, False, False]],

       [[False, False, False],
        [False, False, False],
        [False, False,  True]]])

In [8]:
vf.index.map(convert_key)[0]

array([[[ True, False, False],
        [False, False, False],
        [False, False, False]],

       [[False, False, False],
        [False, False, False],
        [False, False,  True]]])

In [9]:
X = np.stack(vf.index.map(convert_key)).astype(np.float32)
X.shape

(5442, 2, 3, 3)

In [10]:
y = vf.values.astype(np.float32)
y.shape

(5442,)

In [11]:
X[0], y[0]

(array([[[1., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 1.]]], dtype=float32), 0.42629614)

## First try regular 'unshaped' data

In [12]:
Xd = X.reshape((-1,18))
Xd.shape

(5442, 18)

In [13]:
Xd[0:5], vf.index[0:5]

(array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1.]], dtype=float32),
 Index(['S-------O', '-S------O', '--S-----O', '---S----O', '-------SO'], dtype='object'))

In [14]:
class args:
    
    seed = 0
    no_cuda = False
    
torch.manual_seed(args.seed)
use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [112]:
class Policy(nn.Module):
    
    def __init__(self, dim=[18, 200, 1], act_funcs=[F.relu, F.relu]):
        
        super(Policy, self).__init__()
        
        self.dim = dim
        self.layers = []
        self.act_funcs = act_funcs
        for i in range(1, len(dim)):
            self.layers.append(nn.Linear(dim[i-1], dim[i]))

    def forward(self, x):
        
        for layer, act_func in zip(self.layers, self.act_funcs):
            x = self.act_func(layer(x))
        y_pred = x
        return y_pred
    
    #def __repr__(self):
    #    
    #    return "Policy(dim=%s)" % (self.dim.__repr__())


class SimplePolicy(nn.Module):
    
    def __init__(self):
        super(SimplePolicy, self).__init__()
        self.affine1 = nn.Linear(18, 1024)
        self.affine2 = nn.Linear(1024, 1)

    def forward(self, x):
        x = F.relu(self.affine1(x))
        y_pred = self.affine2(x)
        return y_pred


In [113]:
#model = Policy()
model = SimplePolicy()

In [114]:
model

SimplePolicy(
  (affine1): Linear(in_features=18, out_features=1024, bias=True)
  (affine2): Linear(in_features=1024, out_features=1, bias=True)
)

In [115]:
m = len(Xd)
split = int(0.8*m)
m, split

(5442, 4353)

In [116]:
Xd_train, Xd_test = Xd[0:split], Xd[split:]
y_train, y_test = y[0:split].reshape((-1, 1)), y[split:].reshape((-1, 1))

Xd_train.shape, y_train.shape

((4353, 18), (4353, 1))

In [117]:
def train(args, model, device, data, target, criterion, optimizer):
    model.train()
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    
    if i % 100 == 99:
        print('loss:', loss.item())

In [118]:
def test(args, model, device, data, target, criterion):
    model.eval()
    with torch.no_grad():
        data, target = data.to(device), target.to(device)
        output = model(data)
        test_loss = criterion(output, target)
    
    return test_loss

In [119]:
model = SimplePolicy()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
eps = np.finfo(np.float32).eps.item()

In [128]:
input_data = torch.from_numpy(Xd_train)
target = torch.from_numpy(y_train)
print("Train")
for i in range(500):
    train(args, model, device, input_data, target, criterion, optimizer)

Train
loss: 0.0045851063914597034
loss: 0.00250298622995615
loss: 0.0018828449537977576
loss: 0.0015443058218806982
loss: 0.001320302370004356


In [129]:
input_data = torch.from_numpy(Xd_test)
target = torch.from_numpy(y_test)
test_loss = test(args, model, device, input_data, target, criterion)
print('Test\nloss:', float(test_loss))

Test
loss: 0.06868447363376617


In [130]:
input_data = torch.from_numpy(Xd_train)
list(zip(model(input_data[:10]).detach().numpy().tolist(), target[:10].detach().numpy().tolist()))

[([0.4124775230884552], [0.6355000138282776]),
 ([0.3514310121536255], [0.0]),
 ([0.400850385427475], [1.0]),
 ([0.4477371275424957], [0.0]),
 ([0.4631620943546295], [0.0]),
 ([0.4977399408817291], [1.0]),
 ([0.49888715147972107], [1.0]),
 ([0.28567180037498474], [0.6355000138282776]),
 ([0.11942158639431], [0.2922925353050232]),
 ([-0.0009653307497501373], [0.5])]