In [None]:
import os
import os.path
import sys
import math

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import keras
from keras import layers

In [None]:
sys.path.append(os.path.abspath('../dependencies/BristolStockExchange'))
sys.path.append(os.path.abspath('../src'))

# System setup

Model characteristics:
* Continuous inputs, discrete actions
* Discrete events in continuous time

### Flappy Bird

* State: (height, lower border of the hole, upper border of the hole, time to hole)
* Actions: {fly up, keep falling}
* Reward: time survived

In [None]:
from reinforcement_example.game import Game

In [None]:
# Interactive session: show the state, read an integer action from stdin,
# advance the game by one step, and stop as soon as the game reports death.
game = Game()
still_playing = True
while still_playing:
    print(game.get_state())
    action = int(input())
    game.time_step(action)
    still_playing = game.alive
print("DEAD!")

In [None]:
from reinforcement_example.learner import make_model

In [None]:
from reinforcement_example.learner import AbstractLearner

In [None]:
class ScaledModel(object):
    """Wrap a model so that inputs and targets are rescaled around fit/predict.

    Inputs are divided feature-wise by ``xscale`` before reaching the wrapped
    model. Targets are divided by ``yscale`` before fitting, and predictions
    are multiplied by ``yscale`` on the way out, so callers always work in the
    original units.

    Parameters
    ----------
    model : object exposing ``fit(X, Y)`` and ``predict(X)``
    xscale : array-like or None
        Per-feature divisor for the inputs (ndarray, list, tuple or scalar).
        ``None`` disables input scaling.
    yscale : scalar or None
        Divisor for the targets. ``None`` disables target scaling.
    """

    def __init__(self, model, xscale=None, yscale=None):
        self.model = model
        self.xscale = xscale
        self.yscale = yscale

    def _scale_inputs(self, X):
        """Return X divided by ``xscale`` along the feature axis (X itself if unset)."""
        if self.xscale is None:
            return X
        # atleast_1d lets lists, tuples and scalars be indexed like the ndarray case;
        # an ndarray passes through unchanged.
        divisor = np.atleast_1d(self.xscale)
        return X / divisor[np.newaxis, :]

    def fit(self, X, Y):
        """Fit the wrapped model on rescaled inputs and rescaled targets."""
        rX = self._scale_inputs(X)
        rY = Y / self.yscale if self.yscale is not None else Y
        self.model.fit(rX, rY)

    def predict(self, X):
        """Predict on rescaled inputs and return the result in original target units."""
        res = self.model.predict(self._scale_inputs(X))
        return res * self.yscale if self.yscale is not None else res

In [None]:
def logodds_to_probs(odds):
    """Turn a vector of log-odds into probabilities that sum to one.

    The maximum is subtracted before exponentiating so that large inputs do
    not overflow; the normalisation runs over all elements of the input.
    """
    logits = np.asarray(odds)
    weights = np.exp(logits - np.max(odds))
    return weights / weights.sum()

In [None]:
class ValueToPolicy(object):
    """Expose a list of per-action value models as a single policy-like predictor.

    ``predict`` returns, for every state in the batch, the value predicted by
    each action's model divided by ``scale``.
    """

    def __init__(self, value_model, scale=1.0):
        self.value_model = value_model
        self.num_actions = len(value_model)
        self.scale = scale

    def predict(self, states):
        """Return an array with axes (batch, action) of scaled value predictions."""
        # Column 0 of each model's output is taken; stacking gives (action, batch).
        per_action = [estimator.predict(states)[:, 0] for estimator in self.value_model]
        scaled = np.array(per_action) / self.scale
        # Callers expect (batch, action), hence the transpose.
        return scaled.T

    def set_scale(self, scale):
        """Replace the divisor applied to the predicted values."""
        self.scale = scale

In [None]:
# class SoftPolicy(object):
#     def __init__(self, base, prob_random):
#         self.base = base
#         self.p = prob_random
        
#     def predict(self, states):
#         res = self.base.predict(states)
#         num_actions = res.shape[1]
#         uniform = np.ones(num_actions) / num_actions
#         res = self.p * uniform + (1 - self.p) * res
#         return res

In [None]:
def random_round(a):
    """Stochastically round ``a`` to integers, preserving the expected value.

    Each element is rounded down, then incremented by one with probability
    equal to its fractional part.

    Parameters
    ----------
    a : ndarray, sequence of numbers, or scalar

    Returns
    -------
    Integer array of the same shape as ``a``. A plain Python scalar yields an
    array of shape ``(1,)``.
    """
    if hasattr(a, 'shape'):
        shape = a.shape
    else:
        # Lists/tuples need one independent draw per element; a single shared
        # draw would round every element in the same direction. Plain scalars
        # keep the shape-(1,) result.
        shape = np.shape(a) or (1,)
    flr = np.floor(a).astype(np.int64, copy=False)
    return flr + (np.random.rand(*shape) < a - flr)

In [None]:
def weights_to_inds(weights, amplification=10):
    """Convert per-sample weights into a list of repeated sample indices.

    Each weight is multiplied by ``amplification`` and stochastically rounded
    to an integer copy count; index ``i`` then appears that many times in the
    result. Indexing a dataset with the returned array resamples it roughly
    in proportion to the weights.

    Parameters
    ----------
    weights : 1-D ndarray of non-negative weights
    amplification : number
        Multiplier applied to the weights before rounding to copy counts.

    Returns
    -------
    1-D int64 array of indices into ``weights``, in ascending order.

    Raises
    ------
    ValueError
        If a copy count is negative (raised by ``np.repeat``).
    """
    copy_counts = random_round(weights * amplification)
    # np.repeat emits index i exactly copy_counts[i] times.
    return np.repeat(np.arange(len(copy_counts), dtype=np.int64), copy_counts)

In [None]:
class MCLearner(AbstractLearner):
    """Monte Carlo learner with one value model per action.

    Actions are sampled from a softmax over the per-action value predictions.
    Every step of every episode is kept in ``histories``; after each finished
    episode all value models are refit on the observed cumulative rewards,
    resampled according to importance weights against the current policy.

    Parameters
    ----------
    state_dim : int
        Passed to ``make_model`` as the first argument for each value model.
    num_actions : int
        Number of discrete actions (one value model each).
    reward_scale : float
        Used as ``yscale`` of each value model and as the initial policy scale.
    """

    def __init__(self, state_dim, num_actions, reward_scale=1.0):
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.reward_scale = reward_scale
        self.value_model = [ScaledModel(make_model(state_dim, 1), yscale=reward_scale) for i in range(num_actions)]
        self.policy = ValueToPolicy(self.value_model, scale=self.reward_scale)
        self.histories = []
        self.history = []
        # Probability with which the most recent action was chosen; set by move().
        self.last_action_prob = None

    def _action_probs(self, state):
        """Return the current policy's action probabilities for a single state."""
        return logodds_to_probs(self.policy.predict(state[np.newaxis, :])[0])

    def _record_step(self, state, action, reward):
        """Append one (state, action, reward, behaviour probability) tuple to the running episode."""
        self.history.append((np.array(state), action, reward, self.last_action_prob))

    def move(self, state):
        """Sample an action for ``state`` and remember the probability it was chosen with."""
        probs = self._action_probs(state)
        choice = np.searchsorted(np.cumsum(probs), np.random.rand())
        # Rounding can leave the last cumulative sum slightly below 1.0, in which
        # case searchsorted returns len(probs); clamp to the last valid action.
        choice = min(choice, len(probs) - 1)
        self.last_action_prob = probs[choice]
        return choice

    def learn(self, state, action, reward, next_state, value_proxy):
        """Record a non-terminal step; ``next_state`` and ``value_proxy`` are not used."""
        self._record_step(state, action, reward)

    def learn_last(self, state, action, reward):
        """Record the final step, close the episode and refit the value models."""
        self._record_step(state, action, reward)
        self.histories.append(self.history)
        self.history = []
        self._update_value_model()

    def _update_value_model(self):
        """Refit every per-action value model on all stored episodes."""
        trainX = [[] for _ in range(self.num_actions)]
        trainY = [[] for _ in range(self.num_actions)]
        trainW = [[] for _ in range(self.num_actions)]
        for history in self.histories:
            cumreward = 0.0
            cumprob = 1.0
            # Walk each episode backwards so cumreward is the reward collected
            # from the current step to the end of the episode.
            for state, action, reward, action_prob in history[::-1]:
                cumreward += reward
                probs = self._action_probs(state)
                trainX[action].append(state)
                trainY[action].append(cumreward)
                # The weight of a step is the product, over all LATER steps, of
                # (current policy probability / probability when it was taken).
                trainW[action].append(cumprob)
                cumprob *= probs[action] / action_prob
        for action in range(self.num_actions):
            if not trainX[action]:
                # This action has never been taken: nothing to fit on.
                continue
            X = np.array(trainX[action])
            Y = np.array(trainY[action])
            W = np.array(trainW[action])
            # Weighted fitting is emulated by resampling rows in proportion to W.
            inds = weights_to_inds(W)
            if inds.size == 0:
                # All weights rounded down to zero copies; skip instead of fitting on empty data.
                continue
            self.value_model[action].fit(X[inds], Y[inds])
        # The policy scale shrinks as more episodes accumulate.
        self.policy.set_scale(self.reward_scale / len(self.histories))

In [None]:
from reinforcement_example.learner import train_play

In [None]:
# Only mean_time_to_hole is read from this Game instance; it becomes the learner's reward scale.
# MCLearner(4, 2, ...): state_dim=4 and num_actions=2, matching the Flappy Bird state/action lists above.
game = Game()
learner = MCLearner(4, 2, reward_scale=game.mean_time_to_hole)

In [None]:
# Single trial run on a fresh Game; displays element [1] of train_play's result
# (presumably the episode score -- confirm in reinforcement_example.learner).
train_play(Game(), learner)[1]

In [None]:
# Collects element [1] of each train_play result appended by the training cells below.
history = []

In [None]:
# Main training run: 10000 episodes, each on a fresh Game, with the same learner throughout.
# Re-running this cell appends to the existing history rather than starting over.
for i in range(10000):
    history.append(train_play(Game(), learner)[1])
# Running total of the recorded per-episode values.
plt.plot(np.cumsum(history))

In [None]:
# Short top-up run: 30 more episodes appended to the same history.
for i in range(30):
    history.append(train_play(Game(), learner)[1])
# Running total of the recorded per-episode values.
plt.plot(np.cumsum(history))

In [None]:
# Average of all per-episode values recorded so far.
np.mean(history)

In [None]:
import pickle

# learner.value_model holds ScaledModel wrappers, which expose only fit/predict;
# the config and weights live on the wrapped model, so unwrap it first.
# The payload is built before the file is opened so that a failure here cannot
# leave an empty or truncated pickle behind.
learner_snapshot = {
    'histories': learner.histories,
    'value_model': [
        {
            'config': getattr(wrapped, 'model', wrapped).get_config(),
            'weights': getattr(wrapped, 'model', wrapped).get_weights()
        }
        for wrapped in learner.value_model
    ]
}
with open('002_reinforcement_example_v2_learner.pickle', 'wb') as f:
    pickle.dump(learner_snapshot, f)

In [None]:
# Collect the state of 100 freshly constructed games and plot each action's
# predicted value against state component 3 (time to hole in the state list above).
foo = np.zeros((100, 4))
for row in foo:
    # row is a view into foo, so this assignment fills foo in place.
    row[:4] = Game().get_state()
fooy0, fooy1 = [learner.value_model[a].predict(foo).reshape(-1) for a in (0, 1)]
for preds in (fooy0, fooy1):
    plt.plot(foo[:, 3], preds, '.')
plt.grid()

In [None]:
# Diagnostics: rebuild the per-action training data the same way
# MCLearner._update_value_model does, then plot it.
# trainX / trainY / trainW stay at notebook level so later cells can inspect them.
trainX = [[] for _ in range(learner.num_actions)]
trainY = [[] for _ in range(learner.num_actions)]
trainW = [[] for _ in range(learner.num_actions)]
# The loop variable is called `episode` so the notebook-level `history` list of
# training results is not overwritten by this cell.
for episode in learner.histories:
    cumreward = 0.0
    cumprob = 1.0
    # Walk backwards so cumreward is the reward from this step to the episode end.
    for state, action, reward, action_prob in episode[::-1]:
        cumreward += reward
        probs = logodds_to_probs(learner.policy.predict(state[np.newaxis, :])[0])
        trainX[action].append(state)
        trainY[action].append(cumreward)
        trainW[action].append(cumprob)
        cumprob *= probs[action] / action_prob

# Figure 1: observed cumulative reward against state component 3, one series per action.
for action in range(learner.num_actions):
    if not trainX[action]:
        # No samples for this action; X[:, 3] would fail on an empty array.
        continue
    X = np.array(trainX[action])
    Y = np.array(trainY[action])
    plt.plot(X[:, 3], Y, '.')

# One further figure per action: model prediction against observed cumulative reward.
for action in range(learner.num_actions):
    if not trainX[action]:
        continue
    X = np.array(trainX[action])
    Y = np.array(trainY[action])
    plt.figure()
    plt.plot(learner.value_model[action].predict(X), Y, '.')
    plt.grid(True)

In [None]:
# Current divisor ValueToPolicy.predict applies to the value predictions (updated via set_scale).
learner.policy.scale

In [None]:
# Mean importance weight per action; uses trainW left behind by the diagnostics cell above.
[np.mean(W) for W in trainW]

#### Test the ability of the neural network to learn...

In [None]:
# Fresh model for the sanity check below (presumably 6 inputs and 1 output -- confirm make_model's signature).
model = make_model(6, 1)

In [None]:
# 1000 samples of 6 features, drawn uniformly from [0, 3).
data = np.random.rand(1000, 6) * 3

In [None]:
# Train one sample at a time: the input is the full row as a batch of size 1,
# the target is the row's last feature (row[[-1]] keeps it as a length-1 array).
for row in data:
    model.train_on_batch(row[np.newaxis, :], row[[-1]])

In [None]:
# Fresh uniform samples in [0, 3): plot the model's prediction against the
# last input feature, which is what the model was trained to reproduce.
foo = np.random.rand(100, 6) * 3
fooy = model.predict(foo)
plt.plot(foo[:, -1], fooy.ravel(), '.')
plt.grid(True)