In [44]:
from environments import MountainCar
from features import LinearFeatureMap
import numpy as np
from tqdm.notebook import tqdm
from sklearn.linear_model import LogisticRegression

In [2]:
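# Optional shortcut (disabled): load transitions from 'mc_data.npy' instead of
# simulating them in the next cell -- presumably a cached copy of `tuples`.
# NOTE(review): allow_pickle=True can execute arbitrary code; only use on trusted files.
#tuples = np.load('mc_data.npy',allow_pickle = True)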

In [58]:
# --- Simulate random-policy transitions -------------------------------------
# Every trial is stepped H times with uniformly random actions; step h's
# transition batch is stored as tuples[h] = [s, a + 1, cost, s_next, h].
H = 150
num_trials = 10000
SEED = 0

# Seed NumPy's global generator so the sampled start states and actions (and
# any later np.random draws in this kernel) are reproducible on re-run.
# NOTE(review): if MountainCar has its own randomness it must be seeded separately.
np.random.seed(SEED)

env = MountainCar(H)

# State batch of shape (2, num_trials): row 0 is drawn uniformly from
# [-1.2, 0.1), row 1 starts at zero (presumably position / velocity).
s = np.zeros((2,num_trials))
s[0,:] = np.random.uniform(low = -1.2,high = 0.1,size = num_trials)
env.reset()
tuples = []

for h in tqdm(range(H)):
    # Environment actions are sampled from {-1, 0, 1}; they are stored shifted
    # by +1 so that they can be used directly as array indices {0, 1, 2}.
    a = np.random.choice([-1,0,1],size=num_trials)
    cost, s_ = env.step_broadcast(s, a, num_trials)
    tuples.append([s,a+1,cost,s_,h])
    # The next state becomes the current state of the following step.
    s = s_


  0%|          | 0/150 [00:00<?, ?it/s]

In [59]:
class FittedQIteration(object):
    """Finite-horizon fitted Q-iteration with a logistic Q-model.

    For every step ``h`` and action ``a`` the model is
    ``Q_h(s, a) = sigmoid(<phi(s), theta[h, a]> + bias[h, a])``.  One call to
    :meth:`update_Q` draws Bernoulli regression targets from the lagged
    parameters (``theta_`` / ``bias_``) and then refits every ``(h, a)`` pair
    with ``LogisticRegression``.

    Parameters
    ----------
    phi : object
        Feature map providing ``polynomial_basis(state)`` and/or
        ``fourier_basis(state)``.
    features : str
        ``'poly'`` or ``'fourier'``; selects which basis of ``phi`` is used.
    data : sequence
        ``data[h]`` is read as ``[s, a, cost, s_next, ...]``.  ``s`` and
        ``s_next`` are indexed ``[:, i]`` per sample, ``a`` holds action
        indices in ``range(num_actions)`` and ``cost`` is indexed per sample.
    horizon : int
        Number of steps ``H``; ``data`` must provide ``data[0..H-1]``.
    num_trials : int
        Number of samples per step (size of the sample axis of ``A``/``tar``).
    k : int
        Number of fitted-Q iterations performed by :meth:`run`.
    num_actions : int, optional
        Number of discrete actions (default 3, the previously hard-coded value).

    Attributes
    ----------
    theta, bias : ndarray
        Current coefficients ``(H, num_actions, d)`` and intercepts
        ``(H, num_actions)``.
    theta_, bias_ : ndarray
        Independent copies of the parameters from the previous iteration,
        used to build the regression targets.
    A : ndarray
        ``(H, num_actions, n, d)``; ``A[h, a[i], i]`` holds ``phi(s_i)`` and
        every other entry stays zero.
    tar : ndarray
        ``(H, num_actions, n)`` regression targets (set by :meth:`get_targets`).
    """

    # Logit used when all labels of a fit are identical.  The unpenalised
    # maximum-likelihood intercept diverges in that case, so a large finite
    # value is used instead: sigmoid(+-20) is within 2.1e-9 of 1 / 0.
    _SATURATED_LOGIT = 20.0

    def __init__(self,phi,features,data,horizon, num_trials,k, num_actions=3):
        self.phi = phi
        self.features = features
        self.data = data
        self.H = horizon
        self.n = num_trials
        self.k = k
        self.num_actions = num_actions
        # Feature dimension is discovered by evaluating the map on a dummy state.
        self.d = len(self.get_phi(np.array([0,0])))
        self.theta_ = np.zeros((self.H,self.num_actions,self.d))
        self.theta = np.zeros((self.H,self.num_actions,self.d))
        self.bias_ = np.zeros((self.H,self.num_actions))
        self.bias = np.zeros((self.H,self.num_actions))
        self.A = np.zeros((self.H, self.num_actions, self.n, self.d))

    @staticmethod
    def _sigmoid(z):
        """Logistic function written via tanh so large |z| cannot overflow."""
        return 0.5 * (1.0 + np.tanh(0.5 * np.asarray(z, dtype=float)))

    def _q(self, feats, theta, bias):
        """Q-values of shape (m, num_actions) for an (m, d) feature matrix."""
        return self._sigmoid(feats @ theta.T + bias)

    def _phi_matrix(self, states, count):
        """Stack ``get_phi(states[:, i])`` for ``i < count`` into a (count, d) array."""
        feats = np.empty((count, self.d))
        for i in range(count):
            feats[i] = self.get_phi(states[:, i])
        return feats

    def get_phi(self,state):
        """Return the feature vector of a single state.

        Raises
        ------
        ValueError
            If ``self.features`` is neither ``'poly'`` nor ``'fourier'``
            (previously this silently returned ``None``).
        """
        if self.features == 'poly':
            return self.phi.polynomial_basis(state)
        if self.features == 'fourier':
            # The Fourier basis is returned 2-D; keep its first column.
            return self.phi.fourier_basis(state)[:,0]
        raise ValueError(
            "unknown feature type %r (expected 'poly' or 'fourier')" % (self.features,))

    def get_A(self):
        """Fill ``A[h, a_i, i]`` with ``phi(s_i)`` for every step and sample."""
        print('Getting A')
        for h in tqdm(range(self.H - 1, -1, -1)):
            s, a = self.data[h][0], self.data[h][1]
            taken = np.asarray(a).astype(int)
            self.A[h, taken, np.arange(len(taken))] = self._phi_matrix(s, len(taken))

    def get_targets(self):
        """Build the regression targets ``tar`` from the lagged parameters.

        At the last step the target is the observed cost.  At earlier steps it
        is a Bernoulli draw with success probability
        ``clip(cost + min_a' Q_{h+1}(s', a'), 0, 1)``.  Only the entry of the
        action actually taken is written; all other entries stay zero.
        """
        self.tar = np.zeros((self.H,self.num_actions,self.n))
        for h in range(self.H - 1, -1, -1):
            a, c, s_ = self.data[h][1], self.data[h][2], self.data[h][3]
            taken = np.asarray(a).astype(int)
            m = len(taken)
            rows = np.arange(m)
            cost = np.asarray(c, dtype=float)[:m]
            if h == self.H - 1:
                self.tar[h, taken, rows] = cost
                continue
            # Evaluate all next-state Q-values of this step in one shot.
            q_next = self._q(self._phi_matrix(s_, m),
                             self.theta_[h + 1], self.bias_[h + 1])
            p = np.clip(cost + q_next.min(axis=1), 0.0, 1.0)
            self.tar[h, taken, rows] = np.random.binomial(1, p)

    def minimize_log(self):
        """Refit ``theta[h, a]`` / ``bias[h, a]`` by logistic regression.

        Each ``(h, a)`` fit uses only the samples that actually took action
        ``a`` at step ``h``; the zero-padded rows of ``A`` belonging to other
        actions are excluded because they are not observations.

        Raises
        ------
        ValueError
            If a target outside ``{0, 1}`` is encountered.
        """
        for h in tqdm(range(self.H-1,-1,-1)):
            taken = np.asarray(self.data[h][1]).astype(int)
            for a in range(self.num_actions):
                rows = np.flatnonzero(taken == a)
                if rows.size == 0:
                    # No sample took this action at this step: nothing to fit.
                    continue
                X = self.A[h, a, rows]
                y = self.tar[h, a, rows]
                labels = np.unique(y)
                if not np.isin(labels, (0.0, 1.0)).all():
                    raise ValueError(
                        "targets must be binary (0/1); got labels %r at h=%d, a=%d"
                        % (labels.tolist(), h, a))
                if labels.size == 1:
                    # LogisticRegression refuses single-class data; the best
                    # constant predictor is the (saturated) observed label.
                    self.theta[h, a] = 0.0
                    self.bias[h, a] = (self._SATURATED_LOGIT if labels[0] > 0
                                       else -self._SATURATED_LOGIT)
                    continue
                self.clf = LogisticRegression(tol = pow(10,-8), random_state = 0).fit(X, y)
                # Keep the intercept too, so predictions match the fitted model.
                self.theta[h,a] = self.clf.coef_[0]
                self.bias[h,a] = self.clf.intercept_[0]

    def update_Q(self):
        """Run one fitted-Q iteration: new targets, refit, then sync the lag."""
        self.get_targets()
        self.minimize_log()
        # Copy (not alias) so later in-place refits cannot change the lagged
        # parameters while targets are being computed from them.
        self.theta_ = self.theta.copy()
        self.bias_ = self.bias.copy()

    def run(self):
        """Build ``A``, then perform ``k`` iterations, printing the step-0
        Q-values at the probe state ``[0.5, 0]`` after each one."""
        self.get_A()
        print('Running')
        probe = np.asarray(self.get_phi(np.array([0.5,0])), dtype=float)[None, :]
        for t in tqdm(range(self.k)):
            self.update_Q()
            q = self._q(probe, self.theta[0], self.bias[0])[0]
            print(q)
        

In [60]:
# Sample-size sweep: refit the agent on the first `i` simulated trajectories.
# Previously `i` was only printed and every iteration trained on the full
# `tuples` with `num_trials`, so all runs were identical.
for i in [1000,5000,10000,20000,40000,80000,100000,1000000]:
    phi = LinearFeatureMap()
    #phi.init_poly_features(2,1)
    phi.init_fourier_features(2,4)
    phi.init_state_normalizers(np.array([0.6,0.07]),np.array([-1.2,-0.07]))
    features = 'fourier'
    print(i)
    if i > num_trials:
        # Only `num_trials` trajectories were simulated; a larger sample
        # cannot be drawn from `tuples`.
        print('skipping: only', num_trials, 'trajectories available')
        continue
    # Each tuples[h] is [s, a, cost, s_next, h]; states carry the trial index
    # on axis 1, actions and costs on axis 0.
    subset = [[st[:, :i], act[:i], cst[:i], nxt[:, :i], step]
              for st, act, cst, nxt, step in tuples]
    agent = FittedQIteration(phi,features,subset,H,i,10)
    agent.run()

1000
Getting A


  0%|          | 0/150 [00:00<?, ?it/s]

Running


  0%|          | 0/10 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [37]:
# Costs recorded at the last simulated step (index 2 of each tuple is `cost`).
costs = tuples[-1][2]

In [38]:
# Indices of the entries whose final-step cost is exactly zero
# (np.where returns a tuple of index arrays, one per axis).
x = np.where(costs==0.0)

In [39]:
# Fraction of final-step costs that are zero.  Normalise by the number of
# recorded costs rather than the global `num_trials`, which may have been
# changed since `tuples` was generated.
len(x[0])/np.size(costs)

0.000443

In [15]:
# State batch stored for step 0 (index 0 of each tuple is `s`).
# Note: this rebinds the global `s` that the simulation cell also uses.
s = tuples[0][0]

In [17]:
# Step-0 state of the first trial (column 0 of the state batch).
s[:,0]

array([-0.56825952,  0.        ])

In [28]:
# Displays the whole feature tensor filled by FittedQIteration.get_A.
# NOTE(review): this dumps every entry; `agent.A.shape` is usually enough.
agent.A

array([[[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]],


       [[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]],


       [[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0.