In [1]:
from environments import MountainCar
from features import LinearFeatureMap
from classifiers import Sigmoid_Regression
import numpy as np
from tqdm.notebook import tqdm
from sklearn.linear_model import LogisticRegression
import itertools as iters
from joblib import Parallel, delayed

In [2]:
#tuples = np.load('mc_data.npy',allow_pickle = True)

In [3]:
H = 150
num_trials = 100000
env = MountainCar(H)

s = np.zeros((2,num_trials))
s[0,:] = np.random.uniform(low = -1.2,high = 0.1,size = num_trials)
env.reset()
tuples = []

for h in tqdm(range(H)):
    a = np.random.choice([-1,0,1],size=num_trials)
    cost, s_ = env.step_broadcast(s, a, num_trials)
    tuples.append([s.T,a+1,cost,np.array(s_).T,h])
    s = s_

  0%|          | 0/150 [00:00<?, ?it/s]

In [11]:
class FittedQIteration(object):
    def __init__(self,phi,features,data,horizon,num_trials,k):
        self.phi = phi
        self.features = features
        self.data = data
        self.H = horizon
        self.n = num_trials
        self.k = k
        self.d = len(self.phi.order_list)
        self.theta_ = np.zeros((self.H, 3*self.d)) 
        self.theta = np.zeros((self.H, 3*self.d)) 
        #self.A = np.zeros((self.H, 3, self.n, self.d))
        self.A = {}
        self.get_A()
        #self.get_targets()
        
    
    def get_phi(self,state):
        if self.features == 'poly':
            return self.phi.polynomial_basis(state)
        elif self.features == 'fourier':
            phi = self.phi.fourier_basis(state)
            return phi

    def sigmoid(self,x):
        x[x < -36] = -36
        x[x > 36] = 36
        return 1 / (1 + np.exp(-x))


    def get_psi(self,phi,a):
        v = len(phi)
        psi = np.zeros((v,3*self.d))
        lower = 3 * a
        psi[lower : lower + self.d,:] = phi
        return psi
        
        
    
    def get_A(self):
        print('Getting A')
        data = self.data.copy()
        for h in tqdm(range(self.H)):
            s,a = data[h][0], data[h][1]
            a0 = np.where(a==0)
            a1 = np.where(a==1)
            a2 = np.where(a==2)
            self.x = self.get_phi(s)
            self.psi = self.get_psi(self.x,a1)
            self.A[h,0] = self.x[a0]
            self.A[h,1] = self.x[a1]
            self.A[h,2] = self.x[a2]
    
    
    def get_targets(self):
        data = self.data.copy()
        self.tar = {}
        for h in (range(self.H - 1, -1, -1)):
            a, c, s_ = data[h][1], data[h][2], data[h][3]
            a0 = np.where(a==0)
            a1 = np.where(a==1)
            a2 = np.where(a==2)
            if h != self.H - 1:
                phi_ = self.get_phi(s_)
                q = np.zeros((len(a),3))
                inner0 = self.sigmoid(np.dot(phi_,self.theta[h+1,0]))
                inner1 = self.sigmoid(np.dot(phi_,self.theta[h+1,1]))
                inner2 = self.sigmoid(np.dot(phi_,self.theta[h+1,2]))
                v = np.minimum(inner0,inner1,inner2)
                self.tar[h,0] = c[a0] + v[a0]
                self.tar[h,1] = c[a1] + v[a1]
                self.tar[h,2] = c[a2] + v[a2]
            else:
                self.tar[h,0] = c[a0] 
                self.tar[h,1] = c[a1] 
                self.tar[h,2] = c[a2] 
            '''
            for i in range(len(a)):
                if h != self.H - 1:
                    phi_ = self.get_phi(s_[:,i])
                    q = np.zeros(3)
                    for a_ in range(3):
                        inner = np.inner(phi_, self.theta_[h+1,a_])
                        q[a_] = 1 / (1 + np.exp(-inner))
                    v = np.min(q)
                    self.tar[h,int(a[i]),i] = np.random.binomial(1,p=max(0,min((c[i] + v),1))) 
                else:
                    self.tar[h,int(a[i]),i] = c[i]
        '''
    def solve_LS(self):
        #Solve nonlinear least squares using scipy black-box (since sq loss is non-convex wrt to theta)
        #A = np.zeros((self.d,self.d))
        #b = np.zeros(self.d)
        #for i in range(self.n):
        #    x, y = self.features[i], self.obs[i]
        #    A = A + np.outer(x, x)
        #    b = b + y * x
        #self.theta_ols = np.linalg.solve(A,b)
        self.sol = least_squares(self.func, self.theta_ls, args=(self.feature,self.obs))


    def minimize_sq(self):
        for h in tqdm(range(self.H)):
            for a in range(3):
                self.feature = self.A[h,a]
                self.obs = self.tar[h,a]
                self.solve_LS()
                self.theta[h,a] = self.sol.x
                #self.theta[h,a] = self.reg.theta_ls

    
    def minimize_log(self):
        for h in tqdm(range(self.H-1,-1,-1)):
            for a in range(3):
                self.clf = LogisticRegression(tol = pow(10,-8), random_state = 0).fit(self.A[h,a], self.tar[h,a])
                self.theta[h,a] = np.array(self.clf.coef_)

    def minimize_sq(self):
        for h in tqdm(range(self.H)):
            for a in range(3):
                feature = self.A[h,a]
                obs = self.tar[h,a]
                self.reg = Sigmoid_Regression(feature, obs, self.d, np.zeros(self.d), lr_tol = pow(10,-8))
                self.reg.solve_LS()
                #print(self.reg.theta_ls)
                self.theta[h,a] = self.reg.theta_ls
                
            
    
    def update_Q(self):
        self.get_targets()
        self.minimize_sq()
        self.theta_ = self.theta
        
    
    
    def run(self):
        #self.get_A()
        print('Running')
        for t in tqdm(range(self.k)):
            self.update_Q()
        

In [12]:
phi = LinearFeatureMap()
#phi.init_poly_features(2,1)
phi.init_fourier_features(2,2)
phi.init_state_normalizers(np.array([0.6,0.07]),np.array([-1.2,-0.07]))
features = 'fourier'
agent = FittedQIteration(phi,features,tuples,H,num_trials,160)

Getting A


  0%|          | 0/150 [00:00<?, ?it/s]

TypeError: can only concatenate tuple (not "int") to tuple

In [None]:
agent.run()

Running


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]