In [None]:
# Pipeline for playing any game that follows a fixed format
# (a fixed-size observation vector and a discrete set of actions),
# using deep reinforcement learning (Q-learning) with TensorFlow.

In [1]:
import gym
import numpy as np
import tensorflow as tf

import random

In [2]:
class Game:
    """Deep Q-learning agent for an environment with discrete actions.

    A small fully connected network (input -> hidden -> output) maps a state
    to one Q-value per action. Transitions are stored in a replay memory and
    the network is trained on random samples drawn from it.

    Parameters
    ----------
    env : environment object exposing ``action_space.sample()``.
    observation_space : int, number of observed variables per state.
    action_space : int, number of available actions.
    """
    
    def __init__(self,env,observation_space,action_space):
        
        # environment parameters
        self.env = env
        self.observation_space = observation_space
        self.action_space = action_space
        
        # q-learning parameters
        self.gamma = 0.4
        self.epsilon = 1
        # NOTE(review): epsilon is *multiplied* by this factor after every
        # replay, so 0.01 drives it to `min_epsilon` after a single replay.
        # A factor close to 1 (e.g. 0.99) gives a gradual decay - confirm
        # which schedule is intended.
        self.epsilon_decay = 0.01
        self.min_epsilon = 0.1
        self.memory = []
        
        # input is equal to observable variables from environment
        self.input_neuron = self.observation_space
        # hidden neurons are self defined
        self.hidden_neuron = 10
        # output neurons is equal to number of actions available
        self.output_neuron = self.action_space
        
        # placeholders that are fed with actual data in the session
        self.x = tf.placeholder(tf.float32,[None,self.input_neuron])
        self.y = tf.placeholder(tf.float32,[None,self.output_neuron])
        
        # getting the model instance
        self.model = self.graph()
        self.sess = tf.InteractiveSession()
        
        # defining the loss, train and predict operations.
        # Q-learning is a regression on Q-values, so the loss is the squared
        # error between predicted and target Q-values; a softmax
        # cross-entropy would require the targets to be a probability
        # distribution, which Q-values are not.
        self.loss = tf.reduce_mean(tf.square(self.y - self.model))
        self.training = tf.train.AdamOptimizer(0.1).minimize(self.loss)
        # Q-values of the first (and, as used here, only) state in the batch
        self.prediction = self.model[0]
        
        # starting the session globally
        self.sess.run(tf.global_variables_initializer())
        
    
    @staticmethod
    def one_hot_encode(y, num_classes=2):
        """One-hot encode a sequence of labels.

        Parameters
        ----------
        y : sequence whose items are indexable; ``item[0]`` is the class index.
        num_classes : int, width of the encoding (defaults to 2, the
            previously hard-coded value).

        Returns
        -------
        numpy array of shape ``(len(y), num_classes)`` with a single 1 per row.
        """
        encoded = np.zeros((len(y),num_classes))
        for row, label in enumerate(y):
            encoded[row,label[0]] = 1
        
        return encoded
        
    def graph(self):
        """Build the network and return the output tensor (one value per action)."""
        # three layer graph
        # input -> hidden -> output
        w1=tf.Variable(tf.random_normal([self.input_neuron,self.hidden_neuron]))
        l1=tf.nn.relu(tf.matmul(self.x,w1))
        w2=tf.Variable(tf.random_normal([self.hidden_neuron,self.output_neuron]))
        l2= tf.matmul(l1,w2) # last layer as linear unit
        
        return l2
        
    
    def action(self,state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        With probability ``self.epsilon`` a random action is sampled from the
        environment; otherwise the action with the highest predicted Q-value
        is returned.
        """
        # uniform sample in [0, 1): `randn` (standard normal) would make the
        # exploration rate unrelated to epsilon
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()
        q_values = self.sess.run(self.prediction,feed_dict={self.x:state})
        return int(np.argmax(q_values))
    
    def remember(self,state,action,next_state,reward,done):
        """Append one transition to the replay memory."""
        self.memory.append([state,action,next_state,reward,done])

    def get_memory_size(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)
    
    def replay(self,batchsize):
        """Train the network on a random sample of stored transitions.

        Parameters
        ----------
        batchsize : int, number of transitions to sample; clamped to the
            number of transitions available.

        After training, epsilon is decayed once, never dropping below
        ``self.min_epsilon``. Nothing happens when the memory is empty.
        """
        if not self.memory:
            return
        # random.sample raises if asked for more items than are stored
        batchsize = min(batchsize,len(self.memory))
        dataset = random.sample(self.memory,batchsize)
        # training the model
        for state,action,next_state,reward,done in dataset:
            target = reward
            if not done:
                # bootstrap with the discounted best Q-value of the next state
                next_q = self.sess.run(self.prediction,feed_dict={self.x:next_state})
                target = reward + self.gamma*np.max(next_q)
            # only the taken action's Q-value is moved towards the target;
            # the other outputs keep their current predictions
            target_f = self.sess.run(self.prediction,feed_dict={self.x:state})
            target_f[action] = target
            target_f = np.reshape(target_f,[1,-1])
            self.sess.run(self.training,feed_dict={self.x:state,self.y:target_f})
            
            
        if self.epsilon > self.min_epsilon:
            # clamp so the decay can never undershoot the exploration floor
            self.epsilon = max(self.min_epsilon,self.epsilon*self.epsilon_decay)

In [4]:
# Train the agent on CartPole: play N_EPISODES games, store every transition
# in the agent's replay memory and train on a random batch after each game.
N_EPISODES = 100  # number of games to play
MAX_STEPS = 200   # step cap per game; reaching it counts as a win
BATCH_SIZE = 30   # transitions sampled per replay

env = gym.make('CartPole-v0')
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
game = Game(env,observation_space,action_space)

total_steps = 0
for epoch in range(N_EPISODES):
    
    # the network expects a batch dimension: reshape to (1, n_observations)
    state = env.reset()
    state = np.reshape(state,[1,observation_space])
    
    steps = 0
    done = False
    while not done and steps < MAX_STEPS:
        env.render()
        action = game.action(state)
        next_state,reward,done,info = env.step(action)
        next_state = np.reshape(next_state,[1,observation_space])
        
        # saving the data in the memory
        game.remember(state,action,next_state,reward,done)
        
        state = next_state
        # count the step that was just taken, including the final one
        steps += 1
    
    # every episode contributes to the average, however it ended
    total_steps += steps
    if steps == MAX_STEPS:
        print("Game won after {} trials".format(epoch))
            
    # only train once there are more stored transitions than one batch
    if(game.get_memory_size()>BATCH_SIZE):
        game.replay(BATCH_SIZE)
print("Average no. of steps : {}".format(total_steps/N_EPISODES))
    

[2018-04-26 23:22:59,001] Making new env: CartPole-v0


KeyboardInterrupt: 

In [5]:
# Close the environment created in the training cell once training is done
# (presumably this also tears down the window opened by env.render() - confirm).
env.close()