# 基本设置

## 变量编码

### state编码：

一维数组，大小是30，编号0-29

- 编号0-14。表示当前玩家的手牌情况，0-12代表1-13的个数，13、14分别代表小王、大王的个数。
- 编号15-29。表示当前所有玩家已出的手牌情况。具体编码同上。

例子：

玩家A的手牌是 [1 1 2 3]，玩家A,B,C已经出过的牌有[2,2,2,3,3,4]。这时候轮到A出牌了，A的state编码为：

[2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + [0, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### action编码：

一维数组，大小是403，编号0-402如下：

- 0-14: 单出， 1-13，小王，大王
- 15-27: 对，1-13
- 28-40: 三，1-13
- 41-196: 三带1，先遍历111.2，111.3，一直到131313.12
- 197-352: 三带2，先遍历111.22,111.33,一直到131313.1212
- 353-366: 炸弹，1111-13131313，加上王炸
- 367-402: 先考虑5个的顺子，按照顺子开头从小到达进行编码，共计8+7+..+1=36
- 403-415: 三带大王
- 416-428: 三带小王
    
### reward编码：

- 出完牌之后的state输的话，state为-1
- 出完牌之后的state赢的话，state为1
- 出完牌之后的state未结束的话，state为0

# 构建字典

In [1]:
h = {}

# 三带大王
for i in range(403,416):
    h[i] = str([i-402]*3 + [15])
    
# 三带小王
for i in range(416,429):
    h[i] = str([i-415]*3 + [14])

# 单
for i in range(15):
    h[i] = str([i+1])
assert i==14

# 对
for i in range(15,28):
    h[i] = str([i-14]*2)
assert i==27

# 三
for i in range(28,41):
    h[i] = str([i-27]*3)
assert i==40

# 三带1
l = []
for m in range(13):
    for n in range(13):
        if m!=n:
            curr = sorted([m+1]*3+[n+1])
            l.append(str(curr))
assert len(l)==13*12
for i in range(41,197):
    h[i] = l[i-41]
assert i==196

# 三带2
l = []
for m in range(13):
    for n in range(13):
        if m!=n:
            curr = sorted([m+1]*3+[n+1]*2)
            l.append(str(curr))
assert len(l)==13*12
for i in range(197,353):
    h[i] = l[i-197]
assert i==352

# 炸弹
353-366
for i in range(353,367):
    idx = i-352
    if idx<=13:
        h[i] = str([idx]*4)
    else:
        h[i] = str([14,15])
assert i==366

# 顺子
l = []
for size in range(8,0,-1):
    length = 13 - size
    curr = []
    for i in map(lambda x: range(x,x+length), range(3,3+size)):
        if i[-1]!=14:
            curr.append(str(i))
        else:
            curr.append(str([1]+i[:-1]))
    l = l + curr
for i in range(367,403):
    h[i] = l[i-367]
assert i==402

In [2]:
# 验证
assert len(h)==429

In [3]:
# 需要的~
h_xugang = {h[i]:i for i in h}

## 需要的API

In [4]:
from ddz_rl.agent import Agent

# lib

In [5]:
import tensorflow as tf
import numpy as np

# Q network

## 输入

- state_size大小: 30    13(1-4)+2(01值)
- action大小：403（01所有可能情况:单(15)、对(13)、三(13)、三带1(13*12)、三带(13*12)、炸弹(13)、顺子5+(sum(range(2,11)))）
- reward：当前的reward，最后的reward


1. inputs_: [None, state_size]
2. actions_: [None]
3. targetQs_: [None]

In [6]:
class QNetwork:
    def __init__(self, learning_rate=0.01, state_size=30, 
                 action_size=429, hidden_size=20, 
                 name='QNetwork'):
        # state inputs to the Q-network
        with tf.variable_scope(name):
            # state
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            # action
            self.actions_ = tf.placeholder(tf.int32, [None], name='actions')
            one_hot_actions = tf.one_hot(self.actions_, action_size)
            
            # Target Q values for training
            self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')
            
            # ReLU hidden layers
            self.fc1 = tf.contrib.layers.fully_connected(self.inputs_, hidden_size, 
                activation_fn = tf.nn.relu, ) # [batch, hidden_size]
            self.fc2 = tf.contrib.layers.fully_connected(self.fc1, hidden_size) # [batch, hidden_size]

            # Linear output layer
            self.output = tf.contrib.layers.fully_connected(self.fc2, action_size, 
                                                            activation_fn=None)
            
            ### Train with loss (targetQ - Q)^2
            # Q value from Q-network
            self.Q = tf.reduce_sum(tf.multiply(self.output, one_hot_actions), axis=1)
            self.loss = tf.reduce_mean(tf.square(self.targetQs_ - self.Q))
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

# Experience Replay

In [7]:
from collections import deque
class Memory():
    def __init__(self, max_size = 1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]

# Hyperparameters

In [8]:
train_episodes = 1000          # max number of episodes to learn from
max_steps = 200                # max steps in an episode
gamma = 0.99                   # future reward discount

# Exploration parameters
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Network parameters
hidden_size = 64               # number of units in each Q-network hidden layer
learning_rate = 0.0001         # Q-network learning rate

# Memory parameters
memory_size = 10000            # memory capacity
batch_size = 100                # experience mini-batch size
pretrain_length = batch_size   # number experiences to pretrain the memory

In [9]:
tf.reset_default_graph()
mainQN = QNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate)

# Populate the experience memory

看这里需要什么

1. 可选的action空间
2. 选择action后获得state、reward、done

In [10]:
import random
def random_action(agent):
    action_space = agent.get_actions_space()
    
    if action_space:
        yaobuqi = False
        action_id = random.randint(0,len(action_space)-1)
        action = action_space[action_id]
#         try:
        state, reward, done = agent.step(action_id=action_id)
#         except:
#             print action_space,action_id
#             raise Exception("Wrong")
    else:
        yaobuqi = True
        state, reward, done = agent.step()
    return state, reward, done, action, yaobuqi

# 游戏开始，洗牌，发牌
agent = Agent()
state = agent.reset()

# 初始化memory
memory = Memory(max_size=memory_size)
for ii in range(pretrain_length):

    # 随机决策
    next_state, reward, done, action, yaobuqi = random_action(agent)

        
    
    # 要不起的情况下不添加训练数据集
    if yaobuqi:
        state = next_state
        continue
        
    if done:
        next_state = np.zeros(state.shape)
        memory.add((state, action, reward, next_state))
        
        # 重新洗牌，发牌
        state = agent.reset()
    else:
        # 添加memory
        memory.add((state, action, reward, next_state))
        state = next_state

# traing

In [None]:
agent = Agent()
s = agent.reset()
done = False
while(not done):
    #print agent.game.get_record().cards_left1
    actions = agent.get_actions_space() #如果actions为[]，step()
    #GY的RL程序
    s_, r, done = agent.step(action_id=0)
    #print agent.game.get_record().cards_left1
    #print agent.game.get_record().cards_left2
    #print agent.game.get_record().cards_left3
    #print agent.game.get_record().records
    #print "===================="        
    #raw_input("")
    s = s_


In [11]:
def random_action(agent):
    action_space = agent.get_actions_space()
    
    if action_space:
        yaobuqi = False
        action_id = random.randint(0,len(action_space)-1)
        action = action_space[action_id]
#         try:
        state, reward, done = agent.step(action_id=action_id)
#         except:
#             print action_space,action_id
#             raise Exception("Wrong")
    else:
        yaobuqi = True
        state, reward, done = agent.step()
    return state, reward, done, action, yaobuqi

In [12]:
done = False

In [13]:
# Now train with experiences
saver = tf.train.Saver()
rewards_list = []

agent.reset()

with tf.Session() as sess:
    # Initialize variables
    sess.run(tf.global_variables_initializer())
    
    step = 0
    for ep in range(1, train_episodes):
        total_reward = 0
        t = 0
        while not done: # ！ 这里改成当游戏还未结束
            
            # 指数衰减的随机探索
            step += 1
            explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step) 
            if explore_p > np.random.rand():
                # 随机探索
                state, reward, done, action, yaobuqi = random_action(agent)
            else:
                # 根据Q-network行动
                feed = {mainQN.inputs_: state.reshape((1, np.product(state.shape)))}
                Qs = sess.run(mainQN.output, feed_dict=feed)
                print Qs
                raise Exception
                action = np.argmax(Qs)
                next_state, reward, done, _ = (action)
    
            total_reward += reward
            
            if done:
                # the episode ends so no next state
                next_state = np.zeros(state.shape)
                t = max_steps
                
                print('Episode: {}'.format(ep),
                      'Total reward: {}'.format(total_reward),
                      'Training loss: {:.4f}'.format(loss),
                      'Explore P: {:.4f}'.format(explore_p))
                rewards_list.append((ep, total_reward))
                
                # Add experience to memory
                memory.add((state, action, reward, next_state))
                
                # Start new episode
                env.reset()
                # Take one random step to get the pole and cart moving
                state, reward, done, _ = env.step(env.action_space.sample())

            else:
                # Add experience to memory
                memory.add((state, action, reward, next_state))
                state = next_state
                t += 1
            
            # Sample mini-batch from memory
            batch = memory.sample(batch_size)
            states = np.array([each[0] for each in batch])
            actions = np.array([each[1] for each in batch])
            rewards = np.array([each[2] for each in batch])
            next_states = np.array([each[3] for each in batch])
            
            # Train network
            target_Qs = sess.run(mainQN.output, feed_dict={mainQN.inputs_: next_states})
            
            # Set target_Qs to 0 for states where episode ends
            episode_ends = (next_states == np.zeros(states[0].shape)).all(axis=1)
            target_Qs[episode_ends] = (0, 0)
            
            targets = rewards + gamma * np.max(target_Qs, axis=1)

            loss, _ = sess.run([mainQN.loss, mainQN.opt],
                                feed_dict={mainQN.inputs_: states,
                                           mainQN.targetQs_: targets,
                                           mainQN.actions_: actions})
        
    saver.save(sess, "checkpoints/cartpole.ckpt")

ValueError: shape mismatch: value array of shape (2,) could not be broadcast to indexing result of shape (6,429)