In [9]:
from __future__ import print_function
from __future__ import division
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import pandas as pd
import seaborn as sns

style.use('ggplot')
%matplotlib inline

In [10]:
try:
    import gym
except:
    !pip install gym
    import gym
    
isFast = True

In [11]:
import numpy
import math 
import os 
import random
from datetime import datetime
random.seed(datetime.now())

import cntk as C

In [12]:
# Select the right target device when this notebook is being tested:
if 'TEST_DEVICE' in os.environ:
    if os.environ['TEST_DEVICE'] == 'cpu':
        C.device.try_set_default_device(C.device.cpu())
    else:
        C.device.try_set_default_device(C.device.gpu(0))

In [13]:
## environment and setting

STATE_COUNT = 1
ACTION_COUNT = 4

# change these config later

# class Environment:
# 

env = gym.make('CartPole-v0')

STATE_COUNT  = env.observation_space.shape[0]
ACTION_COUNT = env.action_space.n

STATE_COUNT, ACTION_COUNT

[2017-08-02 18:16:47,860] Making new env: CartPole-v0


(4, 2)

In [14]:
class CardSet:
    def __init__(self):
        self.cards = []
        self.reset()
        
    def reset(self):
        self.cards = []
        for i in range(144):
            self.cards.append( self.cardIDcompress(i) )
        self.shuffle() 
        
    def cardIDcompress(self, idx):
        if idx < 0:
            print ("id should be larger than or equal to 0")
        if idx < 136:
            return int(idx/4)
        elif idx < 144:
            return int(idx-136 + 136/4)
        else:
            print ("id should be less than 144")
        return -1 # error
    
    def draw(self, direction="front"):
        if self.canDraw() == False:
            print("can't draw anymore")
            return -1
            
        if direction == "front":    
            card = self.cards[0]
            self.cards = self.cards[1:]
        elif direction == "back":
            card = self.cards[len(self.cards)-1]
            self.cards = self.cards[:-1]
        else:
            print("direction should be (front) or (back)")
            return -1
            
        return card
    
    def canDraw(self):
        if self.cardRemain() <= 0:
            return False
        return True
    
    def cardRemain(self):
        return len(self.cards)
    
    def shuffle(self):
        cardN = self.cardRemain()
        for i in range(cardN):
            idx = random.randint(i, cardN-1)
            temp = self.cards[i]
            self.cards[i] = self.cards[idx]
            self.cards[idx] = temp

# cardSet = CardSet()
# print(cardSet.cards)

In [15]:
# M1~M9, C1~C9, L1~L9, EW,WW,SW,NW, R,G,W, F1~8
# 9,    18,    27    24, 

def cardID2Str(idx):
    if idx < 0:
        print ("id should be larger than or equal to 0")
    if idx < 27:
        card_num = int(idx%9) + 1
        card_type = int(idx/9)
        card_type_list = ["M", "C", "L"]
        return card_type_list[card_type] + str(card_num)
    elif idx < 34:
        card_type = int(idx-27)
        card_type_list = ["EW","WW","SW","NW", "RC","GF","WB"]
        return card_type_list[card_type]
    elif idx < 42:
        card_num = int((idx-34))+1
        return "F"+ str(card_num)
        pass
    else:
        print ("id should be less than 144")
        
    return -1

def cardSet2Str(cards):
    cards_str = []
    for i in range(len(cards)):
        cards_str.append(cardID2Str(cards[i]))
    return cards_str

cards = range(42)
print(cardSet2Str(cards))

['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'EW', 'WW', 'SW', 'NW', 'RC', 'GF', 'WB', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8']


In [16]:
def isFlower(card):
    return cardID2Str(card)[0] == 'F'

def showFlowers(cards):
    flowers = []
    flower_index_list = []
    for index, card in enumerate(cards):
        if isFlower(card):
            flowers.append(card)
            flower_index_list.append(index)
    flower_index_list.reverse()
    for index in flower_index_list:
        cards.pop(index)
    return flowers

c = [1,3,3,39,41]
f = [40]
f.extend(showFlowers(c))
if not ( f == [39,40,41] and c == [1,3,3]):
    print("err: c=",c, ", f=", f)

err: c= [1, 3, 3] , f= [40, 39, 41]


In [17]:

H = 64 # hidden layer size

class Brain:
    def __init__(self):
        self.params = {}
        self.model, self.trainer, self.loss = self._create()
        
    def _create(self):
        observation = C.sequence.input_variable(STATE_COUNT, np.float32, name="s")
        q_target = C.sequence.input_variable(ACTION_COUNT, np.float32, name="q")
        
        # Following a style similar to Keras
        l1 = C.layers.Dense(H, activation=C.relu)
        l2 = C.layers.Dense(ACTION_COUNT)
        unbound_model = C.layers.Sequential([l1,l2])
        model = unbound_model(observation)
        
        self.params = dict(W1=l1.W, b1=l1.b, W2=l2.W, b2=l2.b)
        
        # loss = 'mse'
        loss = C.reduce_mean(C.square(model - q_target), axis=0)
        meas = C.reduce_mean(C.square(model - q_target), axis=0)
        
        # optimizer
        lr = 0.00025
        lr_schedule = C.learning_rate_schedule(lr, C.UnitType.minibatch)
        learner = C.sgd(model.parameters, lr_schedule, gradient_clipping_threshold_per_sample=10)
        trainer = C.Trainer(model,(loss,meas),learner)
        
        # CNTK: return trainer and loss as well
        return model, trainer, loss

    def train(self, x, y, epoch=1, verbose=0):
        arguments = dict(zip(self.loss.arguments, [x,y]))
        updated, results = self.trainer.train_minibatch(arguments, outputs = [self.loss.output])
        
    def predict(self, s):
        return self.model.eval([s])


In [18]:
class Memory:  # stored as (s, a, r, s_ )
    samples = []
    
    def __init__(self, capacity):
        self.capacity = capacity
        
    def add(self, sample):
        self.samples.append(sample)
        
        if len(self.samples) > self.capacity:
            self.samples.pop(0)
            
    def sample(self, n):
        n = min(n, len(self.samples))
        return random.sample(self.samples, n)

In [19]:
MEMORY_CAPACITY = 100000
BATCH_SIZE = 64

GAMMA = 0.99 # discount factor

MAX_EPSILON = 1
MIN_EPSILON = 0.01 # stay a bit curious even when getting old
LAMBDA = 0.0001    # speed of decay 

class Agent:
    steps = 0
    epsilon = MAX_EPSILON
    
    def __init__(self):
        self.brain = Brain()
        self.memory = Memory(MEMORY_CAPACITY)
        
    def act(self, s, legal_action=[1]*ACTION_COUNT):
        if random.random() < self.epsilon:
            legal_index_list = []
            for index, value in enumerate(legal_action):
                if value == 1:
                    legal_index_list.append(index)
            return random.choice(legal_index_list)
        else:
            # return numpy.argmax(self.brain.predict(s))
            return numpy.argmax( np.array(self.brain.predict(s)) * np.array(legal_action) )
        
    def observe(self, sample): # in (s, a, r, s_) format
        self.memory.add(sample)
        
        # slowly decrease Epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)
        
    def replay(self):
        batch = self.memory.sample(BATCH_SIZE)
        batchLen = len(batch)
        
        no_state = numpy.zeros(STATE_COUNT)
        
        # CNTK: explicity setting to float32
        states = numpy.array([ o[0] for o in batch ], dtype = np.float32)
        states_ = numpy.array([(no_state if o[3] is None else o[3]) for o in batch ] , dtype = np.float32)
        
        p = agent.brain.predict(states)
        p_ = agent.brain.predict(states_)
        
        # CNTK: explicitly setting to float32
        x = numpy.zeros((batchLen, STATE_COUNT)).astype(np.float32)
        y = numpy.zeros((batchLen, ACTION_COUNT)).astype(np.float32)
        
        for i in range(batchLen):
            s, a, r, s_ = batch[i]
            
            # CNTK:[0] beacuse of sequence dimension
            t = p[0][i]
            if s_ is None:
                t[a] = r
            else:
                t[a] = r + GAMMA * numpy.amax(p_[0][i])
                
            x[i] = s
            y[i] = t
        
        self.brain.train(x,y)

In [20]:
def actionID2Str(action):
    actionStr = ""
    # action = win + play(34) + eat(3) + pon + gan(34)
    # action : win:0, play:1~34, eat:35~37, pon:38, gan:39~72, pass:73
        # eat0 = eat downward, ex: eat 5 with 67
        # eat1 = eat middle,   ex: eat 5 with 46
        # eat2 = eat upward    ex: eat 5 with 34
    if action < 0:
        # error
        actionStr = "Error, action should >= 0"
    elif action == 0:
        # win check
        actionStr = "win"
    elif action <= 34:
        # play card with responding index
        cardID = action-1
        actionStr = "play: " + cardID2Str(cardID)

    elif action <= 37:
        new_action = action - 34
        # 3 type of eat
        actionStr = "eat (type "+str(new_action)+")"
    elif action == 38:
        # pon
        actionStr = "pon"
    elif action <= 72:
        # gan with responding index
        cardID = action - 38
        actionStr = "gan: " + cardID2Str(cardID)
    elif action == 73:
        # pass the card (if you're curplaying player, you can/cannot pass, bigyo is still comtemplate)
        actionStr = "pass"
    return actionStr

In [52]:
def makeLegalList(card, cards, frm=0):
    legal_win = [0]
    legal_play = [0]*34
    legal_eat = [0]*3
    legal_pon = [0]
    legal_gan = [0]*34
    legal_pass = [1]
    if checkWin(card, cards):
        legal_win = [1]
    if frm == 0: 
        for val in cards:
            legal_play[val] = 1
        legal_play[card] = 1
        legal_gan = checkLegalGan(card, cards)
        legal_pass = [0]
    if frm == 1:
        # check eat
        # check pon
        legal_pon = [1] if checkPon(card, cards) else [0]
        pass
    if frm == 2:
        # check pon
        legal_pon = [1] if checkPon(card, cards) else [0]
        # check gan
        legal_gan[card] = 1 if checkGanFromOther(card, cards) else 0
        pass

    # action = win + play(34) + eat(3) + pon + gan(34) + pass
    legal_action_list = legal_win + legal_play + legal_eat + legal_pon + legal_gan + legal_pass
    return legal_action_list

def checkPon(card, cards):
    count = 0
    for val in cards:
        if val == card:
            count += 1
    return (count>=2)

def checkGanFromOther(card, cards):
    # cards >= 3
    count = 0
    for val in cards:
        if val == card:
            count += 1
    return (count>=3)

def checkLegalGan(card, cards):
    legal_gan = [0]*34
    counts = [0]*34
    counts[card] += 1
    for val in cards:
        counts[val] += 1
    for i in range(len(counts)):
        if counts[i] == 4:
            legal_gan[i] = 1
        elif counts[i] > 4:
            "checkLegalGan error: how is this possible?"
    return legal_gan

if not checkLegalGan(1, [1,1,1,2,2,2,2]) == [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]:
    print("function checkLegalGan error")
    
if not checkGanFromOther(1, [1,1,1,2,2,2,2]):
    print("function checkGanFromOther error")

In [22]:
def checkWin(card, cards):
    # lots of things to do
    return False

In [92]:
# M1~M9, C1~C9, L1~L9, EW,WW,SW,NW, R,G,W, F1~8
# 36,    72,    108    136, 
# action = win + play(34) + eat(3) + pon + gan(34) + pass
    # eat0 = eat downward, ex: eat 5 with 67
    # eat1 = eat middle,   ex: eat 5 with 46
    # eat2 = eat upward    ex: eat 5 with 34

# Feature_From = [0,1,2], 0=self, 1=prevPlayer, 2=from other players
TABLE_STATE_SIZE = 34      # 34 types of card and their quentities (no flowers)
# TABLE_STATE_SIZE = 64    # beacause there is at most 64 card on the table
HAND_STATE_SIZE = 34
OPEN_STATE_SIZE = 42       # 42 = all types including flowers

# state = from + in_card + hand + open + (table)
STATE_COUNT = 1 + 1 + HAND_STATE_SIZE + OPEN_STATE_SIZE + TABLE_STATE_SIZE
ACTION_COUNT = 1 + 34 + 3 + 1 + 34 + 1

class Environment:
    def __init__(self):
        self.cardSet = CardSet()
        self.players = []
        self.cards = []
        self.cards_open = []
        self.table = []
        self.s = [[],[],[],[]]
        self.a = [0,0,0,0]
        self.r = [0,0,0,0]
        self.s_ = [[],[],[],[]]
        self.cur_player_idx = 0
        
        self.reset()
    
    def reset(self):
        # self.table 
        self.table = []
        self.s = [[],[],[],[]]
        self.a = [0,0,0,0]
        self.r = [0,0,0,0]
        self.s_ = [[],[],[],[]]
        
        # reset 4 player's hand card
        self.players = []
        self.cards = []
        self.cards_open = []
        for i in range(4):
            self.players.append(Agent())
            self.cards.append([])
            self.cards_open.append([])
            
        # reset card deck
        self.cardSet.reset()
        
        # deal cards
        for i in range(16):
            for j in range(4):
                self.cards[j].append(self.cardSet.draw())
                
        # showflower
        self.showFlowersAll()
        
        # sorted
        self.sortCardsAll()
            
        # reset pool
        self.pool = []
    
    def receiveFlower(self, idx, card):
        self.cards_open[idx].append(card)
        self.sortCards(idx)
        
    def receiveCard(self, idx, card):
        self.cards[idx].append(card)
        self.sortCards(idx)
        
    def sortCards(self, idx):
        self.cards[idx].sort()
        self.cards_open[idx].sort()
        
    def sortCardsAll(self):
        for i in range(4):
            self.sortCards(i)
            
    def step(self, idx, card, action): 
        isWin = False
        isEven = False
        playCard = -1
        # action = win + play(34) + eat(3) + pon + gan(34)
            # eat0 = eat downward, ex: eat 5 with 67
            # eat1 = eat middle,   ex: eat 5 with 46
            # eat2 = eat upward    ex: eat 5 with 34
        if action < 0:
            # error
            pass
        elif action == 0:
            # win check
            pass
        elif action <= 34:
            # play card with responding index and remove the card from hand
            playCard = action-1
            self.cards[idx].append(card)
            self.cards[idx].remove(playCard)
            self.table.append(playCard)
        elif action <= 37:
            new_action = action - 34
            # 3 type of eat
        elif action == 38:
            # pon
            ponCard = card
            for i in range(2):
                self.cards[idx].remove(ponCard)
            for i in range(3):
                self.cards_open[idx].append(ponCard)
            pass
        elif action <= 72:
            new_action = action - 38
            # gan with responding index
        elif action == 73:
            # pass the card (if you're curplaying player, you can/cannot pass, bigyo is still comtemplate)
            pass
        
        return isWin, isEven, playCard
    
    def nextPlayer(self):
        self.cur_player_idx += 1
        self.cur_player_idx %= 4
        
    def showFlowers(self, idx):
        self.cards_open[idx].extend(showFlowers(self.cards[idx]))
        
    def showFlowersAll(self):
        for i in range(4):
            self.showFlowers(i)
    
    def makeState(self, idx, card, featureFrom=0):
        # state = from + in_card + hand + open + (table)
        hand = self.encodeCards(self.cards[idx])
        cards_open = self.encodeCards(self.cards_open[idx], 42)  # cards which are opened
        table = self.encodeCards(self.table)
        
        state = [featureFrom] + [card] + hand + cards_open + table 
        if not len(state) == STATE_COUNT:
            print("ERROR: state len wrong !! len(state) = %d, STATE_COUNT = %d"%(len(state), STATE_COUNT))
        
        return state
    
    def encodeCards(self, cards, num=34):
        enc = [0] * num
        for card in cards:
            enc[card] += 1
        return enc

In [93]:
def render(env):
    env.sortCardsAll()
    print("Deck: \n",cardSet2Str(env.cardSet.cards))
    print("Table: \n", cardSet2Str(env.table))
    for i in range(4):
        print("P%d: \n\t"%(i), cardSet2Str(env.cards[i]), "\n\t", cardSet2Str(env.cards_open[i]))

In [94]:
# action priority
def actionPriority(action):
    # action : win:0, play:1~34, eat:35~37, pon:38, gan:39~72, pass:73
    if action < 0:
        print("actionPriority Error: action should >= 0")
        return
    if action == 0:
        return 3 #win
    if action in range(1,35): 
        return 0
    if action in range(35,38):
        return 1 #eat
    if action == 38:
        return 2 #pon
    if action in range(39,73):
        return 2 #gan
    if action == 73:
        return 0
    

In [95]:
def run(env):
    
    turn = 0 # for save, if turn is too large, break.
    save_break_turn = 20
    while True:
        turn += 1  # for save, if turn is too large, break.
        if turn >= save_break_turn:
            break
        # 1.start
        render(env)

        # while loop unitl sb win or game draw
        idx = env.cur_player_idx
        # 1.1. Draw and act by idx
        drawCard = env.cardSet.draw()
        print("P%d => draw: %s."%(idx, cardID2Str(drawCard)))

        while isFlower(drawCard):
            env.receiveFlower(idx, drawCard)
            drawCard = env.cardSet.draw()
            print("P%d => draw: %s."%(idx, cardID2Str(drawCard)))

        # self, draw card and react
        from_idx = 0 
        # make state to predict action
        env.s_[idx] = env.makeState(idx, drawCard)
        # i want to add observer for the last action because i cannot get the next_state right after action
        if not env.s[idx] == []:
            print("TODO env.player.observe")
            # env.player.obs(s,a,r,s_)
        # cur index player act respond with this s
        env.a[idx] = env.players[idx].act(env.s_[idx] , makeLegalList(drawCard, env.cards[idx],from_idx))
        actionStr = actionID2Str(env.a[idx])
        # 1.2. react with the drawing card
        isWin, isEven, playCard = env.step(idx, drawCard, env.a[idx])
        print("P%d => %s."%(idx, actionStr))

        while True:
            turn += 1  # for save, if turn is too large, break.
            if turn >= 5:
                break
            # 1.2.1 check is the game end, if not, continue
            # 1.3. idx play one card, other player react
            # 1.3.0: init priority and index
            react_priority = 0
            react_player = -1  # -1 means nobody react
            react_action = -1
            for i_circular in range(1,4):
                i = (idx+i_circular) % 4
                if i_circular == 1:
                    from_idx = 1 # pre player
                elif i_circular in [2,3]:
                    from_idx = 2 # other players

                # 1.3.1. ask three other players if they want to do something
                # make state to predict action
                env.s_[i] = env.makeState(i, playCard, from_idx)
                # i want to add observer for the last action because i cannot get the next_state right after action
                if not env.s[i] == []:
                    print("TODO env.player.observe")
                    # env.player.obs(s,a,r,s_)
                # ask player what they would do with this state
                act = env.players[i].act(env.s_[i] , makeLegalList(playCard, env.cards[i],from_idx))
                actionStr = actionID2Str(act)
                print("\tP%d want to: %s."%(i, actionStr))
                # updata priority and check if the player got the priority
                prior = actionPriority(act)
                if prior > react_priority:
                    react_priority = prior
                    react_player = i
                    react_action = act

            # 1.3.2. arrange priority and select the player to do his/her action 
            if react_priority == 0:
                break
            else:
                # todo: reset curplayer
                env.cur_player_idx = react_player
                idx = env.cur_player_idx
                print("Priority != 0")
                env.a[idx] = react_action # save the reacting action to the player
                actionStr = actionID2Str(env.a[idx])
                # 1.2. react with the drawing card
                isWin, isEven, playCard = env.step(idx, playCard, env.a[idx])
                print("P%d => %s."%(idx, actionStr))
                # play one card
                act = env.players[idx]
                env.s_[idx] = env.makeState(idx, -1)
                if not env.s[idx] == []:
                    print("TODO env.player.observe")
                    # env.player.obs(s,a,r,s_)
                # cur index player act respond with this s
                env.a[idx] = env.players[idx].act(env.s_[idx] , makeLegalList(drawCard, env.cards[idx], 0))
                actionStr = actionID2Str(env.a[idx])
                # 1.2. react with the drawing card
                isWin, isEven, playCard = env.step(idx, drawCard, env.a[idx])
                print("P%d => %s."%(idx, actionStr))



        # 1.4. switch to next player
        env.nextPlayer()
        print("nextPlayer\n")
        # return observation, reward, done, info

In [96]:
env = Environment()
run(env)

Deck: 
 ['C6', 'C5', 'M8', 'L2', 'M1', 'C2', 'F2', 'SW', 'L9', 'C3', 'M2', 'GF', 'L1', 'RC', 'M9', 'L3', 'M9', 'WB', 'C7', 'NW', 'L1', 'L3', 'RC', 'M6', 'L5', 'SW', 'M3', 'M7', 'M9', 'L1', 'M2', 'WW', 'M5', 'WW', 'M3', 'L7', 'WB', 'C1', 'C5', 'GF', 'C3', 'GF', 'C4', 'L4', 'C9', 'L7', 'M6', 'C9', 'C3', 'L2', 'WB', 'M8', 'RC', 'M1', 'M2', 'C9', 'C5', 'C1', 'C7', 'M1', 'L6', 'C3', 'C8', 'C6', 'EW', 'L6', 'L1', 'L3', 'C7', 'C6', 'M4', 'M5', 'L9', 'F7', 'EW', 'M7', 'F8', 'F5', 'M5', 'M4']
Table: 
 []
P0: 
	 ['M1', 'M2', 'M3', 'M7', 'C4', 'C4', 'C7', 'C9', 'L4', 'L4', 'L5', 'L5', 'L9', 'EW', 'SW'] 
	 ['F3']
P1: 
	 ['M3', 'M4', 'M6', 'M8', 'C1', 'C2', 'C2', 'C8', 'C8', 'L2', 'L4', 'SW', 'NW', 'NW', 'RC', 'WB'] 
	 []
P2: 
	 ['M4', 'M5', 'M6', 'C2', 'C4', 'C5', 'L5', 'L6', 'L7', 'L8', 'L9', 'WW', 'NW', 'GF'] 
	 ['F1', 'F4']
P3: 
	 ['M7', 'M8', 'M9', 'C1', 'C6', 'C8', 'L2', 'L3', 'L6', 'L7', 'L8', 'L8', 'L8', 'EW', 'WW'] 
	 ['F6']
P0 => draw: C6.
P0 => play: M2.
	P1 want to: pass.
	P2 want to: p

In [None]:
for i in range(20):
    env = Environment()
    run(env)

In [None]:
for i in range(200):
    env = Environment()
    run(env)

In [20]:
# todo : make other play take acts
# todo : check eat,
# todo : arrange priority of the other 3 players action and select the top one to do the action

# todo : normalize state parameter