# 오델로 강화학습

**주요 기능**
- Gym 인터페이스를 통한 오델로 환경 상호작용 
- human vs cpu , human vs human, cpu vs cpu 시뮬레이션 기능
  
**테스트 코드 요약**

1. Dqn + Greedy Policy learning
2. DQN + UCT Policy learning
3. DQN + UCT Policy + DQN + UCT Policy 경쟁 학습

# Gym 초기화

In [1]:
import gym_games
import gymnasium as gym
from DeepQResNet import DQN
import numpy as np
import math
import random

env = gym.make('Othello-v0',render_mode='human')
state_shape = (8, 8, 1)


## Dqn + Greedy Policy learning

In [None]:
env.metadata['autoplay'] = True
env.metadata['render_fps'] = 1000000


obs, reward, bdone, _, info = env.reset()
dqn = DQN(state_shape,env.action_space.n)
version = -1
#dqn.load( f"model/Greedy/BlackModel_{version}.weights.h5")
count = version +1
def GetPolicy(bdone,wdone,turn,obs,actions):
    if(not actions):
        return None
    if(turn==1):
        if(bdone):
            return None
    else:
        if(wdone):
            return None
    return dqn.EstimatePolicy(obs,turn,actions)
def InsertBuffer(turn,oldobs, action, reward,obs,done,actions):
    dqn.InsertBuffer(oldobs, action, reward,obs,done,actions,turn)

for episode in range(1,500):
    bdone = False
    wdone = False
    obs, reward, done, _, info = env.reset()
    steps = 0
    while (not bdone) or (not wdone):
        actions = info['action']
        turn = info['turn']
        oldobs = obs
        action = GetPolicy(bdone,wdone,turn,obs,actions)
        obs, reward, done, _, info = env.step(action)
        if done:
            if info['turn'] ==2:
                wdone = True
            else:
                bdone = True
        if(action is not None):
            InsertBuffer(turn,oldobs, action, reward,obs,done,info['action'])
        steps += 1
    loss = dqn.train()
    if loss is not None:
        print(f"Episode {episode + 1},=Loss: {loss}\n")
    if(episode % 30 == 0):
        print("update Target Model")
        dqn.update_target_model()
    if(episode % 100 == 0):
        print('save \n')
        dqn.save(f"model/Greedy/Model_{count}.weights.h5")
        count+=1

## DQN + UCT Policy learning

In [None]:

env.metadata['autoplay'] = True
env.metadata['render_fps'] = 1500000
obs, reward, bdone, _, info = env.reset()
dqn = DQN(state_shape,env.action_space.n)

version = -1
#dqn.load( f"model/UCT/Model_{version}.weights.h5")
Count = version +1
def GetPolicy(bdone,wdone,turn,env,obs,actions):
    if(not actions):
        return None
    if(turn==1):
        if(bdone):
            return None
    else:
        if(wdone):
            return None
    return dqn.BehaviorPolicy(env,obs,turn,actions)
def InsertBuffer(turn,oldobs, action, reward,obs,done,actions):
    dqn.InsertBuffer(oldobs, action, reward,obs,done,actions,turn)
for episode in range(1,400):
    bdone = False
    wdone = False
    obs, reward, done, _, info = env.reset()
    steps = 0
    while (not bdone) or (not wdone):
        actions = info['action']
        turn = info['turn']
        oldobs = obs
        action = GetPolicy(bdone,wdone,turn,env,obs,actions)
        obs, reward, done, _, info = env.step(action)
        if done:
            if info['turn'] ==2:
                wdone = True
            else:
                bdone = True
        if(action is not None):
            InsertBuffer(turn,oldobs, action, reward,obs,done,actions)
        steps += 1
    loss = dqn.train()
    if loss is not None:
        print(f"Episode {episode + 1}, Loss: {loss}\n")
    if(episode % 30 == 0):
        print("update Target Model")
        dqn.update_target_model()
    if(episode % 100 == 0):
        print('save ')
        dqn.save(f"model/UCT/Model_{Count}.weights.h5")
        Count+=1

## 경쟁 학습

In [3]:
env.metadata['autoplay'] = True
env.metadata['render_fps'] = 150000
obs, reward, bdone, _, info = env.reset()
state_shape = (8, 8, 1)  # Adding the channel dimension
Blackdqn = DQN(state_shape,env.action_space.n)
Whitedqn = DQN(state_shape,env.action_space.n)

version = -1
# Blackdqn.load( f"model/UCT/Black/Model_{version}.weights.h5")
# Whitedqn.load(f"model/UCT/White/Model_{version}.weights.h5")
BlackmodelCount = version +1
WhitemodelCount = version +1
def GetPolicy(bdone,wdone,turn,env,obs,actions):
    if(not actions):
        return None
    if(turn==1):
        if(bdone):
            return None
        return Blackdqn.BehaviorPolicy(env,obs,turn,actions)
    else:
        if(wdone):
            return None
        return Whitedqn.BehaviorPolicy(env,obs,turn,actions)
def InsertBuffer(turn,oldobs, action, reward,obs,done,actions):
    Blackdqn.InsertBuffer(oldobs, action, reward,obs,done,actions,turn)
    Whitedqn.InsertBuffer(oldobs, action, reward,obs,done,actions,turn)
for episode in range(1,10000):
    bdone = False
    wdone = False
    obs, reward, done, _, info = env.reset()
    steps = 0
    while (not bdone) or (not wdone):
        actions = info['action']
        turn = info['turn']
        oldobs = obs
        action = GetPolicy(bdone,wdone,turn,env,obs,actions)
        obs, reward, done, _, info = env.step(action)
        if done:
            if info['turn'] ==2:
                wdone = True
            else:
                bdone = True
        if(action is not None):
            InsertBuffer(turn,oldobs, action, reward,obs,done,actions)
        steps += 1
    blakloss = Blackdqn.train()
    whiteloss = Whitedqn.train()
    if blakloss is not None:
        print(f"Episode {episode + 1}, BlackLoss: {blakloss}\n")
        Blackdqn.update_target_model()
    if whiteloss is not None:
        print(f"Episode {episode + 1}, WhiteLoss: {whiteloss}\n")
        Whitedqn.update_target_model()
    if(episode % 30 == 0):
        print("update Target Model")
        Blackdqn.update_target_model()
        Whitedqn.update_target_model()
    if(episode % 100 == 0):
        print('save ')
        Whitedqn.save(f"model/BlackModel_{WhitemodelCount}.weights.h5")
        Blackdqn.save(f"model/WhiteModel_{BlackmodelCount}.weights.h5")
        BlackmodelCount+=1
        WhitemodelCount+=1

Episode 4, BlackLoss: 1.614793300628662

Episode 4, WhiteLoss: 1.7577934265136719

Episode 6, BlackLoss: 1.349267601966858

Episode 6, WhiteLoss: 1.512926697731018

Episode 8, BlackLoss: 1.1248804330825806

Episode 8, WhiteLoss: 1.2767577171325684

Episode 10, BlackLoss: 0.9338687658309937

Episode 10, WhiteLoss: 1.0718914270401

Episode 12, BlackLoss: 0.7854718565940857

Episode 12, WhiteLoss: 0.9076473116874695

Episode 14, BlackLoss: 0.6724572777748108

Episode 14, WhiteLoss: 0.7806628346443176

Episode 16, BlackLoss: 0.5856212973594666

Episode 16, WhiteLoss: 0.6815481781959534

Episode 19, BlackLoss: 0.5188606977462769

Episode 19, WhiteLoss: 0.6032830476760864

Episode 21, BlackLoss: 0.46567270159721375

Episode 21, WhiteLoss: 0.5406937003135681

Episode 23, BlackLoss: 0.4225471615791321

Episode 23, WhiteLoss: 0.4898558557033539

Episode 25, BlackLoss: 0.3868508040904999

Episode 25, WhiteLoss: 0.4479089677333832

Episode 27, BlackLoss: 0.35697707533836365

Episode 27, WhiteLoss

KeyboardInterrupt: 

# 테스트

**방법**
1. load를 통해 원하는 모델 선택
2. BlackPlay를 지정해 돌 색 선택
   
**notice**

더 이상 둘 수 있는 수가 없다면 자동으로 턴이 넘어 갑니다.
게임이 완료되었다는 문구 출력을 만들지 않았습니다.

In [2]:
import random
import time

BlackPlay = False # 사람이 무슨 색 돌로 시작할 건지

env.metadata['render_fps'] = 60
env.metadata['autoplay'] = not BlackPlay
obs, reward, done, _, info = env.reset()
dqn = DQN(state_shape,env.action_space.n)
dqn.load('model/UCT/Black/BlackModel_6.weights.h5') # 테스트 모델 선택
bdone = False
wdone = False
def GetPolicy(bdone,wdone,turn,env,obs,actions):
    if(not actions):
        return None
    if(turn==1):
        if(bdone):
            return None
    else:
        if(wdone):
            return None
    if(not env.metadata['autoplay']):
        return None
    return dqn.EstimatePolicy(obs,turn,actions)

while (not bdone) or (not wdone):
    actions = info['action']
    turn = info['turn']
    oldobs = obs
    time.sleep(0.5)
    action = GetPolicy(bdone,wdone,turn,env,obs,actions)
    obs, reward, done, _, info = env.step(action)
    env.metadata['autoplay'] = not env.metadata['autoplay']
    env.render()
    if done:
        if info['turn'] ==2:
            wdone = True
        else:
            bdone = True
print('Done')

  logger.warn(
  logger.warn(


Done


In [None]:
env.close()