In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm_notebook as tqdm

from environment import Env

In [2]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Factor that learn() multiplies the target network's outputs by before
# storing them as regression targets.
ALPHA = 0.9

In [22]:
class net(nn.Module):
    """Two-layer convolutional scorer that maps a board to 225 per-cell values.

    The linear layer expects 288 flattened features, which is what the two
    convolutions produce for a (N, 2, 15, 15) input (8 channels of 6x6).
    No activation is applied between layers, and the ``softmax`` module is
    constructed but not used in ``forward``: raw linear outputs are returned.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=2, out_channels=4, kernel_size=3)
        self.conv2 = nn.Conv2d(
            in_channels=4, out_channels=8, kernel_size=5, stride=2, padding=1
        )
        self.flat = nn.Flatten()
        self.linear = nn.Linear(in_features=288, out_features=225)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the raw (N, 225) scores for a batch of boards ``x``."""
        features = self.conv2(self.conv1(x))
        return self.linear(self.flat(features))

In [23]:
def player_move(player, model_eval, model_target, game, device):
    """Let ``player`` place one stone, chosen greedily from ``model_eval``.

    Args:
        player: Player identifier forwarded to ``game.move``.
        model_eval: Callable scoring the board; its output is reshaped to
            225 values and the highest-scoring free cell is played.
        model_target: Callable scoring the same board; its output is returned
            (with occupied cells zeroed) for the caller to build targets from.
        game: Object providing ``observe() -> (chess_map, not_pos)`` and
            ``move(player, row, col)``. ``not_pos`` is indexed as an array of
            (row, col) pairs marking cells that may not be played.
        device: Torch device the board tensor is moved to before inference.

    Returns:
        Tuple ``(chess_map, state, out_target, move_pos)``: the observed board,
        whatever ``game.move`` returned, the masked 225-element target-network
        output (numpy array), and the chosen ``(row, col)`` position.

    Note:
        If every cell is masked, ``argmax`` falls back to index 0; callers are
        expected to stop before the board is full.
    """
    chess_map, not_pos = game.observe()
    situ = torch.Tensor(chess_map).to(device)
    situ = torch.reshape(situ, (1, 2, 15, 15))

    # Pure inference: no autograd graph is needed for choosing a move.
    with torch.no_grad():
        out_eval = model_eval(situ)
        out_target = model_target(situ)

    # Copy so that masking below never writes through to the model outputs.
    move_matrix = out_eval.detach().cpu().numpy().reshape((225,)).copy()
    out_target = out_target.detach().cpu().numpy().reshape((225,)).copy()

    # Convert (row, col) pairs into flat indices; tolerate an empty list of
    # blocked cells (e.g. on an empty board).
    blocked = np.asarray(not_pos)
    if blocked.size == 0:
        blocked_flat = np.empty(0, dtype=np.intp)
    else:
        blocked_flat = (blocked[:, 0] * 15 + blocked[:, 1]).astype(np.intp)

    # Exclude blocked cells outright: a finite penalty could still win the
    # argmax when every legal score is lower than the penalised one.
    move_matrix[blocked_flat] = -np.inf
    out_target[blocked_flat] = 0.0

    max_pos = int(np.argmax(move_matrix))
    move_pos = np.unravel_index(max_pos, chess_map[0, :, :].shape)

    # Place the stone.
    state = game.move(player, move_pos[0], move_pos[1])
    return chess_map, state, out_target, move_pos

In [35]:
def _memory_to_loader(situations, targets, batch_size=8):
    """Pack one player's recorded (board, target) pairs into a DataLoader."""
    situ_tensor = torch.tensor(np.array(situations), dtype=torch.float32)
    target_tensor = torch.tensor(np.array(targets), dtype=torch.float32)
    return DataLoader(TensorDataset(situ_tensor, target_tensor), batch_size=batch_size)


def learn(epoch):
    """Train two competing networks through self-play.

    For each of ``epoch`` games, player 0 and player 1 alternate moves chosen
    by their own evaluation network. Every position a player saw is stored
    together with ``ALPHA`` times its target network's output; when a move ends
    the game, the mover's last target gets +1 at the played cell and the
    opponent's last target gets -0.9 at the cell the opponent last played.
    The move list of each game is written to ``move_dir/game_<i>.csv`` and each
    player's evaluation network is then fitted to its own memory.

    Args:
        epoch: Number of self-play games to run.
    """
    board_size = 15
    # A board holds at most one stone per cell, so a game cannot be longer.
    max_plies = board_size * board_size

    # Index 0 is the first mover, index 1 the second.
    evals = (net().to(device), net().to(device))
    targets = (net().to(device), net().to(device))
    criterion = nn.MSELoss(reduction='mean')
    optimizers = tuple(optim.Adam(model.parameters(), lr=0.001) for model in evals)

    # The per-game move logs are written here; make sure the folder exists.
    os.makedirs('move_dir', exist_ok=True)

    for i in range(epoch):
        if i % 5 == 0:
            # Refresh the target networks with a *copy* of the current weights.
            # Rebinding the name to the eval network would make both names
            # refer to one module and the target would never lag behind.
            for target_model, eval_model in zip(targets, evals):
                target_model.load_state_dict(eval_model.state_dict())

        # Play one game.
        memory_situation = ([], [])
        memory_target = ([], [])
        last_move = [None, None]  # flat index of each player's latest move
        moves = []
        game = Env()
        for ply in range(max_plies):
            player = ply % 2
            opponent = 1 - player
            chess_map, state, target, move_pos = player_move(
                player, evals[player], targets[player], game, device)
            moves.append((player, move_pos[0], move_pos[1]))
            memory_situation[player].append(chess_map.tolist())
            memory_target[player].append((target * ALPHA).tolist())
            last_move[player] = int(move_pos[0]) * board_size + int(move_pos[1])
            if state == True:  # noqa: E712 - keep the exact comparison Env relies on
                # Reward the move that ended the game.
                memory_target[player][-1][last_move[player]] += 1
                # Penalise the opponent's last move, if the opponent has moved.
                if last_move[opponent] is not None:
                    memory_target[opponent][-1][last_move[opponent]] += -0.9
                break

        move_memory = pd.DataFrame(moves, columns=['player', 'move_x', 'move_y'])
        move_memory.to_csv('move_dir/game_' + str(i) + '.csv', sep=',')

        # Fit each evaluation network on that player's own memory.
        for player in (0, 1):
            if not memory_situation[player]:
                continue  # nothing recorded for this player in this game
            loader = _memory_to_loader(memory_situation[player], memory_target[player])
            train(evals[player], loader, optimizers[player], criterion, device, 10, i)

    print("done")

In [36]:
def train(model, dataloader, optimizer, criterion, device, epoch, step, log_path='game_log.csv'):
    """Fit ``model`` on ``dataloader`` for ``epoch`` passes and log the losses.

    Args:
        model: Network to optimise; put into training mode on every pass.
        dataloader: Iterable of batches whose first two items are the inputs
            and the regression targets.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable applied as ``criterion(output, target)``.
        device: Torch device the batches are moved to.
        epoch: Number of passes over ``dataloader``.
        step: Identifier of the calling iteration, stored in the log only.
        log_path: CSV file that receives one row (step, epoch, loss) per pass.
            Rows are appended so that earlier calls are not lost; the header
            is written only when the file is new or empty.
    """
    records = []
    for i in range(epoch):
        model.train()
        train_loss = 0.0
        for batch in dataloader:
            optimizer.zero_grad()
            situ, target = batch[0].to(device), batch[1].to(device)
            output = model(situ)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            # .item() yields a plain float: no autograd history is retained
            # and an empty dataloader simply logs a loss of 0.0.
            train_loss += loss.item()
        records.append({'step': step, 'epoch': i, 'loss': train_loss})

    game_log = pd.DataFrame(records, columns=['step', 'epoch', 'loss'])
    write_header = not os.path.exists(log_path) or os.path.getsize(log_path) == 0
    game_log.to_csv(log_path, sep=',', index=False, mode='a', header=write_header)

In [38]:
# Run 2000 self-play games, training both players after each game.
learn(2000)

KeyboardInterrupt: 

In [132]:
# Report how many CUDA devices PyTorch can see.
torch.cuda.device_count()

1