In [5]:
# Notebook-global log of game outcomes. play_game() appends one entry per
# finished game: 1 (first agent won), 0 (draw) or -1 (second agent won).
rewards = []

In [6]:
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

class NeuralAgent:
    """Q-learning agent for a 9-cell board whose Q-function is a small Keras MLP.

    A board is a length-9 sequence in which 0 marks an empty cell; any other
    value marks an occupied one. The network maps a board to 9 Q-values, one
    per cell index (action).

    Args:
        alpha: blending factor between the old Q-value and the new target.
        epsilon: probability of picking a random legal action (exploration).
        gamma: discount factor applied to the best next-state Q-value.
    """

    # Index triples that form a complete row, column or diagonal on the 3x3 board.
    _LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self, alpha=0.5, epsilon=0.3, gamma=0.9):
        self.alpha = alpha  # learning-rate-like blending factor
        self.epsilon = epsilon  # exploration rate
        self.gamma = gamma  # discount factor
        # Kept for backward compatibility; none of the methods below read it.
        self.q_table = {}
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the 9 -> 64 -> 64 -> 9 fully connected Q-network."""
        model = Sequential()
        model.add(Dense(64, input_dim=9, activation='relu'))  # first hidden layer, fed by the 9 board cells
        model.add(Dense(64, activation='relu'))  # second hidden layer
        model.add(Dense(9, activation='linear'))  # one linear output (Q-value) per action
        model.compile(loss='mean_squared_error', optimizer='adam')
        return model

    def get_possible_actions(self, state):
        """Return the indices of all empty (== 0) cells of ``state``."""
        return [i for i, value in enumerate(state) if value == 0]

    @staticmethod
    def _as_batch(state):
        """Convert one board into the (1, n_cells) float batch the network expects."""
        return np.asarray(state, dtype=np.float32).reshape(1, -1)

    def _is_terminal(self, state):
        """Return True when ``state`` has a completed line or no empty cell left."""
        for a, b, c in self._LINES:
            if state[a] != 0 and state[a] == state[b] == state[c]:
                return True
        return not self.get_possible_actions(state)

    def choose_action(self, state, possible_actions):
        """Pick an action epsilon-greedily among ``possible_actions``.

        Args:
            state: the current board.
            possible_actions: non-empty collection of legal cell indices.

        Returns:
            The chosen cell index as a plain ``int``.

        Raises:
            ValueError: if ``possible_actions`` is empty (there is nothing to
                choose; previously the greedy branch silently returned 0).
        """
        if len(possible_actions) == 0:
            raise ValueError("possible_actions must not be empty")

        if random.uniform(0, 1) < self.epsilon:
            return random.choice(possible_actions)  # explore

        # verbose=0 keeps Keras from printing a progress bar on every move.
        q_values = self.model.predict(self._as_batch(state), verbose=0)[0]
        legal = list(possible_actions)
        # Illegal actions stay at -inf so argmax can only land on a legal one.
        masked = np.full(len(q_values), -np.inf)
        masked[legal] = q_values[legal]
        return int(np.argmax(masked))

    def learn(self, state, action, reward, next_state):
        """Run one Q-learning update for the transition (state, action) -> next_state.

        The target is ``reward`` alone when ``next_state`` is a finished board
        (completed line or no empty cell), otherwise
        ``reward + gamma * max Q(next_state, a)`` with the max restricted to
        the empty cells of ``next_state``. The stored Q-value is moved towards
        that target by ``alpha`` and the network is fitted for one epoch on it.
        """
        target = reward
        if not self._is_terminal(next_state):
            # Bootstrap only from moves that can actually be played next;
            # outputs for occupied cells are never trained and would bias the max.
            legal_next = self.get_possible_actions(next_state)
            next_q = self.model.predict(self._as_batch(next_state), verbose=0)[0]
            target = reward + self.gamma * float(np.max(next_q[legal_next]))

        state_batch = self._as_batch(state)
        q_values = self.model.predict(state_batch, verbose=0)
        q_values[0][action] = (1 - self.alpha) * q_values[0][action] + self.alpha * target
        self.model.fit(state_batch, q_values, epochs=1, verbose=0)

In [7]:
def check(state, symbol):
    """Look for a line that ``symbol`` can complete with one more move.

    Each entry of ``winning_combinations`` is ``[a, b, c]``: if cells ``a``
    and ``b`` both hold ``symbol`` and cell ``c`` is still empty (0), then
    playing ``c`` completes the line.

    Args:
        state: board as a length-9 sequence (0 = empty cell).
        symbol: the mark to search for.

    Returns:
        ``{True: c}`` with the first completing cell found, otherwise
        ``{False: n}`` where ``n`` is only a placeholder (the first index of
        the last combination, as before) that callers should ignore.
    """
    winning_combinations = [
        # rows
        [0, 1, 2], [1, 2, 0], [3, 4, 5], [4, 5, 3], [6, 7, 8], [7, 8, 6], [0, 2, 1], [3, 5, 4], [6, 8, 7],
        # columns
        [0, 3, 6], [3, 6, 0], [2, 5, 8], [5, 8, 2], [1, 4, 7], [4, 7, 1], [0, 6, 3], [1, 7, 4], [2, 8, 5],
        # diagonals
        [0, 8, 4], [0, 4, 8], [4, 8, 0], [2, 4, 6], [4, 6, 2], [2, 6, 4],
    ]

    for first, second, target in winning_combinations:
        # The target cell must be free: without this check an occupied cell
        # could be reported and the caller would overwrite an existing mark.
        if state[first] == state[second] == symbol and state[target] == 0:
            return {True: target}
    return {False: winning_combinations[-1][0]}

In [8]:
class DefaultAgent:
    """Scripted opponent playing the -1 mark.

    It plays the cell suggested by ``check(state, -1)`` when that cell is a
    legal move, and a uniformly random legal move otherwise.
    """

    def get_possible_actions(self, state):
        """Return the indices of all empty (== 0) cells of ``state``."""
        return [i for i, value in enumerate(state) if value == 0]

    def choose_action(self, state, possible_actions):
        """Choose a move for the -1 player.

        Args:
            state: the current board.
            possible_actions: non-empty list of legal cell indices.

        Returns:
            A cell index taken from ``possible_actions``.
        """
        # check() answers with a single-entry dict: {True: cell} or {False: placeholder}.
        suggested = check(state, -1).get(True)
        # Only trust the suggestion when it is a legal move; an occupied cell
        # must never be returned, or the caller would overwrite a mark.
        if suggested is not None and suggested in possible_actions:
            return suggested
        return random.choice(possible_actions)

In [9]:
def play_game(agent1, agent2):
    """Play one game between ``agent1`` (mark 1, learns) and ``agent2`` (mark -1).

    ``agent1`` moves first. After each of its moves it receives one
    ``learn(state, action, reward, next_state)`` call, where ``state`` is the
    board the action was chosen on and ``next_state`` is the board the agent
    sees next (after the opponent's reply) or the final board:

    * reward 1 when ``agent1`` completes a line,
    * reward 0 on a draw or when the game simply continues,
    * reward -1 when ``agent2`` completes a line.

    The final outcome (1, 0 or -1) is appended to the global ``rewards`` list.

    Returns:
        True once the game has finished.
    """
    state = [0, 0, 0, 0, 0, 0, 0, 0, 0]  # empty board
    game_over = False

    while not game_over:
        # First agent's move. Snapshot the board *before* it is modified so
        # that learn() updates the Q-value of the position the action was
        # actually chosen in.
        state_before = tuple(state)
        action1 = agent1.choose_action(state_before, agent1.get_possible_actions(state))
        state[action1] = 1

        # Win / draw check after the first agent's move.
        if check_winner(state, 1):
            rewards.append(1)
            agent1.learn(state_before, action1, 1, tuple(state))
            game_over = True
            break

        if 0 not in state:  # draw: board is full
            rewards.append(0)
            agent1.learn(state_before, action1, 0, tuple(state))
            game_over = True
            break

        # Second agent's move. Legal moves depend only on the board, so the
        # first agent's helper is reused and agent2 only needs choose_action().
        action2 = agent2.choose_action(tuple(state), agent1.get_possible_actions(state))
        state[action2] = -1

        # Loss check after the second agent's move. No draw check is needed
        # here: the first agent always fills the ninth cell.
        if check_winner(state, -1):
            rewards.append(-1)
            agent1.learn(state_before, action1, -1, tuple(state))
            game_over = True
            break

        # The game goes on: learn from the intermediate transition as well,
        # otherwise terminal rewards never propagate back to earlier moves.
        agent1.learn(state_before, action1, 0, tuple(state))

    return game_over

def check_winner(state, symbol):
    """Return True if ``symbol`` fills any complete row, column or diagonal.

    ``state`` is the board as a length-9 sequence indexed row by row.
    """
    lines = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),             # diagonals
    )
    return any(state[a] == state[b] == state[c] == symbol for a, b, c in lines)

In [None]:
# Scale factor for the experiment: the training cell plays 1000 * training_volume
# games and groups the recorded outcomes into buckets of training_volume games.
training_volume = 1

In [None]:
agent1 = NeuralAgent()
agent2 = DefaultAgent()

total_games = 1000 * training_volume

# `rewards` is a notebook-global list that play_game() appends to. Remember
# where this run starts so that re-running the cell analyses only the games of
# the freshly created agent instead of mixing in earlier runs.
first_game = len(rewards)

for i in range(total_games):
    play_game(agent1, agent2)

print("Studying complete!")

# Number of wins of agent1 in each consecutive bucket of `training_volume` games.
analysis_rewards = []

counter = 0
for index, data in enumerate(rewards[first_game:]):
    if data == 1:
        counter = counter + 1
    # Close a bucket after every `training_volume`-th game. Testing `index`
    # itself would close the first bucket after a single game and silently
    # drop the trailing games whenever training_volume > 1.
    if (index + 1) % training_volume == 0:
        analysis_rewards.append(counter)
        counter = 0