In [1]:
import gym
from collections import namedtuple
import numpy as np
import math
import random

import torch
import torch.nn as nn
import torch.optim as optim

import matplotlib.pyplot as plt

In [2]:
# проблема: учит двигаться в одну сторону, напр то 0 до 60
# решение: на вых направление [0, 1], скорость [45]

In [3]:
# actions = [0, 1] = [-1, 1]

class Game:
    def __init__(self, angle_degree, distance):
        self.angle_degree = angle_degree
        self.distance = distance
        self.velocity = random.randint(1, 200)
        self.distance_to_target = distance
        self.record_distance = distance / 4
        self.n_games = 0
        self.epsilon = 0
        
    def reset(self):
#         self.velocity = random.choice(range(200))
        self.velocity = random.randint(1, 200)
        self.distance_to_target = self.distance
        self.record_distance = self.distance / 4
#         print('reset')
        return np.array([self.angle_degree, self.distance, self.distance_to_target, self.velocity])

    def calculate_distance_to_target(self, v0, angle_degree):
        g = 9.8
        angle_radian = math.radians(angle_degree)
        distance = (v0**2 * math.sin(2 * angle_radian)) / g
        return distance
    
    def step(self, action):
        
        self.n_games += 1

        if action == 0:
            self.velocity -= 1
        else:
            self.velocity += 1

        # move
        calc_distance = self.calculate_distance_to_target(self.velocity, self.angle_degree)

        new_distance_to_target = abs(self.distance - calc_distance)

        # reward
#         reward = abs((self.distance - distance) / distance_to_target**2)
        if new_distance_to_target < self.distance_to_target:
            self.distance_to_target = new_distance_to_target
            reward = 1
        else:
            reward = 0
            
        if new_distance_to_target < self.record_distance or new_distance_to_target > self.distance * 2:
            self.record_distance = new_distance_to_target
            is_done = True
        else:
            is_done = False
            
        # update ui
#         print('distance_to_target', new_distance_to_target)
        
        state = np.array([self.angle_degree, self.distance, new_distance_to_target, self.velocity])

        return state, reward, is_done

In [4]:
HIDDEN_SIZE = 128
BATCH_SIZE = 128
PERCENTILE = 70

In [5]:
class Net(nn.Module):
    def __init__(self, state_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )
    
    def forward(self, x):
        return self.net(x)

In [6]:
# sum of rewards in episode
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
# one step in episode
Step = namedtuple('Step', field_names=['state', 'action'])

In [7]:
def iterate_batches(env, model, batch_size):
    batch = [] 
    episode_reward = 0.0
    episode_steps = [] 
    state = env.reset()
    softmax = nn.Softmax(dim=1)
    
    while True:
        env.epsilon = 180 - env.n_games
        if random.randint(0, 1200) < env.epsilon:
            action = random.randint(0, 1)
        else:
            state_v = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            pred = model(state_v)
            act_probs_v = softmax(pred)
            act_probs = act_probs_v.data.numpy()[0]

            action = np.random.choice(len(act_probs), p=act_probs)
        
        next_state, reward, is_done = env.step(action)
        
        episode_reward += reward
        episode_steps.append(Step(state=state, action=action))
        
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_state = env.reset()
            
            if len(batch) == batch_size:
                yield batch
                batch = []
                
        state = next_state

In [8]:
def filter_batch(batch, percentile):
    rewards = list(map(lambda s: s.reward, batch))
    reward_bound = np.percentile(rewards, percentile) 
    reward_mean = float(np.mean(rewards))
    
    train_states = []
    train_actions = []
    
    for el in batch:
        if el.reward < reward_bound:
            continue
        
        train_states.extend(map(lambda step: step.state, el.steps))
        train_actions.extend(map(lambda step: step.action, el.steps))
        
    train_states_v = torch.tensor(train_states, dtype=torch.float)
    train_actions_v = torch.tensor(train_actions, dtype=torch.long)
    
    return train_states_v, train_actions_v, reward_bound, reward_mean

In [9]:
env = Game(30, 100)
state_size = 4
n_actions = 2

log_loss = []
log_reward_bound = []
log_reward_mean = []

model = Net(state_size, HIDDEN_SIZE, n_actions)
loss_fn = nn.CrossEntropyLoss()
# loss_fn = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

for i, batch in enumerate(iterate_batches(env, model, BATCH_SIZE)):
    states_v, actions_v, reward_bound, reward_mean = filter_batch(batch, PERCENTILE)
    
    optimizer.zero_grad()
    
    action_scores_v = model(states_v)
    loss_v = loss_fn(action_scores_v, actions_v)
    
    loss_v.backward()
    optimizer.step()
    
    print('%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f' % (i, loss_v.item(), reward_mean, reward_bound))
    
    log_loss.append(loss_v.item())
    log_reward_bound.append(reward_bound)
    log_reward_mean.append(reward_mean)
                           
    if reward_mean > 199:
        print('Solved!')
        break

  train_states_v = torch.tensor(train_states, dtype=torch.float)


0: loss=0.625, reward_mean=2.5, reward_bound=0.0
1: loss=0.000, reward_mean=2.7, reward_bound=0.0
2: loss=0.000, reward_mean=2.6, reward_bound=0.0
3: loss=0.000, reward_mean=2.4, reward_bound=0.0
4: loss=0.000, reward_mean=2.1, reward_bound=0.0
5: loss=0.000, reward_mean=2.2, reward_bound=0.0
6: loss=0.000, reward_mean=2.5, reward_bound=0.0
7: loss=0.000, reward_mean=2.3, reward_bound=0.0
8: loss=0.000, reward_mean=1.8, reward_bound=0.0
9: loss=0.000, reward_mean=2.7, reward_bound=0.0
10: loss=0.000, reward_mean=2.9, reward_bound=0.0
11: loss=0.000, reward_mean=2.3, reward_bound=0.0
12: loss=0.000, reward_mean=1.9, reward_bound=0.0
13: loss=0.000, reward_mean=1.8, reward_bound=0.0
14: loss=0.000, reward_mean=2.7, reward_bound=0.0
15: loss=0.000, reward_mean=3.1, reward_bound=0.0
16: loss=0.000, reward_mean=2.0, reward_bound=0.0
17: loss=0.000, reward_mean=2.2, reward_bound=0.0
18: loss=0.000, reward_mean=2.4, reward_bound=1.0
19: loss=0.000, reward_mean=2.0, reward_bound=0.0
20: loss=0

164: loss=0.000, reward_mean=1.7, reward_bound=0.0
165: loss=0.000, reward_mean=2.9, reward_bound=0.0
166: loss=0.000, reward_mean=2.3, reward_bound=0.0
167: loss=0.000, reward_mean=2.7, reward_bound=0.0
168: loss=0.000, reward_mean=2.5, reward_bound=0.0
169: loss=0.000, reward_mean=2.7, reward_bound=1.0
170: loss=0.000, reward_mean=2.9, reward_bound=0.0
171: loss=0.000, reward_mean=2.5, reward_bound=0.0
172: loss=0.000, reward_mean=1.5, reward_bound=0.0
173: loss=0.000, reward_mean=2.0, reward_bound=0.0
174: loss=0.000, reward_mean=2.3, reward_bound=0.0
175: loss=0.000, reward_mean=2.1, reward_bound=0.0
176: loss=0.000, reward_mean=2.1, reward_bound=0.0
177: loss=0.000, reward_mean=1.9, reward_bound=0.0
178: loss=0.000, reward_mean=2.2, reward_bound=0.0
179: loss=0.000, reward_mean=3.2, reward_bound=1.0
180: loss=0.000, reward_mean=2.0, reward_bound=0.0
181: loss=0.000, reward_mean=2.4, reward_bound=0.0
182: loss=0.000, reward_mean=1.6, reward_bound=0.0
183: loss=0.000, reward_mean=1.

325: loss=0.000, reward_mean=2.9, reward_bound=0.0
326: loss=0.000, reward_mean=2.2, reward_bound=0.0
327: loss=0.000, reward_mean=1.7, reward_bound=0.0
328: loss=0.000, reward_mean=2.4, reward_bound=0.0
329: loss=0.000, reward_mean=1.9, reward_bound=0.0
330: loss=0.000, reward_mean=1.6, reward_bound=0.0
331: loss=0.000, reward_mean=2.2, reward_bound=0.0
332: loss=0.000, reward_mean=2.3, reward_bound=0.0
333: loss=0.000, reward_mean=2.6, reward_bound=0.0
334: loss=0.000, reward_mean=3.0, reward_bound=0.0
335: loss=0.000, reward_mean=2.7, reward_bound=0.0
336: loss=0.000, reward_mean=1.8, reward_bound=0.0
337: loss=0.000, reward_mean=2.0, reward_bound=0.0
338: loss=0.000, reward_mean=2.6, reward_bound=0.0
339: loss=0.000, reward_mean=2.0, reward_bound=0.0
340: loss=0.000, reward_mean=1.7, reward_bound=0.0
341: loss=0.000, reward_mean=2.7, reward_bound=0.0
342: loss=0.000, reward_mean=3.2, reward_bound=0.0
343: loss=0.000, reward_mean=2.1, reward_bound=0.0
344: loss=0.000, reward_mean=1.

486: loss=0.000, reward_mean=2.2, reward_bound=0.0
487: loss=0.000, reward_mean=1.9, reward_bound=0.0
488: loss=0.000, reward_mean=1.8, reward_bound=0.0
489: loss=0.000, reward_mean=2.5, reward_bound=0.0
490: loss=0.000, reward_mean=1.8, reward_bound=0.0
491: loss=0.000, reward_mean=2.0, reward_bound=0.0
492: loss=0.000, reward_mean=2.4, reward_bound=0.0
493: loss=0.000, reward_mean=2.6, reward_bound=0.0
494: loss=0.000, reward_mean=1.7, reward_bound=0.0
495: loss=0.000, reward_mean=2.7, reward_bound=0.0
496: loss=0.000, reward_mean=1.7, reward_bound=0.0
497: loss=0.000, reward_mean=2.6, reward_bound=0.0
498: loss=0.000, reward_mean=1.6, reward_bound=0.0
499: loss=0.000, reward_mean=2.7, reward_bound=0.0
500: loss=0.000, reward_mean=1.8, reward_bound=0.0
501: loss=0.000, reward_mean=1.9, reward_bound=0.0
502: loss=0.000, reward_mean=1.6, reward_bound=0.0
503: loss=0.000, reward_mean=2.1, reward_bound=0.0
504: loss=0.000, reward_mean=2.7, reward_bound=0.0
505: loss=0.000, reward_mean=2.

KeyboardInterrupt: 

In [None]:
log_reward_mean, log_loss