In [1]:
# Create and start a virtual display (visible=0, 400x300 screen).
# NOTE(review): presumably required so the MineRL environment can render on a
# machine without a physical screen — confirm before removing.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(400, 300))
display.start()

<pyvirtualdisplay.display.Display at 0x7f7dce19cf28>

In [2]:
import os
import random
from collections import deque

import minerl
import numpy as np
from sklearn.cluster import KMeans



In [3]:
class Memory(object):
    """Fixed-capacity FIFO replay buffer backed by a ``collections.deque``.

    Once ``memory_size`` experiences are stored, adding another one silently
    drops the oldest entry (``deque(maxlen=...)`` semantics).
    """

    def __init__(self, memory_size: int) -> None:
        """Create an empty buffer that holds at most ``memory_size`` experiences."""
        self.memory_size = memory_size
        self.buffer = deque(maxlen=self.memory_size)

    def add(self, experience) -> None:
        """Append one experience, evicting the oldest one if the buffer is full."""
        self.buffer.append(experience)

    def size(self) -> int:
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def sample(self, batch_size: int):
        """Return ``batch_size`` distinct experiences drawn uniformly at random.

        If fewer than ``batch_size`` experiences are stored, every stored
        experience is returned (in random order). Uses the global NumPy RNG.
        """
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        indexes = np.random.choice(np.arange(len(self.buffer)), size=batch_size, replace=False)
        return [self.buffer[i] for i in indexes]

    def clear(self) -> None:
        """Remove every stored experience."""
        self.buffer.clear()

    def _as_object_array(self):
        """Pack the buffer into a NumPy object array without shape inference.

        ``np.asarray`` on heterogeneous experiences (dicts, arrays and scalars
        mixed in one tuple) is a ragged conversion: deprecated since NumPy 1.19
        and a ``ValueError`` since 1.24. Filling a pre-allocated object array
        cell by cell avoids that.

        When every experience is a tuple/list of the same length the result is
        2-D, ``(n_experiences, n_fields)``; otherwise it is 1-D with one
        experience per element.
        """
        items = list(self.buffer)
        uniform = (
            bool(items)
            and all(isinstance(exp, (tuple, list)) for exp in items)
            and len({len(exp) for exp in items}) == 1
        )
        if uniform:
            packed = np.empty((len(items), len(items[0])), dtype=object)
            for row, exp in enumerate(items):
                for col, field in enumerate(exp):
                    packed[row, col] = field
        else:
            packed = np.empty(len(items), dtype=object)
            for row, exp in enumerate(items):
                packed[row] = exp
        return packed

    def save(self, path) -> None:
        """Write the buffer to ``path`` with ``np.save`` and print the array shape.

        ``np.save`` appends ``.npy`` to ``path`` when it is not already there.
        The array has dtype ``object``, so the file contains pickled data.
        """
        b = self._as_object_array()
        print(b.shape)
        np.save(path, b)

    def load(self, path) -> None:
        """Append every row stored in the ``.npy`` file at ``path`` to the buffer.

        ``path`` may be given with or without the ``.npy`` suffix, mirroring
        what ``save`` accepts. Existing contents are kept; the usual eviction
        rule applies if the file holds more rows than the remaining capacity.

        SECURITY: ``allow_pickle=True`` unpickles arbitrary objects, which can
        execute code. Only load files produced by a trusted source.
        """
        filename = str(path)
        if not filename.endswith('.npy'):
            filename += '.npy'
        b = np.load(filename, allow_pickle=True)

        for i in range(b.shape[0]):
            self.add(b[i])

In [4]:
# --- Configuration -----------------------------------------------------------
ENVIRONMENT = 'MineRLTreechopVectorObf-v0'
# Dataset root. Prefer the MINERL_DATA_ROOT environment variable so the notebook
# is not tied to one user's home directory; fall back to the original location.
DATA_DIR = os.environ.get("MINERL_DATA_ROOT", "/home/ankitagarg/minerl/data/")
REPLAY_MEMORY = 100000      # capacity of the expert replay buffer (transitions)
NUM_OF_CENTROIDS = 64       # number of KMeans clusters used to discretise actions

# Seed the RNGs used below (trajectory shuffle via `random`; KMeans and
# Memory.sample draw from the global NumPy RNG) so Restart & Run All is repeatable.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Handle to the demonstration dataset and the (initially empty) expert buffer.
data = minerl.data.make(ENVIRONMENT, data_dir=DATA_DIR)
expert_memory_replay = Memory(REPLAY_MEMORY)

In [None]:
# Collect up to REPLAY_MEMORY expert transitions from randomly ordered
# trajectories, keeping the raw action vectors separately for clustering.
action_data = []      # one action["vector"] per collected transition
demonstration = []    # (state, action, reward, next_state, done) tuples

trajectory_names = data.get_trajectory_names()
random.shuffle(trajectory_names)
for trajectory_name in trajectory_names:
    # Stop before loading another trajectory once the cap is reached. The inner
    # `break` alone only leaves the inner loop, which previously caused every
    # remaining trajectory to be loaded and to contribute one extra transition.
    if len(demonstration) >= REPLAY_MEMORY:
        break
    trajectory = data.load_data(trajectory_name, skip_interval=0, include_metadata=False)
    for state, action, reward, next_state, done in trajectory:
        action_data.append(action["vector"])
        demonstration.append((state, action, reward, next_state, done))
        if len(demonstration) >= REPLAY_MEMORY:
            break

In [None]:
# Convert the collected action vectors into a single NumPy array.
# NOTE(review): `all_actions` is not referenced by the other visible cells (the
# KMeans cell fits `action_data` directly) — confirm whether it is still needed.
all_actions = np.array(action_data)

In [None]:
# Cluster the expert action vectors; the resulting centroids act as the
# discrete action set (one centroid per cluster, NUM_OF_CENTROIDS in total).
print("Running KMeans on the action vectors")
kmeans = KMeans(n_clusters=NUM_OF_CENTROIDS).fit(action_data)  # fit() returns the estimator
action_centroids = kmeans.cluster_centers_
print("KMeans done")

In [None]:
# Persist the centroids for later use; np.save appends ".npy" to the file name,
# so this writes ./action_centroids.npy.
np.save('./action_centroids', action_centroids)

In [None]:
# Replace each expert action vector by the index of its nearest centroid
# (squared Euclidean distance) and push the transition into the replay buffer.
for state, expert_action, _reward, next_state, done in demonstration:
    action_vector = expert_action["vector"]
    # Centroids gain a middle axis so the difference broadcasts per centroid.
    offsets = action_vector - action_centroids[:, None]
    distances = (offsets ** 2).sum(axis=2)
    action = np.argmin(distances, axis=0)
    # The stored reward is the constant 1, not the dataset reward.
    # NOTE(review): presumably intentional (imitation-style reward) — confirm.
    expert_memory_replay.add((state, action, 1, next_state, done))
    if expert_memory_replay.size() == REPLAY_MEMORY:
        print('buffer memory full')
        break

In [None]:
# Write the expert buffer to disk through Memory.save, which delegates to
# np.save (the ".npy" suffix is appended to the given name).
expert_memory_replay.save('expert_memory_replay')

In [None]:
# trajectory_names = data.get_trajectory_names()
# random.shuffle(trajectory_names)
# expert_memory_replay = Memory(REPLAY_MEMORY)
# for trajectory_name in trajectory_names:
#     trajectory = data.load_data(trajectory_name, skip_interval=0, include_metadata=False)
#     temp = []
#     episode_reward = []
#     for dataset_observation, dataset_action, dataset_rewards, next_states, done in trajectory:
#         action_vector = dataset_action["vector"]
#         distances = np.sum((action_vector - action_centroids[:, None]) ** 2, axis=2)
#         action = np.argmin(distances, axis=0)

#         episode_reward.append(dataset_rewards)
#         temp.append((dataset_observation, action, next_states, dataset_rewards, done))
        
#     if np.sum(episode_reward) > 0:
#         for step in temp:
#             expert_memory_replay.add(step)
#             if expert_memory_replay.size() >= REPLAY_MEMORY:
#                 break
#     if expert_memory_replay.size() >= REPLAY_MEMORY:
#         print('buffer memory full')
#         break