In [1]:
from IPython.display import clear_output
# Install pinned versions of every package this notebook imports.
# NOTE(review): `%pip install` targets the running kernel's environment,
# whereas `!pip` uses whichever pip is first on the shell PATH — consider switching.
!pip install numpy==1.25.2
!pip install torch==2.7.0
!pip install torchvision==0.22.0
!pip install gym_super_mario_bros==7.4.0
!pip install gym==0.26.2
!pip install nes_py==8.2.1
!pip install gymnasium==0.29.1
# Wipe the installer logs from the cell output.
clear_output()

In [2]:
import random
import warnings
warnings.filterwarnings("ignore")
import gym_super_mario_bros
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from gymnasium.wrappers import FrameStack
import numpy as np
import torch
from PIL import Image
from torchvision.transforms import v2,Resize
from torchvision.transforms.functional import to_tensor

# Hyperparameters (module-level; read by the network, Memory and agent classes)
lr = 0.0003 # learning rate handed to Adam in network.__init__
_lambda_ = 0.99 # lambda factor, multiplied with gamma in Memory's advantage computation
gamma = 0.99 # discount factor applied to the next-state value in the TD error
epsilon = 0.2 # clipping half-width: clamp bounds are (1-epsilon, 1+epsilon)
c1 = 0.5 # weight of the critic loss inside the total loss
numFrames = 5 # number of consecutive frames FrameStack keeps per observation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Environment pipeline: raw Mario env -> restricted SIMPLE_MOVEMENT action set -> stack of numFrames frames.
# Later cells unpack reset() as (obs, info) and step() as a 5-tuple.
# NOTE(review): presumably apply_api_compatibility=True is what provides that return format — confirm.
env = gym_super_mario_bros.make('SuperMarioBros-v0',apply_api_compatibility=True)
env = JoypadSpace(env, SIMPLE_MOVEMENT)
env = FrameStack(env,numFrames)

def transform_env_output(observation): # -> torch.Size([1, 5, 150, 150])
    """Convert a stack of raw frames into a single network-ready tensor.

    Each frame in ``observation`` is turned into a numpy array, wrapped as a
    PIL image, reduced to one grayscale channel, converted to a tensor and
    resized to 150x150. The per-frame tensors are stacked on a new leading
    axis and the first two axes are then swapped, so the frame index ends up
    on axis 1 (``[1, 5, 150, 150]`` for a five-frame stack).

    Args:
        observation: iterable of frames, each convertible with ``np.array``.

    Returns:
        torch.Tensor holding all processed frames.
    """
    to_gray = v2.Grayscale(1)
    shrink = Resize((150,150))
    frames = [
        shrink(to_tensor(to_gray(Image.fromarray(np.array(frame)))))
        for frame in observation
    ]
    # stack -> (frames, 1, 150, 150); permute -> (1, frames, 150, 150)
    return torch.stack(frames,dim=0).permute(1,0,2,3)

In [8]:
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

class network(nn.Module):
    """Actor-critic model with a shared trunk and two output heads.

    The trunk is four single-channel convolutions followed by four fully
    connected layers. Every layer is a Lazy* module, so input sizes are
    inferred the first time data flows through. The module also owns its
    Adam optimiser, exposed as ``self.optim``.
    """
    def __init__(self):
        super().__init__()
        # Convolutional trunk (each conv emits exactly one channel).
        self.input = nn.LazyConv2d(out_channels=1,kernel_size=(1,1))
        self.conv1 = nn.LazyConv2d(out_channels=1,kernel_size=(3,3),stride=2)
        self.conv2 = nn.LazyConv2d(out_channels=1,kernel_size=(3,3),stride=1)
        self.conv3 = nn.LazyConv2d(out_channels=1,kernel_size=(3,3),stride=2)

        # Fully connected trunk, halving the width at every layer.
        self.linear1 = nn.LazyLinear(out_features=3000)
        self.linear2 = nn.LazyLinear(out_features=1500)
        self.linear3 = nn.LazyLinear(out_features=750)
        self.linear4 = nn.LazyLinear(out_features=375)

        # Heads: 7 action scores and one scalar state value.
        self.policyHead = nn.LazyLinear(out_features=7)
        self.valueHead = nn.LazyLinear(out_features=1)

        # Learning rate comes from the module-level `lr` hyperparameter.
        self.optim = Adam(self.parameters(),lr=lr)

    def forward(self,x):
        """Run the trunk and both heads.

        Args:
            x: batch of stacked frames, channels on axis 1.

        Returns:
            Tuple ``(action_probs, state_value)``: softmax over the last axis
            of the policy head output, and the raw value head output.
        """
        # ReLU is applied after input/conv2 and on the flattened conv3 output,
        # then after linear2 and linear4; the other layers feed through linearly.
        features = self.conv1(torch.relu(self.input(x)))
        features = self.conv3(torch.relu(self.conv2(features)))
        hidden = self.linear1(torch.relu(features.flatten(start_dim=1)))
        hidden = self.linear3(torch.relu(self.linear2(hidden)))
        hidden = torch.relu(self.linear4(hidden))
        action_probs = torch.softmax(self.policyHead(hidden),dim=-1)
        state_value = self.valueHead(hidden)
        return action_probs,state_value

model = network()
# Dry run with a dummy batch so every Lazy* layer infers its input size and
# materialises its parameters before a state dict is loaded into them.
# no_grad avoids building an autograd graph for this throw-away pass.
with torch.no_grad():
    model(torch.rand((1,5,150,150),dtype=torch.float))
model.to(device)
# map_location remaps the checkpoint tensors onto the active device, so a
# checkpoint saved from a GPU session still loads on a CPU-only runtime.
# NOTE(review): strict=False silently ignores missing/unexpected keys — a
# checkpoint from a different architecture would load without any error.
model.load_state_dict(torch.load("/kaggle/input/mario800k/pytorch/default/1/mario800",map_location=device),strict=False)
clear_output()

In [10]:
from torch.distributions import Categorical
import gc

class Memory:
    """On-policy rollout buffer: collects transitions and attaches advantages.

    After ``rollout`` every entry of ``self.data`` is a list
    ``[state, reward, value, log_prob, action, done, advantage]`` where
    ``state`` is the frame stack the action was sampled from, ``value`` and
    ``log_prob`` are the network outputs recorded without gradients, and
    ``advantage`` is a 0-dim tensor on ``device``.
    """
    def __init__(self):
        self.network = model
        self.env = env
        self.gamma = gamma
        self._lambda_ = _lambda_
        self.data = []
        self.pointer = 0

    @staticmethod
    def _compute_gae(rewards,values,dones,last_value,gamma,lam):
        """Generalised advantage estimation over one contiguous rollout.

        delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}

        Args:
            rewards: 1-D float tensor of per-step rewards.
            values: 1-D float tensor of V(s_t), same length as ``rewards``.
            dones: 1-D tensor, truthy where the episode ended at step t.
            last_value: tensor with one element, V of the state following the
                final step (ignored when the final step is done).
            gamma: discount factor.
            lam: GAE lambda.

        Returns:
            1-D tensor of advantages, same shape as ``rewards``.
        """
        not_done = 1.0 - dones.to(values.dtype)
        next_values = torch.cat((values[1:],last_value.reshape(1).to(values.dtype)))
        deltas = rewards + gamma*next_values*not_done - values
        advantages = torch.zeros_like(deltas)
        running = torch.zeros((),dtype=deltas.dtype,device=deltas.device)
        # Walk backwards so each step can reuse the advantage of its successor;
        # not_done cuts the recursion at episode boundaries.
        for t in range(rewards.shape[0]-1,-1,-1):
            running = deltas[t] + gamma*lam*not_done[t]*running
            advantages[t] = running
        return advantages

    def rollout(self,batchsize):
        """Collect ``batchsize`` environment steps and append an advantage to each.

        The buffer is cleared first. Episodes that end mid-rollout are reset
        so collection continues. The finished buffer is shuffled in place.

        Args:
            batchsize: number of environment steps to record.
        """
        self.clear()
        if batchsize <= 0:
            return
        stacked_frames,_ = self.env.reset()
        with torch.no_grad():
            for _ in range(batchsize):
                transformed_observation = transform_env_output(stacked_frames).to(device)
                policy_output,value = self.network(transformed_observation)
                distribution = Categorical(policy_output)
                action = distribution.sample()
                # Log-probability must come from the distribution just built.
                prob = distribution.log_prob(action)
                next_frames,reward,terminated,truncated,_ = self.env.step(action.item())
                done = bool(terminated or truncated)
                # Store the observation the action was taken in, so the update
                # re-evaluates the same state the old log-prob refers to.
                self.data.append([stacked_frames,torch.tensor(reward,dtype=torch.float32),value,prob,action,done])
                if done:
                    stacked_frames,_ = self.env.reset() # reset env if done to keep training
                else:
                    stacked_frames = next_frames # advance, otherwise the policy only ever sees the reset frame

            # Bootstrap value for the state after the last recorded step.
            if done:
                last_value = torch.zeros(1)
            else:
                _,last_value = self.network(transform_env_output(stacked_frames).to(device))
                last_value = last_value.reshape(1).cpu()

        # The backward recursion is many tiny ops, so it is run on CPU tensors.
        _rewards = torch.stack([entry[1] for entry in self.data]).float().cpu()
        _values = torch.stack([entry[2] for entry in self.data]).reshape(batchsize).float().cpu()
        _dones = torch.tensor([entry[5] for entry in self.data],dtype=torch.bool)
        advantages = self._compute_gae(_rewards,_values,_dones,last_value,self.gamma,self._lambda_).to(device)

        for data,item in zip(self.data,advantages): # append advantages to data
            data.append(item)
        random.shuffle(self.data)

    def sample(self,number):
        """Return the next ``number`` buffered transitions, column-wise.

        Args:
            number: how many transitions to take from the read pointer onward.

        Returns:
            Tuple ``(states, actions, rewards, values, logProb, advantages, done)``,
            each a tuple with one item per transition.

        Raises:
            ValueError: if no transitions are left at the read pointer.
        """
        output = self.data[self.pointer:self.pointer+number]
        if not output:
            raise ValueError("Memory.sample: no transitions left; call rollout() first")
        self.pointer+=number
        states,rewards,values,logProb,actions,done,advantages = zip(*output)
        return states,actions,rewards,values,logProb,advantages,done

    def clear(self):
        """Drop all buffered transitions and release cached memory."""
        self.data = []
        self.pointer = 0
        gc.collect()
        torch.cuda.empty_cache()
 

In [14]:
# NOTE(review): anomaly detection is a debugging aid; the PyTorch docs warn that it
# slows down execution, so consider turning it off for long training runs.
torch.autograd.set_detect_anomaly(True)
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import sys
import warnings
warnings.filterwarnings("ignore")

class agent:
    """Trainer that alternates rollouts with clipped-surrogate policy updates.

    Each epoch collects ``batchsize`` steps into ``Memory``, splits them into
    ``batchsize // minibatch`` minibatches and runs ``optim_step`` gradient
    steps on each. Reward and loss are logged to TensorBoard once per epoch.
    """
    def __init__(self):
        self.env = env
        self.memory = Memory()
        self.network = model
        self.epochs = 2_000
        self.batchsize = 2_700
        self.minibatch = 450
        self.optim_step = 10
        self.writter = SummaryWriter("/kaggle/working/")

    def save(self,k):
        """Write the network weights to ``/kaggle/working/mario{k}``."""
        torch.save(self.network.state_dict(),f"/kaggle/working/mario{k}")

    @staticmethod
    def _clipped_surrogate_loss(ratio,advantages,clip_eps):
        """PPO clipped objective, negated so that it can be minimised.

        Args:
            ratio: new/old action probability ratio, one entry per sample.
            advantages: advantage estimates, same shape as ``ratio``.
            clip_eps: the ratio is clamped to ``[1-clip_eps, 1+clip_eps]``.

        Returns:
            0-dim tensor ``-mean(min(ratio*A, clamp(ratio)*A))``.
        """
        unclipped = ratio*advantages
        # Clamp the ratio itself, then scale by the matching advantage.
        clipped = torch.clamp(ratio,1-clip_eps,1+clip_eps)*advantages
        return -torch.mean(torch.min(unclipped,clipped))

    def train(self):
        """Run the full training loop, checkpointing every 100 epochs and at the end."""
        for traj in tqdm(range(self.epochs),total=self.epochs):
            self.memory.rollout(self.batchsize)
            mean_reward = None
            totalLoss = None
            for n in range(self.batchsize//self.minibatch): # sample minibatches
                states,actions,rewards,values,old_log_prob,advantages,done = self.memory.sample(self.minibatch)

                # Everything below is constant across the optimisation passes,
                # so it is built once per minibatch. All tensors are flattened
                # to one entry per sample so the loss terms pair up element-wise.
                _advantages = torch.stack(advantages).reshape(-1).float().to(device)
                _values = torch.stack(values).reshape(-1).to(device)
                _old_log_prob = torch.stack(old_log_prob).reshape(-1).to(device)
                stacked_actions = torch.stack(actions).reshape(-1).to(device)
                mean_reward = torch.stack(rewards).float().mean()
                # Critic regression target: advantage plus the value recorded at rollout time.
                vtarget = (_advantages + _values).detach()
                # Each transformed state is (1, frames, 150, 150); cat -> (minibatch, frames, 150, 150)
                stacked_states = torch.cat([transform_env_output(element) for element in states],dim=0).to(device)

                for m in range(self.optim_step): # many passes on minibatches
                    policy_output,new_values = self.network(stacked_states)
                    dist = Categorical(policy_output)
                    new_log_prob = dist.log_prob(stacked_actions)

                    # exp(new - old) equals exp(new)/exp(old) but avoids dividing by tiny numbers.
                    ratio = torch.exp(new_log_prob - _old_log_prob)
                    loss_policy = self._clipped_surrogate_loss(ratio,_advantages,epsilon)
                    # Use the freshly predicted values so the value head receives a gradient.
                    loss_critic = F.mse_loss(new_values.reshape(-1),vtarget)

                    totalLoss = loss_policy + c1*loss_critic
                    self.network.optim.zero_grad()
                    totalLoss.backward()
                    self.network.optim.step()

            # Nothing to log when batchsize < minibatch (no update ran).
            if totalLoss is not None:
                self.writter.add_scalar("main/Reward",mean_reward.item(),traj)
                self.writter.add_scalar("main/Loss",totalLoss.item(),traj)

            if traj % 100 == 0 : # save every 100 rollouts (100 * batchsize env steps)
                self.save(traj)

        self.save("FULLYTRAINED")

In [None]:
# Build the agent (which creates its own Memory and SummaryWriter) and start training.
agent().train()

  1%|▏         | 26/2000 [1:35:24<120:34:22, 219.89s/it]