In [1]:
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import jiahao

class NeuralNetwork(nn.Module):
    def __init__(self, array_size, n_classes):
        super(NeuralNetwork, self).__init__()
        self.array_size = array_size

        self.conv_layers = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Conv2d(16, 32, kernel_size=3, stride=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Flatten(),
        )

        # Calculate the number of units needed in the first fully connected layer
        # based on the output shape of the convolutional layers
        with torch.no_grad():
            test_input = torch.zeros((1, 1, array_size, array_size))
            test_output = self.conv_layers(test_input)
            fc_input_size = test_output.shape[1] + 7
        # +7 because it will be cacantinate by 7
        
        self.fc_layers = nn.Sequential(
            nn.Linear(fc_input_size, 128),
            nn.Linear(128, n_classes),
        )

    def forward(self, inputs):
        image_input, info_input = inputs
        image_input = image_input.unsqueeze(1)  # add a channel dimension to the image input
        x = self.conv_layers(image_input)
        x = torch.cat((x, info_input), dim=1)
        x = self.fc_layers(x)
        return x

In [2]:
import os
import gzip
import pickle
import numpy as np
from torch.utils.data import Dataset, DataLoader
import random
import torch

class MyDataset(Dataset):
    def __init__(self, directory_path, batch_size, train_frac):
        self.paths = []
        self.data = []
        self.batch_size = batch_size
        self.directory_path = directory_path
        self.train_frac = train_frac
        self.zerocount = 0
        self.invalidcount = 0

        # iterate over all files in the directory
        for filename in os.listdir(directory_path):
            if filename.endswith(".pkl.gzip"):
                with gzip.open(os.path.join(self.directory_path, filename), 'rb') as f:
                    data = pickle.load(f)
                    for state, info, action in zip(data["state"], data["info"], data["action"]):
                        self.data.append((state, info, action))
                        if self.num_zero(action):
                            self.zerocount += 1

        random.shuffle(self.data)
    
        # split the data into train and validation sets
        num_train = int(len(self.data) * train_frac)
        self.train_data = self.data[:num_train]
        self.val_data = self.data[num_train:]

    def __getitem__(self, index):
        if index < len(self.train_data):
            state, info, action = self.train_data[index]
        else:
            state, info, action = self.val_data[index - len(self.train_data)]

        return torch.tensor(state), torch.tensor(info), torch.tensor(action)

    def __len__(self):
        if self.batch_size == 1:
            return len(self.train_data) + len(self.val_data)
        else:
            return len(self.train_data) // self.batch_size + len(self.val_data) // self.batch_size

    def num_zero(self, arr):
        if (arr == [0, 0, 0]).all():
            return True
        else:
            return False

# directory containing the dataset files
directory_path = os.path.join(os.getcwd(), "data2")

# Create dataset
dataset = MyDataset(directory_path=directory_path, batch_size=256, train_frac=0.9)

# Create samplers
train_sampler = torch.utils.data.SubsetRandomSampler(range(len(dataset.train_data)))
val_sampler = torch.utils.data.SubsetRandomSampler(range(len(dataset.train_data), len(dataset.data)))

# Create Dataloader
train_dataloader = DataLoader(dataset, batch_size=256, drop_last=True, num_workers=4, pin_memory=True, sampler=train_sampler)
val_dataloader = DataLoader(dataset, batch_size=256, drop_last=True, num_workers=4, pin_memory=True, sampler=val_sampler)

print("Total number of data points:", len(dataset.train_data) + len(dataset.val_data))
print("Number of [0, 0, 0] actions:", dataset.zerocount)

print("Length of train_dataloader:", len(train_dataloader))
print("Length of val_dataloader:", len(val_dataloader))


Total number of data points: 55287
Number of [0, 0, 0] actions: 26097
Length of train_dataloader: 194
Length of val_dataloader: 21


In [3]:
from torch.utils.tensorboard import SummaryWriter
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define the weighted MSE loss function
def weighted_mse_loss(y_pred, y_true):

    nzero_gt = (y_true != 0)
    zero_pred = (y_pred <= 0)
    rweight = (nzero_gt & zero_pred)
    rweight[:, 0].fill_(False)
    rweight= rweight.float()
    rweight[rweight == 1.0] = 1.0
    rweight[rweight == 0.0] = 1.0

    mse_loss = torch.nn.functional.mse_loss(y_pred, y_true, reduction='none')
    loss = torch.mean(rweight * mse_loss)

    return loss

# number of epochs
num_epochs = 30

model = NeuralNetwork(array_size=96, n_classes=3).cuda()
optimizer = optim.AdamW(model.parameters(), lr=0.0005)

# Use the weighted MSE loss function for training
def train_epoch(model, optimizer, dataloader):
    model.train()
    epoch_loss = 0
    for state_batch, info_batch, action_batch in dataloader:
        state_batch = state_batch.to(device)
        info_batch = info_batch.to(device)
        action_batch = action_batch.to(device)
        
        optimizer.zero_grad()
        
        output = model((state_batch.float(), info_batch.float()))

        loss = weighted_mse_loss(output, action_batch)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)

# Use the unweighted MSE loss function for validation
def validate(model, dataloader):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for state_batch, info_batch, action_batch in dataloader:
            state_batch = state_batch.to(device)
            info_batch = info_batch.to(device)
            action_batch = action_batch.to(device)
            output = model((state_batch.float(), info_batch.float()))
            criterion = nn.MSELoss().cuda()
            loss = criterion(output, action_batch)

            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


# Initialize a SummaryWriter object
writer = SummaryWriter()

# Train the model
for epoch in range(num_epochs):
    train_loss = train_epoch(model, optimizer, train_dataloader)
    val_loss = validate(model, val_dataloader)
    
    # Record the train and validation losses in Tensorboard
    writer.add_scalar('Loss/train', train_loss, epoch)
    writer.add_scalar('Loss/validation', val_loss, epoch)
    
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.6f} | Val Loss: {val_loss:.6f}")

PATH = "new_model/my_model.pt"
torch.save(model.state_dict(), PATH)
print("model saved")

# Close the SummaryWriter
writer.close()

2023-03-14 00:02:49.568081: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


Epoch 1 | Train Loss: 1.966263 | Val Loss: 0.037552
Epoch 2 | Train Loss: 0.046520 | Val Loss: 0.024829
Epoch 3 | Train Loss: 0.040071 | Val Loss: 0.023039
Epoch 4 | Train Loss: 0.047550 | Val Loss: 0.028714
Epoch 5 | Train Loss: 0.043862 | Val Loss: 0.019006
Epoch 6 | Train Loss: 0.033256 | Val Loss: 0.026171
Epoch 7 | Train Loss: 0.039419 | Val Loss: 0.019388
Epoch 8 | Train Loss: 0.035144 | Val Loss: 0.020118
Epoch 9 | Train Loss: 0.033327 | Val Loss: 0.026032
Epoch 10 | Train Loss: 0.033560 | Val Loss: 0.019311
Epoch 11 | Train Loss: 0.030186 | Val Loss: 0.020234
Epoch 12 | Train Loss: 0.028407 | Val Loss: 0.018500
Epoch 13 | Train Loss: 0.028312 | Val Loss: 0.018630
Epoch 14 | Train Loss: 0.025326 | Val Loss: 0.018299
Epoch 15 | Train Loss: 0.023514 | Val Loss: 0.017334
Epoch 16 | Train Loss: 0.024069 | Val Loss: 0.021302
Epoch 17 | Train Loss: 0.023963 | Val Loss: 0.023747
Epoch 18 | Train Loss: 0.022134 | Val Loss: 0.018540
Epoch 19 | Train Loss: 0.021641 | Val Loss: 0.016613
Ep

In [4]:
!tensorboard --logdir=runs

2023-03-14 00:04:16.027917: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.11.2 at http://localhost:6006/ (Press CTRL+C to quit)
^C
