<a href="https://colab.research.google.com/github/anushkavijay/anushkavijay.github.io/blob/main/rnn_ml_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the data and checkpoint paths used by
# later cells all live under '/content/drive/My Drive/...'.
# NOTE(review): google.colab is presumably only importable on a Colab runtime —
# this cell will need replacing to run the notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gzip
import pickle
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import numpy as np
import os
import gc

In [None]:
class SortingDataset(Dataset):
    """Map-style dataset pairing variable-length numeric sequences with class labels.

    Args:
        data: Sized, indexable collection of sequences. Each sequence must
            support ``len()`` and be convertible with ``torch.tensor``.
        labels: Sized, indexable collection of integer class labels, one per
            sequence in ``data``.

    Raises:
        ValueError: If ``data`` and ``labels`` do not have the same length.
    """

    def __init__(self, data, labels):
        # Fail fast: a mismatch would otherwise surface as an IndexError in the
        # middle of training, or as silently ignored trailing labels.
        if len(data) != len(labels):
            raise ValueError(
                f"data and labels must have the same length, "
                f"got {len(data)} and {len(labels)}"
            )
        self.data = data
        self.labels = labels
        # Sequence lengths are computed once and handed out by __getitem__.
        self.lengths = [len(seq) for seq in data]
        # Kept for backward compatibility; samples themselves are created on the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sample, label, length)`` for the sequence at ``idx``.

        ``sample`` is a float32 tensor built from the raw sequence, ``label`` is
        an int64 tensor, and ``length`` is the unpadded sequence length.
        """
        # torch.tensor always allocates fresh contiguous storage, so no
        # explicit .contiguous() call is needed.
        sample = torch.tensor(self.data[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample, label, self.lengths[idx]

In [None]:
class SortingLSTM(nn.Module):
    """LSTM classifier over variable-length sequences.

    The input batch is packed so padded positions are skipped, run through a
    stacked LSTM, and the LSTM output at each sequence's last valid time step
    is projected to ``num_classes`` logits.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(SortingLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        # Kept for backward compatibility only. forward() does not read it,
        # because it goes stale as soon as the module is moved with .to()/.cpu().
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, x, lengths):
        """Compute class logits for a padded batch.

        Args:
            x: Padded batch laid out batch-first, i.e. (batch, time, input_size).
            lengths: Unpadded length of every sequence in the batch (tensor or
                list-like of ints). Every length must be at least 1.

        Returns:
            Tensor of shape (batch, num_classes) on the module's device.
        """
        # Follow wherever the parameters actually live instead of a device
        # chosen at construction time.
        device = self.fc.weight.device
        lengths = torch.as_tensor(lengths, dtype=torch.long)
        x = x.to(device)

        if torch.isnan(x).any() or torch.isinf(x).any():
            print("Input tensor contains NaNs or Infs!")

        # pack_padded_sequence requires the lengths on the CPU.
        packed_input = pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # With no initial state supplied, nn.LSTM starts from zero hidden and
        # cell states, which is what the explicit h0/c0 tensors used to provide.
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Pick the output of the last valid time step of each sequence; the
        # index tensors are built on the same device as the indexed tensor.
        batch_index = torch.arange(output.size(0), device=output.device)
        last_step = (lengths - 1).to(output.device)
        out = output[batch_index, last_step]

        return self.fc(out)

In [None]:
def collate_fn(batch):
    """Merge ``(sequence, label, length)`` samples into one padded batch.

    Returns a tuple ``(padded_sequences, labels, lengths)`` where the sequences
    are zero-padded (batch-first) to the longest sequence in the batch, the
    labels are stacked into a single tensor, and the lengths are collected
    into a tensor.
    """
    seqs, targets, seq_lens = zip(*batch)
    return (
        torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True),
        torch.stack(targets),
        torch.tensor(seq_lens),
    )

def read_array(file_path):
    """Load one pickled object from a gzip-compressed file.

    Args:
        file_path: Path of the gzip file containing the pickle.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code. Only call this on files
    # from a trusted source.
    with gzip.open(file_path, 'rb') as f:
        # pickle.load streams from the file object, so the full decompressed
        # byte string is never held in memory next to the decoded object.
        return pickle.load(f)

In [None]:
def get_data(data_dir='/content/drive/My Drive/CS/CS 263/cs263_ml/data'):
    """Load and concatenate the five ``small_*`` array collections.

    Args:
        data_dir: Directory containing the ``small_*_arrays.gz`` files.
            Defaults to the Drive folder this notebook was written against.

    Returns:
        A single list with the contents of every file, appended in the fixed
        order uniform, poisson, normal, reverse, nearly sorted.
    """
    # The order is significant: the result is paired positionally with labels
    # loaded elsewhere in the notebook, so do not reorder these entries.
    file_names = (
        'small_uniform_arrays.gz',
        'small_poisson_arrays.gz',
        'small_normal_arrays.gz',
        'small_reverse_arrays.gz',
        'small_nearly_sorted_arrays.gz',
    )
    input_data = []
    for file_name in file_names:
        input_data += read_array(os.path.join(data_dir, file_name))
    return input_data

In [None]:
# Disabled earlier variant, kept for reference: it read uniform_arrays.gz and
# kept only the arrays with fewer than 10000 elements, recording the indices
# of the arrays it kept.
# data = read_array('/content/drive/My Drive/CS/CS 263/cs263_ml/data/uniform_arrays.gz')
# input_data = []
# indices = []
# for i,el in enumerate(data):
#   if len(el) < 10000:
#     input_data.append(el)
#     indices.append(i)

# Load every small_* array collection into one flat list of sequences.
input_data = get_data()

In [None]:
# Load the runtime CSV and use its 'Best Algorithm' column as the label array.
# NOTE(review): the rows are presumably in the same order as the arrays
# returned by get_data() — nothing here enforces that alignment, so confirm it
# against whatever produced the CSV.
df = pd.read_csv('/content/drive/My Drive/CS/CS 263/cs263_ml/data/small_training_runtime_only_data.csv')
total_labels = df['Best Algorithm'].to_numpy()

In [None]:
# Sanity check: there must be exactly one label per input sequence.
print(len(total_labels))
print(len(input_data))
# Enforce the invariant rather than relying on a visual comparison of the two
# printed numbers.
assert len(total_labels) == len(input_data), (
    f"label/sample count mismatch: {len(total_labels)} labels "
    f"vs {len(input_data)} samples"
)

25000
25000


In [None]:
# Split the data into train / validation / test subsets and wrap each subset
# in a DataLoader.

train_ratio = 0.8
test_ratio = 0.1
validation_ratio = 0.1
assert abs(train_ratio + validation_ratio + test_ratio - 1.0) < 1e-9, \
    "split ratios must sum to 1"

dataset = SortingDataset(input_data, total_labels)

train_size = int(train_ratio * len(dataset))
val_size = int(validation_ratio * len(dataset))
# The test split takes whatever is left. Truncating three ratios independently
# can leave the sizes short of len(dataset), which makes random_split raise.
test_size = len(dataset) - train_size - val_size

# Seed the split so the same samples land in each subset on every run;
# otherwise the reported test accuracy is not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

# Only the training loader shuffles; evaluation order does not matter.
batch_size = 200
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# Hyperparameters
# CUDA debugging switches.
# NOTE(review): these presumably only take effect if set before CUDA is
# initialised, and synchronous kernel launches slow training down — confirm
# they are still wanted outside of debugging sessions.
os.environ['CUDA_LAUNCH_BLOCKING']="1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"
input_size = 1  # Since each element in the array is a single integer
hidden_size = 64
num_layers = 2
num_classes = 4
# NOTE(review): the optimizer cell passes a literal lr instead of this value.
learning_rate = 0.001
num_epochs = 5

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the model and move its parameters to the selected device.
model = SortingLSTM(input_size, hidden_size, num_layers, num_classes).to(device)

In [None]:
# Loss and optimizer
# Cross-entropy is computed between the model outputs and the integer labels;
# Adam updates every parameter of `model`.
# NOTE(review): lr is hard-coded to 0.02 here, while the hyperparameter cell
# defines learning_rate = 0.001, which is not used in the visible cells —
# confirm which value is intended and use a single source of truth.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.02)

In [None]:
# Train for num_epochs epochs, evaluating on the validation split after each.
# Reported losses are sample-weighted means over the whole epoch, not the loss
# of whichever batch happened to come last.
counter = 0  # global step counter, used only for progress output
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss_sum = 0.0
    train_samples = 0
    for sequences, labels, lengths in train_dataloader:
        sequences = sequences.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)

        # Forward pass; unsqueeze adds the trailing feature dimension
        # (input_size == 1) that the LSTM expects.
        outputs = model(sequences.unsqueeze(-1), lengths)
        loss = criterion(outputs, labels)

        if counter % 100 == 0:
            print(counter)
        counter += 1

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss_sum += loss.item() * labels.size(0)
        train_samples += labels.size(0)

    # ---- validation pass ----
    # eval() switches off training-only layer behaviour; train() is restored
    # at the top of the next epoch.
    model.eval()
    val_loss_sum = 0.0
    val_samples = 0
    with torch.no_grad():
        for sequences, labels, lengths in val_dataloader:
            sequences = sequences.to(device)
            labels = labels.to(device)
            lengths = lengths.to(device)
            outputs = model(sequences.unsqueeze(-1), lengths)
            val_loss = criterion(outputs, labels)
            val_loss_sum += val_loss.item() * labels.size(0)
            val_samples += labels.size(0)

    # max(..., 1) keeps an empty split from raising ZeroDivisionError.
    epoch_train_loss = train_loss_sum / max(train_samples, 1)
    epoch_val_loss = val_loss_sum / max(val_samples, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_train_loss:.4f}, Validation Loss: {epoch_val_loss:.4f}')

0
Epoch [1/5], Train Loss: 0.6411, Validation Loss: 0.5190


In [None]:
# Test the model: top-1 accuracy over the held-out test split.

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for sequences, labels, lengths in test_dataloader:
        # Move every input to the model's device explicitly, exactly as the
        # training loop does, rather than relying on the model to do it.
        sequences = sequences.to(device)
        labels = labels.to(device)
        lengths = lengths.to(device)
        outputs = model(sequences.unsqueeze(-1), lengths)
        # The predicted class is the index of the largest logit.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

if total == 0:
    # An empty test split would otherwise fail with ZeroDivisionError.
    print('Test set is empty; accuracy is undefined.')
else:
    print(f'Test Accuracy: {100 * correct / total:.2f}%')

Test Accuracy: 74.84%


In [None]:
# Persist only the learned parameters (the state_dict), not the module object.
# To reload, rebuild SortingLSTM with the same constructor arguments and pass
# the loaded dict to load_state_dict.
torch.save(model.state_dict(), '/content/drive/My Drive/CS/CS 263/cs263_ml/data/rnn_runtime_only.pt')