In [1]:
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [None]:
print(torch.is_gpu_)

In [3]:

# Read the pickle object
with open("../../dataset/pickles/LOC3.pickle", "rb") as f:
    data = pickle.load(f)

# Convert the data to a pandas dataframe
df = pd.DataFrame(data)

# Encode the class labels
label_encoder = LabelEncoder()
df['class_label'] = label_encoder.fit_transform(df['class_label'])

# Define the maximum sequence length
max_len = 0
for i in df['lengths']:
    max_len=max(max_len,len(i))

# Pad the sequences to the maximum length
sequences = np.array(df['lengths'])
padded_sequences = np.zeros((len(sequences), max_len))
for i, sequence in enumerate(sequences):
    padded_sequences[i, :len(sequence)] = sequence

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, df['class_label'], test_size=0.2)

# Convert the data to PyTorch tensors
X_train = torch.from_numpy(X_train).to(torch.float32)
y_train = torch.from_numpy(y_train.values).long()
X_val = torch.from_numpy(X_val).to(torch.float32)
y_val = torch.from_numpy(y_val.values).long()


In [8]:

# Initialize the model
input_dim=0
for i in X_train:
    input_dim = max(input_dim,len(i))
# input_dim = len(X_train[0])
hidden_dim = 128
output_dim = 1500
learning_rate = 1e-3
print(input_dim)
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
# Define the dataloaders
train_dataset = SequenceDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataset = SequenceDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


1213


Using device: cuda


In [66]:

# Define a custom dataset
class SequenceDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, index):
        sequence = self.sequences[index]
        label = self.labels[index]
        return sequence, label

# Define the model architecture
class RNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, batch_size):
        super(RNN, self).__init__()
        self.batch_size=batch_size
        # simple 1d conv 
        self.conv1 = nn.Conv1d(1, 8, 3, padding=1)
        self.bn1 = nn.BatchNorm1d(8)
        self.relu = nn.ReLU()        
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(8*input_dim, output_dim)
        
    def forward(self, x):
        # print(x.shape)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.flatten(x)
        # print("Size of FC: ", x.shape)
        return self.fc(x)
        embedded = self.embedding(x)
        output, (hidden, cell) = self.rnn(embedded)
        output = self.fc(hidden[-1])
        return output



In [71]:
device = torch.device('cuda:6' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
rnn = RNN(input_dim, hidden_dim, output_dim, batch_size=64)
rnn = rnn.to(device)
criterion = criterion.to(device)

optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)


Using device: cuda:6


In [74]:
num_epochs = 100
# optimizer = optimizer.to(device)
for epoch in range(num_epochs):
    running_loss = 0.0
    running_corrects = 0
    rnn.train()
    for sequences, labels in train_loader:
        optimizer.zero_grad()
        labels = labels.to(device)
        sequences = sequences.to(device)
        sequences = sequences.unsqueeze(1)
        outputs = rnn(sequences)
        
        # print(sequences.shape)
        # print(outputs.shape)
        # print(labels.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * sequences.size(0)
        _, preds = torch.max(outputs, 1)
        running_corrects += torch.sum(preds == labels)
    epoch_loss = running_loss / len(train_dataset)
    epoch_acc = running_corrects.double() /len(train_dataset)
    print('Train Loss: {:.4f} Train Acc: {:.4f} %'.format(epoch_loss, epoch_acc*100))
    
    # test the model
    rnn.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for sequences, labels in val_loader:
            sequences = sequences.to(device)
            labels = labels.to(device)
            sequences = sequences.unsqueeze(1)
            outputs = rnn(sequences)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        print('Test Accuracy of the model on the test sequences: {} %'.format(100 * correct / total))



Train Loss: 0.6663 Train Acc: 0.8241
Test Accuracy of the model on the test sequences: 70.6747991093039 %
Train Loss: 0.6593 Train Acc: 0.8249
Test Accuracy of the model on the test sequences: 70.77161390260432 %
Train Loss: 0.6503 Train Acc: 0.8286
Test Accuracy of the model on the test sequences: 71.2879594668732 %
Train Loss: 0.6440 Train Acc: 0.8294
Test Accuracy of the model on the test sequences: 70.71352502662407 %
Train Loss: 0.6362 Train Acc: 0.8308
Test Accuracy of the model on the test sequences: 71.36218414173686 %
Train Loss: 0.6323 Train Acc: 0.8310
Test Accuracy of the model on the test sequences: 71.46222609481396 %
Train Loss: 0.6233 Train Acc: 0.8332
Test Accuracy of the model on the test sequences: 71.56226804789105 %
Train Loss: 0.6183 Train Acc: 0.8340
Test Accuracy of the model on the test sequences: 71.7881692322587 %
Train Loss: 0.6128 Train Acc: 0.8357
Test Accuracy of the model on the test sequences: 71.55904088811437 %
Train Loss: 0.6080 Train Acc: 0.8358
Tes

KeyboardInterrupt: 

In [70]:
# Evaluate the model
running_loss = 0.0
running_corrects = 0
rnn.eval()
with torch.no_grad():
    for sequences, labels in val_loader:
        labels = labels.to(device)
        sequences = sequences.to(device)
        sequences = sequences.unsqueeze(1)
        outputs = rnn(sequences)
        loss = criterion(outputs, labels)
        running_loss += loss.item() * sequences.size(0)
        _, preds = torch.max(outputs, 1)
        running_corrects += torch.sum(preds == labels)
epoch_loss = running_loss / len(val_dataset)
epoch_acc = running_corrects.double() / len(val_dataset)
print('Val Loss: {:.4f} Val Acc: {:.4f}'.format(epoch_loss, epoch_acc))


Val Loss: 1.7257 Val Acc: 0.6621


In [None]:
accuracy = model.predict(test_data)
print("Accuracy: ", accuracy,"%")

Accuracy: 90%
