In [3]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd
from torch.utils.data import random_split

In [14]:
data = torch.load('uniform_D2_features_data_torch')
d_all = pd.read_pickle("./train_val/D2_train.pkl")
data_labels = d_all["verb_class"]

In [15]:
class dataset_lstm(Dataset):
    def __init__(self,sample,lable):
        self.lable = torch.tensor(lable,dtype=torch.int64)
        self.sample = sample.double()
        self.len = sample.shape[2]
        self.n_clips = sample.shape[0]
        self.n_features = sample.shape[1]
    
    def __getitem__(self, index):
        return self.sample[:,:,index],self.lable[index]
    
    def __len__(self):
        return self.len
    

In [16]:
df = dataset_lstm(data,data_labels)
train_dataset, test_dataset = random_split(df, lengths=[0.8,0.2])

train_loader = DataLoader(train_dataset,shuffle=True,batch_size=128)
test_loader = DataLoader(test_dataset,shuffle=True,batch_size=128)

In [17]:
#Selecting device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
for i, (samples, labels) in enumerate(train_loader):  
    print(samples)

tensor([[[0.0582, 0.3670, 0.0434,  ..., 0.2789, 0.1930, 0.1242],
         [0.2875, 0.0716, 0.0348,  ..., 0.0171, 0.1645, 0.4186],
         [0.4072, 0.0371, 0.0453,  ..., 0.0868, 0.1495, 0.4683],
         [0.2390, 0.0430, 0.0790,  ..., 0.0403, 0.1920, 0.4476],
         [0.0786, 0.7926, 0.1166,  ..., 0.0631, 0.3078, 0.1961]],

        [[0.2089, 0.2199, 0.1693,  ..., 0.1198, 0.1842, 0.1755],
         [0.2065, 0.3813, 0.2901,  ..., 0.2047, 0.1693, 0.1343],
         [0.1380, 0.1510, 0.3241,  ..., 0.4758, 0.1845, 0.1651],
         [0.0794, 0.2681, 0.1935,  ..., 0.0924, 0.2775, 0.0713],
         [0.1520, 0.0877, 0.4070,  ..., 0.0802, 0.6145, 0.1341]],

        [[0.2349, 0.2887, 0.2074,  ..., 0.0029, 0.4921, 0.3162],
         [0.2765, 0.4881, 0.3036,  ..., 0.0154, 0.1914, 0.2491],
         [0.1552, 0.0824, 0.2568,  ..., 0.0907, 0.2159, 0.5461],
         [0.2654, 0.2786, 0.2745,  ..., 0.0105, 0.3731, 0.2147],
         [0.1595, 0.0863, 0.1983,  ..., 0.0065, 0.5794, 0.2272]],

        ...,

     

In [18]:
# Hyper parameters 
input_size = 1024 #so the length of every clip after feature extraction
sequence_length  = 5 # the number of clips
num_layers = 2
hidden_size = 128 #try different values
num_classes = 8 #different verb classes for D2
num_epochs = 2 
# batch_size = how many samples the network sees before it upudates
batch_size = 128
learning_rate = 0.001

In [25]:

# many to one structure
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        # -> input x needs to have this shape: (batch_size, seq, input_size)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dtype=torch.double)
        self.fc = nn.Linear(hidden_size, num_classes, dtype=torch.double)
        
    def forward(self, x):
        # Set initial hidden states (and cell states for LSTM)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, dtype=torch.double).to(device) 
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, dtype=torch.double).to(device) 
        
        # x: (128, 5, 1024), h0: (2, n, 128)
        
        # Forward propagate lstm
  
        out, _ = self.lstm(x, (h0,c0))  
        
        # out: tensor of shape (batch_size, seq_length, hidden_size)
        # out: (128, 1024, 128)
        
        # Decode the hidden state of the last time step
        out = out[:, -1, :]
        # out: (n, 128)
         
        out = self.fc(out)
        # out: (n, 8)
        return out

model = LSTM(input_size, hidden_size, num_layers, num_classes).to(device)


In [23]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

In [27]:
# Train the model
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (samples, labels) in enumerate(train_loader):  

        # shape: [batch size, seq_length, input_size]
        # shape_original: [5, 1024, n_samples]
        samples = samples.to(device)
        labels = labels.to(device)
        
        # Forward pass
        print(samples.shape)  # [128, 5, 1024]
        outputs = model(samples)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([76, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([128, 5, 1024])
torch.Size([76, 5, 1024])


In [28]:



# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for samples, labels in test_loader:
        samples = samples.to(device)
        labels = labels.to(device)
        outputs = model(samples)
        # max returns (value ,index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the test images: {acc} %')

Accuracy of the network on the test images: 8.216432865731463 %
