In [1]:
%load_ext autoreload
%autoreload 2

import os
import random
import sys
import warnings
#sys.path.append('')

In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer

# Select the compute device once: CUDA when usable, otherwise the CPU.
# `warnings` comes from the notebook's first import cell; without that import
# the fallback branch raised NameError on CPU-only machines instead of warning.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    warnings.warn('CUDA is not available.')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import evaluate
# Load the "accuracy" metric through the `evaluate` package; compute_metrics calls its .compute().
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score predictions against references with the module-level `accuracy` metric.

    Args:
        eval_pred: A pair ``(scores, labels)``. ``scores`` is reduced to a
            predicted class index by taking the arg-max along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes and
        the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

## Data Preparation

 __Data Paths__

In [4]:
# Base directories and artifact file names used throughout the notebook.
# Artifacts are saved under `root`; the cached text/video feature tensors are
# read back from `root2`.
# NOTE(review): absolute, user-specific scratch paths — confirm they exist before running.
root = '/network/scratch/s/subhrajyoti.dasgupta/yc2/'
root2 = '/network/scratch/t/tejas.kasetty/yc2/'
data_file = 'reviewed_0812_with_clip_path.csv'  # annotation CSV read with pd.read_csv
bert_features_file = 'bert_features.pt'  # DistilBERT first-token features, before PCA
text_features_file = 'text_features_512.pt'  # PCA-reduced text features
video_features_file = 'video_features_512.pt'  # stacked per-clip feature samples

In [5]:
# Tokenizer for the same "distilbert-base-uncased" checkpoint that the feature-extraction cell loads.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

### Load data

In [6]:
# Read the annotation CSV into a DataFrame and list its columns.
# NOTE: the name `data` is rebound to a tensor in the "Text + Video" cell, so
# re-run this cell before re-running any cell that indexes `data` by column name.
file_path = os.path.join(root, data_file)
data = pd.read_csv(file_path)
print(data.columns)

Index(['Unnamed: 0', 'No', 'Title', 'VideoUrl', 'TimeStamp', 'Sentence',
       'RowNumber', 'IsUsefulSentence', 'Key steps', 'Verb',
       'Object(directly related with Verb)', 'Location', 'Time', 'Temperature',
       'Other important phrase(like with', 'Video Pred', 'Clip IDs',
       'clip_path'],
      dtype='object')


#### Text Data

In [7]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [8]:
# Text data: the "Sentence" column is the model input and "IsUsefulSentence"
# is the binary target; both are pulled out as NumPy arrays.
all_text, all_labels = data["Sentence"].to_numpy(), data["IsUsefulSentence"].to_numpy()
all_text, all_labels

(array(['guys , jason hill here today.',
        "and i 'm with chef great stillman at repor restaurant in rancho cucamonga, and he 's going to make up a discourse today.",
        'what are we going to have?', ...,
        'several allrecipes members suggested adding one teaspoon of onion powder, one teaspoon of garlic powder and half a teaspoon of chili powder to the seasonings for a little extra zinc.',
        'anyway , you season it.',
        'golden brown southern fried chicken is a winner.'], dtype=object),
 array([0, 0, 0, ..., 0, 1, 0]))

In [9]:
# Tokenize every sentence (truncation and padding enabled) and turn the
# resulting id / mask lists into tensors. Tensors stay on the CPU here.
encodings = tokenizer(list(all_text), truncation=True, padding=True)
input_ids = torch.tensor(encodings['input_ids']) #.to(device)
attention_mask = torch.tensor(encodings['attention_mask']) #.to(device)

In [10]:
# Wrap the token ids and attention masks in a TensorDataset (no labels: this
# loader only feeds the BERT feature extraction below).
# No sampler or shuffle is requested, so batches are expected to follow row order
# and the extracted features line up with the CSV rows.
dataset = TensorDataset(input_ids, attention_mask) 
batch_size = 500
dataloader = DataLoader(dataset, batch_size = batch_size)

 __BERT feature extraction__

In [25]:
import time
import datetime
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizer, DistilBertModel, DistilBertConfig


def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    The original definition took no argument and read an undefined name,
    while its caller passes the elapsed seconds; the duration is now an
    explicit parameter.

    Args:
        elapsed: Duration in seconds (int or float). It is rounded to the
            nearest whole second before formatting.

    Returns:
        str: ``str(datetime.timedelta(seconds=...))``, e.g. ``'1:01:01'``.
    """
    elapsed_rounded = int(round(elapsed))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
# Encode every sentence with DistilBERT and keep the hidden state at
# position 0 of each sequence as that sentence's feature vector.
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
output = []
start = time.time()
for step, batch, in enumerate(dataloader):
    with torch.no_grad():  # inference only; no gradients are needed
        # batch[0] holds the input ids and batch[1] the attention mask
        # (the order in which the TensorDataset was built).
        model_output = model(batch[0], attention_mask=batch[1])
        features = model_output.last_hidden_state[:, 0, :]
        output.append(features)
    # format_time is called with the seconds elapsed since `start`.
    elapsed = format_time(time.time() - start)
    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(dataloader), elapsed))
    

# Concatenate the per-batch features along dim 0 and cache them under `root`.
features = torch.cat(output, axis = 0)
bert_features_file = 'bert_features.pt'
torch.save(features, os.path.join(root, bert_features_file))
features.shape

In [None]:
# Reload the cached BERT features (:, 768) from `root`, so the extraction
# loop does not have to be re-run.
features = torch.load(os.path.join(root, bert_features_file))

__Dimensionality reduction__

In [92]:
import numpy as np
from sklearn.decomposition import PCA

# Perform PCA to reduce the BERT features to 512 dimensions, then cache the
# result under `root`.
# NOTE(review): no random_state is passed; if scikit-learn selects a randomized
# solver for this shape, the projection may differ between runs — confirm.
pca = PCA(n_components=512)
text_features = pca.fit_transform(features.numpy())
text_features = torch.from_numpy(text_features)

torch.save(text_features, os.path.join(root, text_features_file))
print(text_features.shape)

torch.Size([15511, 512])


In [10]:
# Load text_features(:, 512)
# NOTE(review): this reads from `root2`, whereas the PCA cell saves to `root` —
# confirm both locations hold the same artifact.
text_features = torch.load(os.path.join(root2, text_features_file))

#### Video Data

In [93]:
#Video data
def get_feature_sample(clip_path, size = 10):
    """Load a clip's feature tensor and return exactly `size` rows of it.

    Args:
        clip_path: Path of the saved tensor, relative to the global `root`.
        size: Number of rows to return along dim 0.

    Returns:
        A tensor with `size` rows. A clip with at least `size` rows yields a
        random subset drawn without replacement. A shorter clip yields all of
        its rows followed by randomly re-drawn rows (with replacement).
    """
    clip = torch.load(os.path.join(root, clip_path))
    n_rows = clip.size(0)

    if n_rows >= size:
        # Enough rows: keep the first `size` entries of a random permutation.
        keep = torch.randperm(n_rows)[:size]
        return clip[keep]

    # Too few rows: pad by repeating randomly chosen existing rows.
    extra = torch.randint(n_rows, size = (size - n_rows,), dtype = torch.long)
    return torch.cat((clip, clip[extra]))

In [None]:
# Sample a fixed number of feature rows for every clip listed in the CSV,
# stack them into one float tensor and cache it under `root`.
# This cell needs `data` to still be the DataFrame (it reads data['clip_path']).
clips = [ get_feature_sample(clip_path) for clip_path in data['clip_path'] ]
for i in clips: assert 10 == len(i) #ensure each clip has the same sample size
video_features = torch.stack(clips).float()
torch.save(video_features, os.path.join(root, video_features_file)) # save the video features as tensor pickle.
video_features.shape

torch.Size([15511, 10, 512])

In [11]:
# Load video_features(:, 10, 512)
# NOTE(review): this reads from `root2`, whereas the cell that builds the
# tensor saves it to `root` — confirm both locations hold the same artifact.
video_features = torch.load(os.path.join(root2, video_features_file))

#### Labels

In [12]:
# Float labels with a trailing axis added (one column per row), so they can be
# compared element-wise with the model's single-column output in BCELoss.
labels = torch.tensor(all_labels[:, None]).float() #.to(device)

#### Text + Video

In [13]:
# Fuse the two modalities: score each sampled video row against the sentence
# feature, softmax the scores over dim 1, pool the video rows with those
# weights, then gate the pooled vector with the sentence feature.
# NOTE: this rebinds `data` from the DataFrame loaded earlier to a tensor.
x, s = video_features, text_features
attn = torch.matmul(x, s[..., None])  # dot product of every video row with the text vector
attn = torch.softmax(attn, dim = 1)  # normalise the scores across dim 1
attn_x = torch.sum(attn * x, axis = 1)  # attention-weighted sum over dim 1
data = attn_x * s  # element-wise product with the text feature
data.shape

torch.Size([15511, 512])

### Train-Validation split

In [14]:
# Sanity check: the fused features and the labels must have the same number of rows.
data_size = data.size(0)
label_size = labels.size(0)
assert data_size == label_size

__Dataset__

In [15]:
from torch.utils.data import TensorDataset, random_split

# Combine the fused features and their labels into a TensorDataset.
dataset = TensorDataset(data, labels)

# Calculate the number of samples to include in each set (70% / 30%).
train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples. The split is driven by a
# dedicated, seeded generator: without one, random_split draws from the global
# RNG, so every run trains and validates on different rows and the reported
# metrics cannot be reproduced.
split_seed = 42
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

10,857 training samples
4,654 validation samples


__DataLoader__

In [32]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Mini-batch loaders for training and validation.
# NOTE: `batch_size` was 500 for the feature-extraction loader and is rebound to 32 here.
batch_size = 32
train_dataloader = DataLoader(train_dataset, 
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size)

valid_dataloader = DataLoader(val_dataset,
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size)

## Neural Based Selection 

In [27]:
import torch.nn as nn

class NeuralSelection(nn.Module):
    """MLP binary classifier: halving ReLU layers followed by one sigmoid unit.

    The network is ``n_layers`` blocks of ``Linear(width, width // 2) + ReLU``
    (named ``dense_i`` / ``act_i``), then ``Linear(width, 1) + Sigmoid``
    (named ``output`` / ``outact``). With the default ``n_layers=2`` the layer
    names and sizes are exactly those of the previous hard-coded version.

    Args:
        in_features: Size of each input vector.
        n_layers: Number of halving hidden layers. This argument used to be
            accepted but ignored; it now controls the depth.

    Raises:
        ValueError: If ``n_layers`` is negative, or ``in_features`` is too
            small to be halved ``n_layers`` times without reaching width 0.
    """

    def __init__(self, in_features: int, n_layers = 2):
        super().__init__()
        if n_layers < 0:
            raise ValueError(f"n_layers must be >= 0, got {n_layers}")
        if in_features // (2 ** n_layers) < 1:
            raise ValueError(
                f"in_features={in_features} cannot be halved {n_layers} times "
                "without producing an empty layer"
            )

        # Not used by forward(); kept so existing attribute access keeps working.
        self.softmax = nn.Softmax(dim=1)

        self.seq = nn.Sequential()
        width = in_features
        for layer_idx in range(1, n_layers + 1):
            self.seq.add_module(f"dense_{layer_idx}", nn.Linear(width, width // 2))
            self.seq.add_module(f"act_{layer_idx}", nn.ReLU())
            width //= 2
        self.seq.add_module("output", nn.Linear(width, 1))
        self.seq.add_module("outact", nn.Sigmoid())

    def forward(self, x):
        """Return the sigmoid output of the stack for ``x``, one column per row."""
        return self.seq(x)
        

In [30]:

import os
import time
import json
from tqdm import tqdm
from torch import optim
import torch.nn as nn

def seed_experiment(seed):
    """Seed every RNG the notebook uses and put cuDNN in deterministic mode.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).
    cuDNN auto-tuning (``benchmark``) is switched off and deterministic
    kernels are requested: the previous ``benchmark = True`` setting lets
    cuDNN choose algorithms at run time, which can vary between runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without CUDA; the call is ignored in that case.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def compute_accuracy(preds: torch.Tensor, labels: torch.Tensor):
    """Return the fraction of rows whose rounded prediction equals the label.

    Only column 0 of ``preds`` and ``labels`` is compared; the result is a
    0-dim float tensor.
    """
    hard_preds = preds.round()[:, 0]
    hits = hard_preds.eq(labels[:, 0])
    return hits.float().mean()


def train(epoch, model, dataloader, optimizer, log_freq, device):
    """Run one training epoch and return its loss, accuracy and wall time.

    Changes from the previous version:
      * ``device`` is honoured: the model and every batch are moved to it.
        It used to be ignored, so training broke as soon as the model had
        been moved to another device (``predict`` does this).
      * Epoch loss and accuracy are averaged per sample rather than per
        batch, so a shorter final batch is no longer over-weighted.

    Args:
        epoch: Epoch index, used only in log messages.
        model: Module whose output is a probability column compatible with
            ``nn.BCELoss``.
        dataloader: Iterable of ``(features, labels)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        log_freq: Log the batch loss every ``log_freq`` iterations.
        device: Target device, or ``None`` to leave the model and data where
            they already are.

    Returns:
        tuple: ``(epoch_loss, epoch_accuracy, elapsed_seconds)``.
    """
    if device is not None:
        model.to(device)
    model.train()
    bce_loss = nn.BCELoss()
    loss_sum = 0.0
    acc_sum = 0.0
    n_samples = 0
    start_time = time.time()
    for idx, batch in enumerate(dataloader):
        features, labels = batch
        if device is not None:
            features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        preds = model(features)
        loss = bce_loss(preds, labels)
        acc = compute_accuracy(preds, labels)
        loss.backward()
        optimizer.step()

        # BCELoss and compute_accuracy both return batch means, so scale by the
        # batch length to accumulate per-sample totals.
        batch_len = labels.size(0)
        loss_sum += loss.item() * batch_len
        acc_sum += acc.item() * batch_len
        n_samples += batch_len

        if idx % log_freq == 0:
            tqdm.write(f"[TRAIN] Epoch: {epoch}, Iter: {idx}, Loss: {loss.item():.5f}")

    # An empty dataloader yields zeros rather than a division by zero.
    epoch_loss = loss_sum / n_samples if n_samples else 0.0
    epoch_accuracy = acc_sum / n_samples if n_samples else 0.0
    tqdm.write(f"== [TRAIN] Epoch: {epoch}, Accuracy: {epoch_accuracy:.3f} ==>")
    return epoch_loss, epoch_accuracy, time.time() - start_time


def evaluate(epoch, model, dataloader, log_freq, device, mode="val"):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    NOTE: this function reuses the name of the ``evaluate`` package imported
    in an earlier cell; after this cell runs, ``evaluate.load(...)`` no longer
    refers to that package.

    Changes from the previous version:
      * ``device`` is honoured: the model and every batch are moved to it
        (it used to be ignored).
      * Loss and accuracy are averaged per sample rather than per batch, so
        a shorter final batch is no longer over-weighted.
      * An empty dataloader no longer raises NameError in the summary line.

    Args:
        epoch: Epoch index, used only in log messages.
        model: Module whose output is a probability column compatible with
            ``nn.BCELoss``.
        dataloader: Iterable of ``(features, labels)`` batches.
        log_freq: Log the batch loss every ``log_freq`` iterations.
        device: Target device, or ``None`` to leave the model and data where
            they already are.
        mode: Tag shown (upper-cased) in log messages.

    Returns:
        tuple: ``(epoch_loss, epoch_accuracy, elapsed_seconds)``.
    """
    if device is not None:
        model.to(device)
    model.eval()
    bce_loss = nn.BCELoss()
    loss_sum = 0.0
    acc_sum = 0.0
    n_samples = 0
    idx = -1  # stays -1 when the dataloader is empty, keeping the summary line valid
    start_time = time.time()
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            features, labels = batch
            if device is not None:
                features, labels = features.to(device), labels.to(device)

            preds = model(features)
            loss = bce_loss(preds, labels)
            acc = compute_accuracy(preds, labels)

            # Both values are batch means; scale by the batch length to
            # accumulate per-sample totals.
            batch_len = labels.size(0)
            loss_sum += loss.item() * batch_len
            acc_sum += acc.item() * batch_len
            n_samples += batch_len

            if idx % log_freq == 0:
                tqdm.write(
                    f"[{mode.upper()}] Epoch: {epoch}, Iter: {idx}, Loss: {loss.item():.5f}"
                )

        epoch_loss = loss_sum / n_samples if n_samples else 0.0
        epoch_accuracy = acc_sum / n_samples if n_samples else 0.0
        tqdm.write(
            f"=== [{mode.upper()}] Epoch: {epoch}, Iter: {idx}, Accuracy: {epoch_accuracy:.3f} ===>"
        )
    return epoch_loss, epoch_accuracy, time.time() - start_time

In [42]:
import time
def predict(model: torch.nn.Module, test_data_loader: DataLoader):
    """Collect rounded predictions and labels for every batch of a loader.

    The model is put in eval mode and moved to the global `device`; each
    batch is moved there too, and inference runs without gradient tracking.

    Returns:
        tuple: ``(predictions, target)`` as NumPy arrays. ``predictions``
        holds column 0 of the rounded model outputs; ``target`` is the
        concatenation of the label batches as yielded by the loader.
    """
    model.eval()
    model.to(device)
    pred_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in test_data_loader:
            inputs = batch[0].to(device)
            batch_labels = batch[1].to(device)
            hard_preds = torch.round(model(inputs))[:, 0]
            pred_chunks.append(hard_preds)
            label_chunks.append(batch_labels)
    predictions = torch.cat(pred_chunks).detach().cpu().numpy()
    target = torch.cat(label_chunks).detach().cpu().numpy()
    return predictions, target

#### Train model

In [34]:
# Training configuration: a fresh 512-input NeuralSelection model optimised with Adam.
epochs = 10
model = NeuralSelection(512)
lr = 1e-3
optimizer = optim.Adam(model.parameters(), lr=lr)

# Per-epoch history of loss, accuracy and wall time for both phases.
train_losses, valid_losses = [], []
train_accs, valid_accs = [], []
train_times, valid_times = [], []
for epoch in range(epochs):
    tqdm.write(f"====== Epoch {epoch} ======>")
    # One pass over the training set, logging the batch loss every 25 iterations.
    loss, acc, wall_time = train(epoch, model, train_dataloader, optimizer, 25, device)
    train_losses.append(loss)
    train_accs.append(acc)
    train_times.append(wall_time)

    # Validation pass after each training epoch, with the same logging frequency.
    loss, acc, wall_time = evaluate(epoch, model, valid_dataloader, 25, device)
    valid_losses.append(loss)
    valid_accs.append(acc)
    valid_times.append(wall_time)

# Disabled: no `test_dataloader` or `args` is defined in this notebook.
#test_loss, test_acc, test_time = evaluate(epoch, model, test_dataloader, args, mode="test")
print(f"===== Best validation Accuracy: {max(valid_accs):.3f} =====>")

[TRAIN] Epoch: 0, Iter: 0, Loss: 0.68448
[TRAIN] Epoch: 0, Iter: 25, Loss: 0.55269
[TRAIN] Epoch: 0, Iter: 50, Loss: 0.36601
[TRAIN] Epoch: 0, Iter: 75, Loss: 0.61262
[TRAIN] Epoch: 0, Iter: 100, Loss: 0.37845
[TRAIN] Epoch: 0, Iter: 125, Loss: 0.57039
[TRAIN] Epoch: 0, Iter: 150, Loss: 0.51876
[TRAIN] Epoch: 0, Iter: 175, Loss: 0.35922
[TRAIN] Epoch: 0, Iter: 200, Loss: 0.52635
[TRAIN] Epoch: 0, Iter: 225, Loss: 0.25457
[TRAIN] Epoch: 0, Iter: 250, Loss: 0.28933
[TRAIN] Epoch: 0, Iter: 275, Loss: 0.40860
[TRAIN] Epoch: 0, Iter: 300, Loss: 0.25126
[TRAIN] Epoch: 0, Iter: 325, Loss: 0.53634
== [TRAIN] Epoch: 0, Accuracy: 0.799 ==>
[VAL] Epoch: 0, Iter: 0, Loss: 0.43223
[VAL] Epoch: 0, Iter: 25, Loss: 0.42999
[VAL] Epoch: 0, Iter: 50, Loss: 0.34632
[VAL] Epoch: 0, Iter: 75, Loss: 0.43423
[VAL] Epoch: 0, Iter: 100, Loss: 0.32761
[VAL] Epoch: 0, Iter: 125, Loss: 0.23321
=== [VAL] Epoch: 0, Iter: 145, Accuracy: 0.813 ===>
[TRAIN] Epoch: 1, Iter: 0, Loss: 0.28994
[TRAIN] Epoch: 1, Iter: 25, 

NameError: name 'test_dataloader' is not defined

In [43]:
# Rounded predictions and matching labels for the whole validation set.
pred, target = predict(model, valid_dataloader)

### Performance 

In [44]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the validation predictions (true labels first, predictions second).
print(classification_report(target, pred))

              precision    recall  f1-score   support

         0.0       0.85      0.90      0.87      3555
         1.0       0.60      0.47      0.53      1099

    accuracy                           0.80      4654
   macro avg       0.72      0.69      0.70      4654
weighted avg       0.79      0.80      0.79      4654

