In [1]:
import pandas as pd
from transformers import AutoTokenizer

In [2]:
import torch

In [3]:
# Read the reviewed sentence-level annotation CSV from the youcook2 directory.
reviewed_csv_path = "../youcook2/reviewed_0812_pred.csv"
data = pd.read_csv(reviewed_csv_path)

In [4]:
# data["Sentence"].head()

In [5]:
# len(data["VideoUrl"].unique())

In [6]:
# Tokenizer for the same "distilbert-base-uncased" checkpoint that the
# classifier cell later loads.
tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [7]:
def tokenize_text(text):
    """Run the notebook-level ``tokenizer`` on ``text`` with truncation enabled.

    Args:
        text: Input handed unchanged to the tokenizer call.

    Returns:
        The object returned by ``tokenizer(text, truncation=True)``.
    """
    encoded = tokenizer(text, truncation=True)
    return encoded

In [8]:
# Smoke-test the helper on the "Sentence" entry whose index label is 0.
first_sentence = data["Sentence"][0]
b = tokenize_text(first_sentence)

In [9]:
import json

In [10]:
# Parse the caption JSON file; it is opened in binary mode and decoded by the
# json module.
captions_json_path = "../youcook2/2.cooking_vocab_filtered_captions.tmp.json"
with open(captions_json_path, "rb") as f:
    full_data = json.loads(f.read())

In [11]:
# Collect (youtube_id, partition) for every record, then transpose the pairs
# into two parallel tuples.
id_partition_pairs = [
    (record["youtube_id"], record["partition"]) for record in full_data
]
yt_id, split = zip(*id_partition_pairs)

In [12]:
import numpy as np

In [13]:
from collections import Counter

In [14]:
# data["yt_id"] = data["VideoUrl"].apply(lambda x: x.split('watch?v=')[1])

In [15]:
# data_yt_ids = data["yt_id"].unique()

In [16]:
# len(yt_id), len(data_yt_ids)

In [17]:
# Fix the random seeds up front so the shuffle-based train/validation split
# and torch-side randomness (e.g. freshly initialised layers) are repeatable.
seed = 23456
np.random.seed(seed)
# Seeding NumPy does not touch torch's generator, so seed it explicitly too.
torch.manual_seed(seed)

In [18]:
# Display the column names of the loaded frame.
data.columns

Index(['No', 'Title', 'VideoUrl', 'TimeStamp', 'Sentence', 'RowNumber',
       'IsUsefulSentence', 'Key steps', 'Verb',
       'Object(directly related with Verb)', 'Location', 'Time', 'Temperature',
       'Other important phrase(like with', 'PredUseful', 'PredVerbs',
       'PredArgs'],
      dtype='object')

In [19]:
# Size of the training split: 70% of the rows, truncated to an integer.
train_fraction = 0.70
train_len = int(train_fraction * len(data))

In [20]:
# Shuffle the row positions that define the train/validation split.
# A dedicated RandomState seeded with `seed` is used instead of the global
# NumPy RNG: the global RNG advances on every call, so re-running this cell on
# its own used to produce a different split (and a different validation set
# than the one a saved checkpoint was trained against). A fresh
# RandomState(seed) yields the same order as shuffling right after
# np.random.seed(seed), and does so on every re-run.
indices = list(range(len(data)))
np.random.RandomState(seed).shuffle(indices)

In [21]:
# Extract the input sentences and the "IsUsefulSentence" targets as NumPy arrays.
all_text = data["Sentence"].to_numpy()
all_labels = data["IsUsefulSentence"].to_numpy()

In [22]:
# Training split: rows at the first `train_len` shuffled positions.
train_positions = indices[:train_len]
train_data = all_text[train_positions]
train_labels = all_labels[train_positions]


In [23]:
# Validation split: rows at the remaining shuffled positions.
val_positions = indices[train_len:]
val_data = all_text[val_positions]
val_labels = all_labels[val_positions]

In [24]:
# Tokenize each split in a single batched call with truncation and padding on.
# NOTE(review): the two splits are tokenized separately, so their padded
# lengths presumably differ — confirm nothing downstream mixes them.
tokenize_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(list(train_data), **tokenize_options)
val_encodings = tokenizer(list(val_data), **tokenize_options)

In [25]:
import torch

class YouCookData(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Each item is a dict holding one tensor per encoding field plus a
    ``'labels'`` tensor built from the matching entry of ``labels``.
    """

    def __init__(self, encodings, labels):
        """Store the inputs without copying.

        Args:
            encodings: Mapping (anything with ``.items()``) from field name to
                a sequence indexable by example position.
            labels: Sequence of labels, one per example.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for the example at position ``idx``."""
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [26]:
# Wrap each split's encodings and labels (converted to plain lists) as datasets.
train_dataset = YouCookData(encodings=train_encodings, labels=list(train_labels))
val_dataset = YouCookData(encodings=val_encodings, labels=list(val_labels))

In [88]:
from transformers import DataCollatorWithPadding

# Padding collator built around the notebook's tokenizer.
# NOTE(review): none of the Trainer(...) calls visible in this notebook pass
# `data_collator`, so this object appears to be unused — confirm before removing.
data_collator = DataCollatorWithPadding(tokenizer)

In [48]:
import evaluate
# Load the metric that compute_metrics reports.
metric_name = "accuracy"
accuracy = evaluate.load(metric_name)

In [49]:
def compute_metrics(eval_pred):
    """Score a batch of model outputs with the notebook-level ``accuracy`` metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` is reduced
            with ``argmax`` along axis 1, so it is presumably a 2-D array of
            per-class scores — confirm against the caller.

    Returns:
        The result of ``accuracy.compute`` for the argmax class ids versus
        ``labels``.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [27]:
import os
# Turn off the Weights & Biases integration via its environment variable.
# The Trainer output further down in this notebook warns that this variable
# is deprecated in favour of the `report_to` setting.
os.environ.update({"WANDB_DISABLED": "true"})

In [28]:
# Class-id <-> class-name maps stored in the model config; the reverse map is
# derived from the forward one so the two cannot drift apart.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

In [29]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [88]:

# Load DistilBERT with a sequence-classification head sized for the two
# labels in `id2label`.
base_checkpoint = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

loading configuration file config.json from cache at /home/mila/a/aasheesh.singh/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /home/mila/a/aasheesh.singh/.cache/huggingface/hub/models--distilbert-bas

In [50]:
# Fine-tuning configuration: batches of 64 for training and evaluation,
# 3 epochs, evaluation and checkpointing once per epoch, and the best
# checkpoint reloaded when training ends.
training_options = dict(
    output_dir="../models/",
    logging_dir="../logs/",
    logging_steps=50,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_options)

# Trainer over the train/validation datasets, reporting accuracy through
# compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# trainer.train()

In [107]:
# Reload the fine-tuned classifier from a saved checkpoint directory.
checkpoint_dir = "../models/checkpoint-170/"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

# Same argument values as the training cell earlier in the notebook; they are
# repeated here so a Trainer can be rebuilt around the reloaded model.
checkpoint_training_options = dict(
    output_dir="../models/",
    logging_dir="../logs/",
    logging_steps=50,
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**checkpoint_training_options)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


loading configuration file ../models/checkpoint-170/config.json
Model config DistilBertConfig {
  "_name_or_path": "../models/checkpoint-170/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading weights file ../models/checkpoint-170/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

All the w

In [31]:
from sklearn.metrics import classification_report

In [32]:
# Baseline predictions ("PredUseful" column) restricted to the validation rows.
heldout_positions = indices[train_len:]
pred_useful_all = data["PredUseful"].to_numpy()
pred_val_labels = pred_useful_all[heldout_positions]

### Key Clip Identification: SRL results 

In [37]:
# Per-class precision/recall/F1 of the "PredUseful" baseline on the validation split.
srl_report = classification_report(val_labels, pred_val_labels)
print(srl_report)

              precision    recall  f1-score   support

           0       0.90      0.54      0.67      3561
           1       0.35      0.81      0.49      1093

    accuracy                           0.60      4654
   macro avg       0.63      0.67      0.58      4654
weighted avg       0.77      0.60      0.63      4654



In [108]:
# Run the reloaded model over the validation split. Despite its name,
# `logits` holds the whole output object returned by `trainer.predict`; the
# raw per-class scores are in its `.predictions` attribute.
with torch.no_grad():
    logits = trainer.predict(val_dataset)

# Predicted class = index of the largest raw score in each row.
# The scores are deliberately NOT passed through sigmoid first: sigmoid is
# monotonic, so it cannot change the argmax in exact arithmetic, but in
# float32 it saturates to exactly 1.0 for large scores, creating ties that
# argmax would resolve to class 0.
pred = torch.argmax(torch.from_numpy(logits.predictions), dim=1)

***** Running Prediction *****
  Num examples = 4654
  Batch size = 64


### Key Clip Identification: DistilBERT

In [111]:
# Sum of the label array over the full dataset (train + validation).
all_labels.sum()

3569

In [109]:
# Per-class precision/recall/F1 of the DistilBERT predictions on the validation split.
distilbert_report = classification_report(val_labels, pred.numpy())
print(distilbert_report)

              precision    recall  f1-score   support

           0       0.92      0.86      0.89      3561
           1       0.63      0.75      0.68      1093

    accuracy                           0.84      4654
   macro avg       0.77      0.81      0.79      4654
weighted avg       0.85      0.84      0.84      4654

