In [1]:
import pandas as pd
from transformers import AutoTokenizer
import numpy as np
import torch
from sklearn.metrics import classification_report

In [6]:
data = pd.read_csv("../youcook2/reviewed_0812.csv")

In [2]:
split_df = pd.read_csv("../data/train_val_split.csv")

In [3]:
train_url = split_df[split_df.Split == "train"]["VideoUrl"]
val_url = split_df[split_df.Split == "val"]["VideoUrl"]

In [4]:
train_url

0      https://www.youtube.com/watch?v=k38Al8giI-U
1      https://www.youtube.com/watch?v=6uB1pcyBCBw
2      https://www.youtube.com/watch?v=91z7e22XGy8
3      https://www.youtube.com/watch?v=0B-59Ok_r1Y
4      https://www.youtube.com/watch?v=LqCDIssigHo
                          ...                     
248    https://www.youtube.com/watch?v=Gr7E2bhg0iw
249    https://www.youtube.com/watch?v=oP6PR4KfH0A
250    https://www.youtube.com/watch?v=qRSZEN6g8jY
251    https://www.youtube.com/watch?v=qPSqTqkHhGg
252    https://www.youtube.com/watch?v=0qbCigxf_sc
Name: VideoUrl, Length: 253, dtype: object

In [132]:
len(train_url), len(val_url)

(253, 99)

In [133]:
num_words = []
context_data = []
for i in range(data.shape[0]):
    curr_text, url = data.iloc[i]["Sentence"], data.iloc[i]["VideoUrl"]
    context = []
    for j in range(-3, 3):
        index = i+j
        if index < 0 or index > data.shape[0] - 1:
            continue
        context_text, context_url = data.iloc[index]["Sentence"], data.iloc[index]["VideoUrl"]
        if context_url == url:
            context.append(context_text)
    num_words.append(len(" ".join(context).split(" ")))
    context_data.append(" ".join(context))
data["context"] = context_data

In [134]:
train_df = data[data.VideoUrl.isin(train_url.values)]
val_df = data[data.VideoUrl.isin(val_url.values)]

In [135]:
train_df.shape, np.sum(train_df["IsUsefulSentence"].values)

((10860, 15), 2508)

In [136]:
val_df.shape, np.sum(val_df["IsUsefulSentence"].values)

((4651, 15), 1061)

In [8]:
transcripts = []
num_sentences = []
for url in data["VideoUrl"].unique():
    sentences = data[data.VideoUrl == url]["Sentence"].tolist()
#     story = " ".join(sentences)
#     num_words = len(story.split(" "))
    num_sentences.append(len(sentences))
#     transcripts.append(num_words)

print(np.mean(num_sentences), np.median(num_sentences), np.min(num_sentences), np.max(num_sentences))

44.06534090909091 40.0 1 136


In [27]:
# len(transcripts), np.mean(transcripts), np.median(transcripts), np.min(transcripts), np.max(transcripts)

(352, 717.7556818181819, 642.0, 20, 2083)

In [7]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [8]:
def tokenize_text(text):
    return tokenizer(text, truncation=True)

In [9]:
seed = 23456
np.random.seed(seed)

### Old approch of train/val split

In [19]:
# train_len = int(0.70*len(data))
# indices = list(range(len(data)))
# np.random.shuffle(indices)
# all_text, all_labels = data["Sentence"].to_numpy(), data["IsUsefulSentence"].to_numpy()
# train_data, train_labels = all_text[indices[:train_len]], all_labels[indices[:train_len]]
# val_data, val_labels = all_text[indices[train_len:]], all_labels[indices[train_len:]]

### Approach 1: Using current text only

In [107]:
train_data, train_labels = train_df["Sentence"].values, train_df["IsUsefulSentence"].values
val_data, val_labels = val_df["Sentence"].values, val_df["IsUsefulSentence"].values

### Approach 2: Using neighbourhood context

In [78]:
# train_data, train_labels = train_df["context"].values, train_df["IsUsefulSentence"].values
# val_data, val_labels = val_df["context"].values, val_df["IsUsefulSentence"].values

In [79]:
len(train_data), len(val_data)

(10860, 4651)

In [108]:
train_encodings = tokenizer(list(train_data), truncation=True, padding=True)
val_encodings = tokenizer(list(val_data), truncation=True,padding=True)

In [109]:
import torch

class YouCookData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [110]:
train_dataset = YouCookData(train_encodings, list(train_labels))
val_dataset = YouCookData(val_encodings, list(val_labels))

In [111]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [112]:
import evaluate
accuracy = evaluate.load("accuracy")

In [113]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [114]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [115]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [116]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [117]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [60]:
training_args = TrainingArguments(
    output_dir="../models/keystep_no_context",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="../logs/",
    load_best_model_at_end=True,
    logging_steps=200
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [61]:
len(val_data), len(val_labels)

(4651, 4651)

In [62]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.345065,0.837669
2,No log,0.352638,0.842829
3,0.320800,0.373735,0.840249


TrainOutput(global_step=255, training_loss=0.30083263995600684, metrics={'train_runtime': 203.3258, 'train_samples_per_second': 160.235, 'train_steps_per_second': 1.254, 'total_flos': 1534127711669280.0, 'train_loss': 0.30083263995600684, 'epoch': 3.0})

In [125]:
model = AutoModelForSequenceClassification.from_pretrained("../models/keystep_no_context/checkpoint-170/")

training_args = TrainingArguments(
    output_dir="../models/",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="../logs/",
    load_best_model_at_end=True,
    logging_steps=50
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [32]:
pred_val_labels = data["PredUseful"].to_numpy()[indices[train_len:]]

### Key Clip Identification: SRL results 

In [37]:
print(classification_report(val_labels, pred_val_labels))

              precision    recall  f1-score   support

           0       0.90      0.54      0.67      3561
           1       0.35      0.81      0.49      1093

    accuracy                           0.60      4654
   macro avg       0.63      0.67      0.58      4654
weighted avg       0.77      0.60      0.63      4654



In [126]:
with torch.no_grad():
    logits = trainer.predict(val_dataset)

pred = logits.predictions
pred = torch.sigmoid(torch.from_numpy(pred))
pred = torch.argmax(pred,dim=1)

### Key Clip Identification: DistilBERT

## Result: No context and old shuffle

In [109]:
print(classification_report(val_labels, pred.numpy()))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89      3561
           1       0.63      0.75      0.68      1093

    accuracy                           0.84      4654
   macro avg       0.77      0.81      0.79      4654
weighted avg       0.85      0.84      0.84      4654



## Result: Approach 1: No context and using new split (Best checkpoint - 170)

In [124]:
print(classification_report(val_labels, pred.numpy()))

              precision    recall  f1-score   support

           0       0.90      0.89      0.90      3590
           1       0.64      0.67      0.66      1061

    accuracy                           0.84      4651
   macro avg       0.77      0.78      0.78      4651
weighted avg       0.84      0.84      0.84      4651



In [127]:
print(classification_report(val_labels, pred.numpy()))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      3590
           1       0.65      0.69      0.67      1061

    accuracy                           0.84      4651
   macro avg       0.78      0.79      0.78      4651
weighted avg       0.85      0.84      0.84      4651



## Result: Approach 2: With context  and new split (Best checkpoint-255)

In [47]:
print(classification_report(val_labels, pred.numpy()))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87      3590
           1       0.57      0.42      0.49      1061

    accuracy                           0.80      4651
   macro avg       0.71      0.66      0.68      4651
weighted avg       0.78      0.80      0.78      4651



In [106]:
print(classification_report(val_labels, pred.numpy()))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      3590
           1       0.56      0.48      0.51      1061

    accuracy                           0.80      4651
   macro avg       0.71      0.68      0.69      4651
weighted avg       0.79      0.80      0.79      4651

