In [28]:
import pandas as pd
from transformers import AutoTokenizer
import numpy as np
import torch
from sklearn.metrics import classification_report
import pickle as pkl
from transformers import DataCollatorWithPadding
import evaluate
import os

In [None]:
with open("../data/full_master.pkl", "rb") as f:
    data = pkl.load(f)

In [29]:
# data = pd.read_csv("../youcook2/reviewed_0812.csv")
split_df = pd.read_csv("../data/train_val_split.csv")

In [136]:
train_url = split_df[split_df.Split == "train"]["VideoUrl"]
val_url = split_df[split_df.Split == "val"]["VideoUrl"]

In [137]:
len(train_url), len(val_url)

(253, 99)

In [138]:
def get_id(url):
    url_parts = url.split("?")

    # Extracting the query parameters part of the URL
    query_params = url_parts[1] if len(url_parts) > 1 else ""

    # Splitting the query parameters by "&" to separate individual key-value pairs
    query_params = query_params.split("&")

    # Extracting the video ID from the query parameters
    video_id = ""
    for param in query_params:
        if param.startswith("v="):
            video_id = param[2:]
        break
    return video_id

In [139]:
train_df = data[data.VideoUrl.isin(train_url.values)]
val_df = data[data.VideoUrl.isin(val_url.values)]
train_df["VideoID"] = train_df["VideoUrl"].apply(get_id)
val_df["VideoID"] = val_df["VideoUrl"].apply(get_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### Sentence Statistics

In [8]:
transcripts = []
num_sentences = []
for url in data["VideoUrl"].unique():
    sentences = data[data.VideoUrl == url]["Sentence"].tolist()
#     story = " ".join(sentences)
#     num_words = len(story.split(" "))
    num_sentences.append(len(sentences))
#     transcripts.append(num_words)

print(np.mean(num_sentences), np.median(num_sentences), np.min(num_sentences), np.max(num_sentences))

44.06534090909091 40.0 1 136


In [27]:
len(transcripts), np.mean(transcripts), np.median(transcripts), np.min(transcripts), np.max(transcripts)

(352, 717.7556818181819, 642.0, 20, 2083)

In [142]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [144]:
seed = 23456
np.random.seed(seed)

### Approach 1: Using current text only

In [145]:
train_data, train_labels = train_df["Sentence"].values, train_df["IsUsefulSentence"].values
val_data, val_labels = val_df["Sentence"].values, val_df["IsUsefulSentence"].values

### Approach 2: Using neighbourhood context

In [11]:
# num_words = []
# context_data = []
# for i in range(data.shape[0]):
#     curr_text, url = data.iloc[i]["Sentence"], data.iloc[i]["VideoUrl"]
#     context = []
#     for j in range(-3, 3):
#         index = i+j
#         if index < 0 or index > data.shape[0] - 1:
#             continue
#         context_text, context_url = data.iloc[index]["Sentence"], data.iloc[index]["VideoUrl"]
#         if context_url == url:
#             context.append(context_text)
#     num_words.append(len(" ".join(context).split(" ")))
#     context_data.append(" ".join(context))
# data["context"] = context_data

# train_data, train_labels = train_df["context"].values, train_df["IsUsefulSentence"].values
# val_data, val_labels = val_df["context"].values, val_df["IsUsefulSentence"].values

In [147]:
len(train_data), len(val_data)

(10327, 4346)

In [146]:
train_encodings = tokenizer(list(train_data), truncation=True, padding=True)
val_encodings = tokenizer(list(val_data), truncation=True,padding=True)

In [148]:
class YouCookData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [149]:
train_dataset = YouCookData(train_encodings, list(train_labels))
val_dataset = YouCookData(val_encodings, list(val_labels))

In [150]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [151]:
accuracy = evaluate.load("accuracy")

In [152]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [153]:
os.environ["WANDB_DISABLED"] = "true"

In [154]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [155]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [49]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifi

In [60]:
training_args = TrainingArguments(
    output_dir="../models/keystep_no_context",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="../logs/",
    load_best_model_at_end=True,
    logging_steps=200
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [156]:
len(val_data), len(val_labels)

(4346, 4346)

In [62]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.345065,0.837669
2,No log,0.352638,0.842829
3,0.320800,0.373735,0.840249


TrainOutput(global_step=255, training_loss=0.30083263995600684, metrics={'train_runtime': 203.3258, 'train_samples_per_second': 160.235, 'train_steps_per_second': 1.254, 'total_flos': 1534127711669280.0, 'train_loss': 0.30083263995600684, 'epoch': 3.0})

In [157]:
model = AutoModelForSequenceClassification.from_pretrained("../models/keystep_no_context/checkpoint-170/")

training_args = TrainingArguments(
    output_dir="../models/",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="../logs/",
    load_best_model_at_end=True,
    logging_steps=50
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


### Key Clip Identification: SRL results 

In [37]:
# pred_SRL_labels = data["PredUseful"].to_numpy()[indices[train_len:]]
# print(classification_report(val_labels, pred_SRL_labels))

              precision    recall  f1-score   support

           0       0.90      0.54      0.67      3561
           1       0.35      0.81      0.49      1093

    accuracy                           0.60      4654
   macro avg       0.63      0.67      0.58      4654
weighted avg       0.77      0.60      0.63      4654



### Train dataset Predictions

In [158]:
with torch.no_grad():
    logits = trainer.predict(train_dataset)

train_pred = logits.predictions
train_pred = torch.sigmoid(torch.from_numpy(train_pred))
train_pred = torch.argmax(train_pred,dim=1)

### Train dataset results

In [159]:
print(classification_report(train_labels, train_pred.numpy()))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      7911
           1       0.77      0.82      0.79      2416

    accuracy                           0.90     10327
   macro avg       0.86      0.87      0.86     10327
weighted avg       0.90      0.90      0.90     10327



### Test dataset Predictions

In [160]:
with torch.no_grad():
    logits = trainer.predict(val_dataset)

val_pred = logits.predictions
val_pred = torch.sigmoid(torch.from_numpy(val_pred))
val_pred = torch.argmax(val_pred,dim=1)

## Key Clip Identification: DistilBERT

### [BEST] Result: Approach 1: No context and using new split (Best checkpoint - 170)

In [161]:
print(classification_report(val_labels, val_pred.numpy()))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      3346
           1       0.65      0.69      0.67      1000

    accuracy                           0.84      4346
   macro avg       0.78      0.79      0.78      4346
weighted avg       0.85      0.84      0.85      4346



### Result: Approach 2: With context  and new split (Best checkpoint-255)

In [106]:
print(classification_report(val_labels, val_pred.numpy()))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      3590
           1       0.56      0.48      0.51      1061

    accuracy                           0.80      4651
   macro avg       0.71      0.68      0.69      4651
weighted avg       0.79      0.80      0.79      4651



## Save Keyclip predictions for Stage-2 of Pipeline

### Dataset wide Keyclip prediction stats

In [195]:
print(classification_report(data_checkpoint_v2["IsUsefulSentence"].values, data_checkpoint_v2["IsPredUseful"].values))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92     11257
           1       0.73      0.79      0.76      3416

    accuracy                           0.88     14673
   macro avg       0.83      0.85      0.84     14673
weighted avg       0.89      0.88      0.88     14673



In [181]:
train_df["IsPredUseful"] = train_pred.numpy()
val_df["IsPredUseful"] = val_pred.numpy()
data_checkpoint_v2 = pd.concat((train_df, val_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [192]:
with open("../data/full_master_updated.pkl", "wb") as f:
    pkl.dump(data_checkpoint_v2, f)