# Predict Video Category Using Title and Description

---



In [None]:
! pip install contractions
! pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.2/103.2 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 KB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/sim

In [None]:
import numpy as np
import pandas as pd
import re
import contractions

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem so the dataset can be read from it.
drive.mount('/content/drive')

# Directory on the mounted Drive that holds the project data. The trailing slash matters:
# file names are appended to this string by plain concatenation.
path='/content/drive/My Drive/CSCI544/Project/data/'

Mounted at /content/drive


In [None]:
# Column names used throughout the notebook.
TITLE = "title"
DESC = "description"
TAGS = "tags"
CAT = "category_id"
TXT = "text"  # name of the combined title + description column created later

# dtype each retained column is cast to; the key order doubles as the column order.
data_types = {TITLE: str, DESC: str, TAGS: str, CAT: int}
columns = list(data_types)

# Read the raw CSV, replace missing values with empty strings, then keep only the
# columns of interest with their target dtypes.
data = (
    pd.read_csv(path + "USvideos.csv")
    .fillna("")
    [columns]
    .astype(data_types)
)

## Cleaning the data
We clean the title and description fields (removing HTML tags, URLs, non-alphanumeric characters, and extra spaces),
then concatenate the cleaned title and description into a single text field.

In [None]:
def data_cleaning(review):
    """Normalize one raw text string for tokenization.

    Steps, in order: replace HTML tags and ``http...`` links with spaces,
    expand contractions via ``contractions.fix``, replace every character
    that is not an ASCII letter or digit with a space, collapse runs of
    spaces and trim both ends.

    Args:
        review: The raw text (a title or a description).

    Returns:
        The cleaned text.
    """
    # Markup first, then links: each match becomes a single space.
    without_markup = re.sub(r'<[^>]+>', ' ', review)
    without_links = re.sub(r'http\S+', ' ', without_markup)

    # Expand contractions while the apostrophes are still present.
    expanded = contractions.fix(without_links)

    # Keep only ASCII letters and digits; everything else turns into a space.
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', expanded)

    # Squeeze repeated spaces and drop leading/trailing ones.
    return re.sub(r' +', ' ', alnum_only).strip()

In [None]:
# Run the same cleaner over both free-text columns, overwriting each with its cleaned form.
for text_col in (TITLE, DESC):
    data[text_col] = data[text_col].map(data_cleaning)

# Model input: cleaned title and cleaned description joined by a single space.
data[TXT] = data[TITLE] + " " + data[DESC]

In [None]:
# Model inputs: one combined title + description string per video.
X = data[TXT].tolist()
# Targets: the integer category_id values, used as-is as class labels downstream.
y = data[CAT].tolist()

## Training Transformer Models

In [None]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Load the tokenizer that belongs to the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Tokenize every text once up front, truncating to the tokenizer's maximum length.
# Padding is deliberately NOT applied here: the DataCollatorWithPadding handed to the
# Trainer pads each batch to its own longest sequence, which avoids storing and
# processing every example at the length of the longest text in the whole dataset.
X_token = tokenizer(X, truncation=True)

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import random_split

In [None]:
class YTDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Gather the idx-th entry of every encoding field as a tensor ...
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # ... and attach the target under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap the tokenized texts and their labels in a single dataset.
dataset = YTDataset(X_token, y)
# Fixed seed so the random split below is repeatable.
generator = torch.Generator().manual_seed(42)
# Random 70% / 10% / 20% split into train / validation / test subsets.
train, val, test = random_split(dataset, [0.7, 0.1, 0.2], generator=generator)

In [None]:
# Collator built around the tokenizer; it is handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the four evaluation metrics used by compute_metrics, in a fixed order.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy / precision / recall / F1.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, where ``predictions``
            holds per-class scores for each example (argmax is taken over
            axis 1) and ``labels`` holds the reference class ids.

    Returns:
        A single dict merging the result dicts of the four metrics.
        Precision, recall and F1 are computed with ``average="weighted"``.
    """
    scores, references = eval_pred
    # Predicted class = index of the highest score in each row.
    predicted = np.argmax(scores, axis=1)

    shared = {"predictions": predicted, "references": references}
    # Merged in this order: accuracy, precision, recall, f1.
    return {
        **accuracy.compute(**shared),
        **precision.compute(**shared, average="weighted"),
        **recall.compute(**shared, average="weighted"),
        **f1.compute(**shared, average="weighted"),
    }

## DistilBERT (base, uncased)

In [None]:
# Training configuration; passed as `args` to both Trainer instances in this notebook.
training_args = TrainingArguments(
    # Directory the run writes its output (checkpoints) to.
    output_dir="res",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and save a checkpoint on the same per-epoch schedule.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is set, so "best" presumably falls back to
    # the library default (validation loss) — confirm.
    load_best_model_at_end=True,
)

In [None]:
# The category_id values in `y` are used directly as class indices, so the classification
# head needs one output for every index from 0 up to the largest id present in the data.
# Deriving the size from `y` keeps the head valid if the dataset's categories change.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=max(y) + 1
)

# Fine-tune on the training split; the validation split is used for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [None]:
# Run fine-tuning; the evaluation metrics reported during training come from compute_metrics.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4058,0.239162,0.943101,0.940556,0.943101,0.941367
2,0.1326,0.129376,0.972894,0.971331,0.972894,0.971942


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=3584, training_loss=0.4912247636488506, metrics={'train_runtime': 2861.5789, 'train_samples_per_second': 20.034, 'train_steps_per_second': 1.252, 'total_flos': 7600044216360960.0, 'train_loss': 0.4912247636488506, 'epoch': 2.0})

In [None]:
# Evaluate on the held-out test split. Keep the full PredictionOutput in a variable and
# display only its metrics dict instead of dumping the raw logits array into the notebook.
test_output = trainer.predict(test)
test_output.metrics

  _warn_prf(average, modifier, msg_start, len(result))


PredictionOutput(predictions=array([[-3.5492313 ,  0.1098493 , -0.49843186, ..., -3.83686   ,
        -4.1627817 , -1.6061742 ],
       [-5.3745494 ,  1.2639223 , -0.33401605, ..., -4.383853  ,
        -3.9622018 , -2.6931388 ],
       [-4.606158  ,  0.6339609 , -0.9063602 , ..., -3.8837626 ,
        -4.2728167 , -2.1655104 ],
       ...,
       [-3.8204181 , -0.36584726, -0.6219143 , ..., -3.8171935 ,
        -3.6076913 , -0.4522275 ],
       [-4.414167  ,  0.8612328 ,  0.27090776, ..., -4.0073643 ,
        -4.1139917 , -1.2847471 ],
       [-4.5221586 ,  0.72109765, -0.55720747, ..., -4.0466337 ,
        -4.380435  , -2.0694227 ]], dtype=float32), label_ids=array([25, 24, 10, ..., 17, 20, 10]), metrics={'test_loss': 0.14679338037967682, 'test_accuracy': 0.9689827817804372, 'test_precision': 0.967632679000159, 'test_recall': 0.9689827817804372, 'test_f1': 0.9680087505608332, 'test_runtime': 130.7581, 'test_samples_per_second': 62.627, 'test_steps_per_second': 3.916})

In [None]:
# GPT-2 has its own vocabulary, so the texts must be encoded with the GPT-2 tokenizer;
# token ids produced by the DistilBERT tokenizer do not correspond to GPT-2's embeddings.
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 ships without a padding token; reuse EOS so batches can be padded.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

# Cap sequences at 512 tokens (DistilBERT's input limit) to keep the two runs comparable.
# Padding is left to the collator so each batch is padded only to its own longest sequence.
X_token_gpt2 = gpt2_tokenizer(X, truncation=True, max_length=512)
dataset_gpt2 = YTDataset(X_token_gpt2, y)
# Same seed, dataset length and fractions as the first split, so the partition should match.
train_gpt2, val_gpt2, test_gpt2 = random_split(
    dataset_gpt2, [0.7, 0.1, 0.2], generator=torch.Generator().manual_seed(42)
)

# category_id values are used directly as class indices, hence max(y) + 1 outputs.
model2 = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=max(y) + 1)
# The GPT-2 classification head uses pad_token_id to find the last real token of each
# sequence; without it, batches larger than one example are rejected.
model2.config.pad_token_id = gpt2_tokenizer.pad_token_id

# NOTE(review): `training_args` is reused, so this run writes to the same output_dir
# ("res") as the DistilBERT run.
trainer2 = Trainer(
    model=model2,
    args=training_args,
    train_dataset=train_gpt2,
    eval_dataset=val_gpt2,
    tokenizer=gpt2_tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=gpt2_tokenizer),
    compute_metrics=compute_metrics,
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
