In [None]:
# Mount Google Drive at /content/drive so the Drive paths used later in this
# notebook (under /content/drive/MyDrive) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Make the project folder on Drive the current working directory so that the
# relative paths used below (e.g. "youtube_data.csv") resolve inside it.
# Note: this changes the working directory; it does not modify sys.path.
os.chdir('/content/drive/MyDrive/Youtube_video_classifier')

In [None]:
import pandas as pd
import numpy as np
# Read youtube_data.csv (relative to the current working directory) into a DataFrame.
df = pd.read_csv("youtube_data.csv")

In [None]:
# Keep only the rows that have a description.
# The result is assigned rather than mutated in place so re-running this cell is
# harmless, and drop=True discards the old row labels instead of inserting them
# into the frame as an extra "index" column.
df = df.dropna(subset=["description"]).reset_index(drop=True)

In [None]:
# This run uses the description text as the only input, paired with its label.
selected_columns = ["description", "category_label"]
data = df[selected_columns]
data.head()

Unnamed: 0,description,category_label
0,Camila Cabello feat. Ed Sheeran - Bam Bam (Off...,0
1,"""Un-Break My Heart"" by Toni Braxton\nListen to...",0
2,New Album ‘The Beautiful & Damned’ Available E...,0
3,지민 (Jimin) 'Like Crazy' Official MV\n\n'FACE' ...,0
4,REMASTERED IN HD!\nGet Rihanna’s eighth studio...,0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation. The split is stratified on the label
# column and uses a fixed random_state so it can be repeated.
labels_for_stratify = data["category_label"]
train, val = train_test_split(
    data,
    test_size=0.15,
    random_state=0,
    stratify=labels_for_stratify,
)
print(len(train), len(val))

5323 940


In [None]:
# Show the training split; note the row labels are the shuffled original indices.
train

Unnamed: 0,description,category_label
712,▪︎Karate Drama▪︎\n▪︎ANNUAL SPORTS MEET ▪︎\n▪︎2...,1
6084,We are pleased to share with you a short video...,8
5376,The sports day of our school/essay writing in ...,8
4492,Part one of FRONTLINE’s four-hour series on th...,6
4727,"After only three hours of deliberation, a jury...",6
...,...,...
3317,Click here to subscribe to my channel and cli...,5
4777,This body cam video allegedly shows Hamas-affi...,6
758,Bangladesh vs New Zealand Highlights | 2nd Tes...,1
2050,#Sports #SportsAndGames #Preparestudies #Handw...,3


In [None]:
# Install the Hugging Face `datasets` package (not pinned; the recorded run
# below resolved to datasets 2.16.0).
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


## Creating a Training and Validation Dataset using Hugging Face's Datasets Library

In [None]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict

# Convert each pandas split into a Hugging Face Dataset and collect them under
# their split names. The DataFrame index travels along as "__index_level_0__".
raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (("train", train), ("val", val))
})
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['description', 'category_label', '__index_level_0__'],
        num_rows: 5323
    })
    val: Dataset({
        features: ['description', 'category_label', '__index_level_0__'],
        num_rows: 940
    })
})

In [None]:
# Inspect the first training record to see which fields a raw example carries.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'description': '▪︎Karate Drama▪︎\n▪︎ANNUAL SPORTS MEET ▪︎\n▪︎25th JANUARY  2020▪︎\n▪︎MODERN ENGLISH SCHOOL,BHATAPARA ▪︎\n☆COACH - MAST. NIKHIL RAO\n\nMore NP Videos :\n\nPyramid display by Boys on Annual Sports Meet 2020\nVideo Link :https://youtu.be/ad8_QluyJTA\n\nPyramid display by Girls on Annual Sports Meet 2020\nVideo Link : https://youtu.be/ILHLgc0YU0I\n\nKarate demo and stunt on Annual Sports Meet 2020\nVideo Link : https://youtu.be/VG6Ylai0sBI\n\nMass P.T on Annual Sports Meet 2020\nVideo Link : https://youtu.be/H2v3S40ImMA\n\nMarch Past on Annual Sports Meet 2020\nVideo Link : https://youtu.be/1B1ojkZHrQ4\n\n#Ninjaperfect #NP #karatedrama #drama #sportsdrama',
 'category_label': 1,
 '__index_level_0__': 712}

In [None]:
# Install Hugging Face `transformers` (not pinned); it supplies the tokenizer,
# model and scheduler helpers imported in the cells below.
!pip install transformers



## Tokenizing and Padding Text Data with BERT Tokenizer using Hugging Face Transformers

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding


checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Padding is not done at tokenization time; the collator pads when batches are built.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_function(example):
    """Tokenize the "description" field of the given example(s), with truncation enabled."""
    descriptions = example["description"]
    return tokenizer(descriptions, truncation=True)


# batched=True passes many rows to tokenize_function per call.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5323 [00:00<?, ? examples/s]

Map:   0%|          | 0/940 [00:00<?, ? examples/s]

In [None]:
# Show the splits again: tokenization added input_ids, token_type_ids and attention_mask.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['description', 'category_label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5323
    })
    val: Dataset({
        features: ['description', 'category_label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 940
    })
})

In [None]:
# Drop the columns that are not model inputs and rename the target column to
# "labels" so each batch can be passed straight to the model.
columns_to_drop = ["description", "__index_level_0__"]
tokenized_datasets = DatasetDict({
    split_name: tokenized_datasets[split_name]
    .remove_columns(columns_to_drop)
    .rename_column("category_label", "labels")
    for split_name in ("train", "val")
})
train_tokenized = tokenized_datasets["train"]
val_tokenized = tokenized_datasets["val"]
# Have the datasets hand back PyTorch tensors.
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5323
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 940
    })
})

## Loading the data using torch DataLoader

In [None]:
from torch.utils.data import DataLoader

# Both loaders use the same batch size and padding collator; only the training
# loader shuffles its examples.
shared_loader_args = dict(batch_size=15, collate_fn=data_collator)
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **shared_loader_args)
eval_dataloader = DataLoader(tokenized_datasets["val"], **shared_loader_args)

In [None]:
# Fetch one batch from the training loader and look at the shape of each tensor.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([15]),
 'input_ids': torch.Size([15, 512]),
 'token_type_ids': torch.Size([15, 512]),
 'attention_mask': torch.Size([15, 512])}

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head for 9 labels.
# The head's weights are not part of the checkpoint and are newly initialized
# (see the message printed below), so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=9)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Sanity check: run the sample batch through the not-yet-fine-tuned model.
# The batch contains "labels", so the output carries a loss as well as logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(2.1027, grad_fn=<NllLossBackward0>) torch.Size([15, 9])


In [None]:
# Install Hugging Face `accelerate` (not pinned; the recorded run below
# resolved to accelerate 0.25.0).
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [None]:
from torch.optim import AdamW
from accelerate import Accelerator

accelerator = Accelerator()
# Use PyTorch's AdamW. The AdamW class that shipped with transformers was
# deprecated in favour of torch.optim.AdamW and is absent from newer releases,
# so importing it from transformers breaks on an up-to-date install.
# eps and weight_decay are spelled out with the values the transformers class
# used by default, so the optimisation settings stay the same as before.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [None]:
from transformers import get_scheduler

num_epochs = 3
# The scheduler is stepped once per training batch, so the total number of
# steps is batches-per-epoch times the number of epochs.
batches_per_epoch = len(train_dataloader)
num_training_steps = batches_per_epoch * num_epochs

# "linear" schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)
print(num_training_steps)

1065


In [None]:
import torch

# Hand the dataloaders, model and optimizer to accelerate and keep using the
# objects it returns from here on.
# Note: lr_scheduler is not passed to prepare() and is used as created above.
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
   train_dataloader, eval_dataloader, model, optimizer)

## Finetuning the model with our data

In [None]:
from sklearn.metrics import accuracy_score

from tqdm.notebook import tqdm
import numpy as np

model.train()
# History across epochs: mean training loss and training accuracy.
total_loss, acc = [], []
for epoch in range(num_epochs):
    print("\n\nEpoch:", epoch+1)
    # Running totals for the current epoch.
    batch_loss = 0
    batch_preds, batch_target = [], []
    for batch in tqdm(train_dataloader, total=len(train_dataloader)):
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass through accelerate, then update weights and learning
        # rate, and clear the gradients for the next batch.
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        batch_loss += loss.item()
        logits = outputs.logits
        predicted_labels = np.argmax(logits.cpu().detach().numpy(), axis=1)
        true_labels = batch["labels"].cpu().detach().numpy()
        batch_preds.extend(predicted_labels)
        batch_target.extend(true_labels)

    # Mean loss over the batches of this epoch.
    epoch_loss = batch_loss / len(train_dataloader)
    total_loss.append(epoch_loss)
    # Accuracy over every training example seen in this epoch.
    epoch_accuracy = accuracy_score(batch_target, batch_preds)
    acc.append(epoch_accuracy)
    print("Loss:", total_loss[-1], "\tAcc:", acc[-1])




Epoch: 1


  0%|          | 0/355 [00:00<?, ?it/s]

Loss: 1.0608615462209137 	Acc: 0.6409919218485817


Epoch: 2


  0%|          | 0/355 [00:00<?, ?it/s]

Loss: 0.5677854773024438 	Acc: 0.8224685327822656


Epoch: 3


  0%|          | 0/355 [00:00<?, ?it/s]

Loss: 0.3219060633488944 	Acc: 0.9088859665602104


## Evaluating model on validation data

In [None]:
# Put the model in evaluation mode before scoring. Without this it is still in
# the train() mode set before the fine-tuning loop, so dropout stays active and
# the validation predictions are noisy.
model.eval()

# Predicted and true labels collected over the whole validation set.
eval_batch_preds, eval_batch_target = list(), list()
for batch in eval_dataloader:
  # Inference only: no gradients need to be tracked.
  with torch.no_grad():
    outputs = model(**batch)
  logits = outputs.logits
  # The predicted class is the index of the largest logit in each row.
  eval_batch_preds.extend(np.argmax(logits.cpu().numpy(), axis=1))
  eval_batch_target.extend(batch["labels"].cpu().numpy())
accuracy = accuracy_score(eval_batch_target, eval_batch_preds)
print("accuracy: ",accuracy)

accuracy:  0.8138297872340425


In [None]:
# Save the underlying transformers model. `model` came back from
# accelerator.prepare(), which may return a wrapper that does not expose
# save_pretrained (e.g. in a distributed setup); unwrap_model returns the
# original model in either case.
accelerator.unwrap_model(model).save_pretrained("/content/drive/MyDrive/Youtube_video_classifier/model_2")

In [None]:
from transformers import AutoModelForSequenceClassification
# Reload the fine-tuned model from the directory it was saved to above.
# NOTE(review): loaded_model is not referenced again in this notebook;
# predict_category loads its own copy from the same path.
loaded_model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Youtube_video_classifier/model_2")

In [None]:
from Helper_functions import get_video_data

In [None]:
# Paste the URL of the YouTube video to classify
url = 'https://www.youtube.com/watch?v=ElZfdU54Cp8'

In [None]:
def predict_category(url):
  """Predict the category of a YouTube video from its description.

  Args:
    url: URL of the video; it is passed to get_video_data to fetch the details.

  Returns:
    A tuple (predicted_category, category). predicted_category is the name of
    the class chosen by the fine-tuned model, or None when the video has no
    usable description. category is the "category" field taken from the
    details returned by get_video_data.
  """
  import pandas as pd
  details = get_video_data(url)
  df = pd.DataFrame(data=[details],columns=["video_id","category_Id","category","category_label","title","description","tags","viewCount","likeCount","dislikeCount","commentCount","video_comments"])
  category = df["category"][0]
  description = df["description"][0]

  # Nothing to classify when the description is missing or blank. A blank
  # string is handled like a missing value so the model is never asked to
  # classify empty text.
  if pd.isnull(description) or not str(description).strip():
    return None , category

  import torch
  from transformers import AutoTokenizer, AutoModelForSequenceClassification

  # Load the fine-tuned model and the tokenizer of the base checkpoint it was trained from
  model_path = "/content/drive/MyDrive/Youtube_video_classifier/model_2"
  tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
  model = AutoModelForSequenceClassification.from_pretrained(model_path)
  # Inference only: make sure dropout is off.
  model.eval()

  # Tokenize with truncation, as was done for the training data. Without it a
  # description longer than the model's maximum sequence length makes the
  # forward pass fail.
  inputs = tokenizer(description, return_tensors="pt", truncation=True)

  # Make a prediction; gradients are not needed here.
  with torch.no_grad():
    outputs = model(**inputs)

  # Get the predicted class probabilities
  probabilities = outputs.logits.softmax(dim=1)

  # Get the predicted class index
  predicted_class = torch.argmax(probabilities, dim=1).item()

  # Class names indexed by label id (9 entries, matching num_labels used for training).
  categories = ['music','sports','gaming','education','film/animation','entertainment','news and politics','comedy','other']

  return categories[predicted_class] , category


In [None]:
# Classify the video at `url`; the second value is the category reported in the
# video's own data, kept for comparison with the prediction.
predicted_category , category = predict_category(url)

In [None]:
# predict_category returns None as the prediction when the video has no description.
if predicted_category is None:
  print("No description is given for the video")
else:
  # The model is trained and run on the description only, so the message no
  # longer claims that tags were used.
  print(f"The category predicted by the model using the description is : {predicted_category}")
print(f"The actual category of the video is : {category} (from youtube data)")

The category predicted by the model using description and tags is : music
The actual category of the video is : music (from youtube data)
