In [None]:
# Colab-only: mount Google Drive at /content/drive so the project folder
# referenced by the later cells is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Make the project folder the current working directory so the relative paths
# used below (test_data.csv, the *.pkl files, model_1.keras) resolve.
# Note: this changes the working directory; it does not modify sys.path.
os.chdir('/content/drive/MyDrive/Youtube_video_classifier')

In [None]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re


# Fetch the NLTK resources needed by `preprocessing` (stop-word list and the
# WordNet data used by the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')
# Stored as a set: `preprocessing` only ever tests membership (`w not in
# eng_stopwords`) once per token, and a set makes that a constant-time lookup
# instead of a scan over the whole list.
eng_stopwords=set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device_name = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
print(device_name)

cuda


## Loading Model 2: Fine-tuned BERT model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned sequence-classification model saved under model_2.
# NOTE(review): the tokenizer is loaded from the stock "bert-base-uncased"
# checkpoint rather than from model_path — confirm this matches the tokenizer
# used during fine-tuning.
model_path = "/content/drive/MyDrive/Youtube_video_classifier/model_2"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model_2 = AutoModelForSequenceClassification.from_pretrained(model_path)
# Move the weights to the selected device. As the last expression of the cell
# this also displays the module repr.
model_2.to(device_name)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
#Function to make prediction from model_2
def predict_from_model_2(data,model=model_2):
  """Return per-class probabilities from the fine-tuned transformer.

  Args:
    data: DataFrame with a "description" column of raw text.
    model: sequence-classification model to run (defaults to `model_2`).

  Returns:
    NumPy array with one row of softmax probabilities per description,
    i.e. shape (len(data), num_labels).
  """
  predictions = []
  with torch.no_grad():
    for description in data["description"]:
      # Texts are processed one at a time, truncated to 512 tokens.
      inputs = tokenizer(description,truncation=True, max_length=512, return_tensors="pt")
      inputs = inputs.to(device_name)

      # Softmax over the class axis turns the logits into probabilities.
      probabilities = model(**inputs).logits.softmax(dim=1)

      # A single input yields shape (1, num_labels); drop the batch axis
      # instead of hard-coding the number of classes.
      predictions.append(probabilities.squeeze(0).cpu().numpy())

  # Release cached GPU blocks once at the end rather than after every sample.
  torch.cuda.empty_cache()

  return np.array(predictions)

In [None]:
# Read the evaluation set from test_data.csv in the current working directory.
test_data = pd.read_csv("test_data.csv")

In [None]:
# Number of rows in the evaluation set.
len(test_data)

940

In [None]:
# Class probabilities from model 2, computed on the raw "description" column.
predictions_2 = predict_from_model_2(test_data)

In [None]:
def preprocessing(df):
  """Tokenise description + tags into cleaned word lists for models 0 and 1.

  For each row the description (with the tags appended when present) has URLs
  removed, is lower-cased and split into word tokens; digits are stripped from
  every token, empty tokens and English stop words are dropped, and the
  remaining tokens are lemmatised.

  Args:
    df: DataFrame containing "category", "category_label", "description"
      and "tags" columns.

  Returns:
    A copy of those four columns plus a "description_with_tags" column holding
    the token list for each row.
  """
  df = df[["category","category_label","description","tags"]]
  token_lists = []
  for _, record in df.iterrows():
    text = record["description"]
    if pd.notnull(record["tags"]):
      text = text + record["tags"]
    text = re.sub(r'http\S+', '', text)
    raw_tokens = re.findall(r'\w+', text.lower())
    # Strip every numerical digit from each token and discard tokens left empty.
    digit_free = (''.join(ch for ch in token if not ch.isdigit()) for token in raw_tokens)
    kept_tokens = [token for token in digit_free if len(token)>0]
    # Remove stop words, then lemmatise what is left.
    token_lists.append([lemmatizer.lemmatize(token) for token in kept_tokens if token not in eng_stopwords])

  data = df.copy()
  data.loc[:,("description_with_tags")]=token_lists

  return data

## Loading Model 1: Neural Network

In [None]:
# Vocabulary for model 1: a mapping from token to integer id that includes an
# '[UNK]' entry (see predict_from_model_1).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('vocabulary_model_1.pkl', 'rb') as fp:
    vocabulary_model_1 = pickle.load(fp)
# Keras model stored alongside the notebook in the project folder.
model_1 = tf.keras.models.load_model('model_1.keras')

In [None]:
#Function to make prediction from model_1
def predict_from_model_1(data,model=model_1,vocab_dict=vocabulary_model_1):
  """Return model 1's class scores for every row of `data`.

  Args:
    data: DataFrame with a "description_with_tags" column of token lists
      (as produced by `preprocessing`).
    model: Keras model to run (defaults to `model_1`).
    vocab_dict: token -> integer id mapping containing an '[UNK]' entry.

  Returns:
    The array returned by `model.predict`, one row per input description.
  """
  # Fixed input length; presumably the sequence length the Keras model was
  # trained with — TODO confirm.
  max_len = 1197
  unk_ID = vocab_dict['[UNK]']
  padded_descriptions = []
  for description in data["description_with_tags"]:
    # Truncate over-long descriptions instead of returning early: the previous
    # `return tensor[:max_len]` aborted the whole batch and handed back a raw
    # id list rather than predictions.
    token_ids = [vocab_dict.get(word,unk_ID) for word in description[:max_len]]

    # Pad with zeroes up to max_len so every row has the same length.
    token_ids += [0]*(max_len-len(token_ids))

    padded_descriptions.append(token_ids)
  inputs = np.array(padded_descriptions)
  predictions = model.predict(inputs, verbose=False)
  return predictions

In [None]:
# Class scores from model 1 on the tokenised description + tags.
predictions_1 = predict_from_model_1(preprocessing(test_data))

## Loading files for Model 0: Naive Bayes classifier

In [None]:
# Model 0 is stored as three pickles: its vocabulary, the per-word category
# probabilities and the category priors.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
_model_0_objects = []
for _model_0_path in ('vocabulary_model_0.pkl', 'word_category_probabilities.pkl', 'prior_probabilities.pkl'):
    with open(_model_0_path, 'rb') as fp:
        _model_0_objects.append(pickle.load(fp))
vocabulary_model_0, word_category_probabilities, prior_probabilities = _model_0_objects

In [None]:
#Function to make prediction from model_0
def predict_from_model_0(data,vocabulary = vocabulary_model_0,word_category_probabilities=word_category_probabilities,prior_probabilities=prior_probabilities):
  """Score every row of `data` with the Naive Bayes tables.

  For each category the score is log10(prior) plus the sum of
  log10(P(word | category)) over the tokens found in `vocabulary`; tokens
  outside the vocabulary are ignored. The nine scores are then passed through
  a softmax.

  Args:
    data: DataFrame with a "description_with_tags" column of token lists.
    vocabulary: container of known words.
    word_category_probabilities: nested mapping word -> category -> probability.
    prior_probabilities: mapping category -> prior probability.

  Returns:
    NumPy array with one row of nine softmax values per input row.
  """
  categories = ['music','sports','gaming','education','film/animation','entertainment','news and politics','comedy','other']
  prediction = []
  for tokens in data["description_with_tags"]:
    # Out-of-vocabulary words contribute nothing, so filter them out once.
    known_words = [word for word in tokens if word in vocabulary]
    scores = []
    for cat in categories:
      log_likelihood = sum(np.log10(word_category_probabilities[word][cat]) for word in known_words)
      scores.append(log_likelihood + np.log10(prior_probabilities[cat]))
    logits = np.array(scores)
    prediction.append(tf.nn.softmax(tf.convert_to_tensor(logits)).numpy())
  return np.array(prediction)

In [None]:
# Class scores from model 0; note that preprocessing(test_data) is recomputed here.
predictions_0 = predict_from_model_0(preprocessing(test_data))

In [None]:
# Sanity check: one row per test sample, one column per category.
predictions_0.shape

(940, 9)

In [None]:
# Hard decision per model: the index of the highest score in each row.
pred_0, pred_1, pred_2 = (
    np.argmax(scores, axis=1)
    for scores in (predictions_0, predictions_1, predictions_2)
)

## Making the ensemble prediction by majority vote over the three models

In [None]:
from collections import Counter
# Stack the per-model labels as rows so that each column holds the three votes
# cast for one sample, in the order model 2, model 1, model 0.
pred = np.array([pred_2 , pred_1 , pred_0])
# Majority vote per sample. When all three models disagree, most_common keeps
# the first label it saw, i.e. model 2's vote.
final_pred = [Counter(votes).most_common(1)[0][0] for votes in pred.T]


In [None]:
#Final accuracy
from sklearn.metrics import accuracy_score
# `true_values` was never defined anywhere in this notebook, so this cell only
# worked with leftover kernel state. Derive the ground truth from the test set.
# NOTE(review): assumes the "category_label" column stores the integer class
# index in the same order as the `categories` list used by the models — confirm.
true_values = test_data["category_label"].to_numpy()
acc = accuracy_score(true_values,final_pred)
acc

0.8148936170212766

In [None]:
#predictions from url
from Helper_functions import get_video_data
from collections import Counter
def predict_category(url,model0=predict_from_model_0,model1=predict_from_model_1,model2=predict_from_model_2):
  """Predict the category of a YouTube video from its description and tags.

  Args:
    url: video URL passed to `get_video_data`.
    model0, model1, model2: prediction functions for the three models; each
      takes a DataFrame and returns one row of class scores per input row.

  Returns:
    (predicted_category, original_category):
      * (None, None) when no data could be fetched for the URL,
      * (None, original_category) when the video has no description,
      * otherwise the majority-vote category name and the original category.
  """
  categories = ['music','sports','gaming','education','film/animation','entertainment','news and politics','comedy','other']
  details = get_video_data(url)
  if (details==[None]*12):
    return None , None
  df = pd.DataFrame(data=[details],columns=["video_id","category_Id","category","category_label","title","description","tags","viewCount","likeCount","dislikeCount","commentCount","video_comments"])

  # Stringify the tags *value*, not the whole Series: str(Series) also embeds
  # the row index and the "Name: tags, dtype: object" footer, which leaked
  # spurious tokens into the text. Missing tags stay missing so that
  # `preprocessing` skips them.
  raw_tags = df["tags"][0]
  tags_missing = raw_tags is None or (isinstance(raw_tags, float) and np.isnan(raw_tags))
  df["tags"] = [None if tags_missing else str(raw_tags)]

  category = df["category"][0]
  if pd.isnull(df["description"][0]):
    return None , category

  # Tokenise once and share the result between the two bag-of-words models.
  features = preprocessing(df)
  # Use the injected prediction functions (previously the parameters were
  # ignored). Order matters: on a three-way split the first vote, model 2's, wins.
  votes = [
    np.argmax(model2(df),axis=1)[0],
    np.argmax(model1(features),axis=1)[0],
    np.argmax(model0(features),axis=1)[0],
  ]
  winner = Counter(votes).most_common(1)[0][0]
  return categories[winner],category




In [None]:
# Paste a video URL here, then run this cell and the next one to get the prediction.
url = 'https://www.youtube.com/watch?v=prmmCg5bKxA'
# predict_category returns (None, None) when the video data cannot be fetched
# and (None, category) when the video has no description.
predicted_category , original_category = predict_category(url)

In [None]:
# Report the outcome: either both categories, only the original one (no
# description available), or a fetch failure when neither is set.
if predicted_category or original_category:
  if predicted_category:
    print(f"The category predicted by the model using description and tags is : {predicted_category}")
  else:
    print("No description is given for the video")
  print(f"The actual category of the video is : {original_category} (from youtube data)")
else:
  print("Sorry! Not able to fetch data from given url. Try with another url")

The category predicted by the model using description and tags is : music
The actual category of the video is : music (from youtube data)
