In [1]:
!pip install torch
!pip install tokenizers
!pip install transformers

[0m

# Pre-training BERT on Youtube Comments Dataset

In [2]:
import pandas as pd
import numpy as np
import random

from tqdm import notebook
from nltk.tokenize import RegexpTokenizer
import re

from transformers import BertTokenizer, BertForMaskedLM, AdamW
import torch



In [3]:
gb_comments_df = pd.read_csv('/kaggle/input/youtube-videos-title-description-comments/GBcomments.csv', on_bad_lines='skip')
gb_comments_df.head()

Unnamed: 0,video_id,comment_text,likes,replies
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0


In [4]:
us_comments_df = pd.read_csv('/kaggle/input/youtube-videos-title-description-comments/UScomments.csv', on_bad_lines='skip')
us_comments_df.head()

  us_comments_df = pd.read_csv('/kaggle/input/youtube-videos-title-description-comments/UScomments.csv', on_bad_lines='skip')


Unnamed: 0,video_id,comment_text,likes,replies
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0
4,XpVt6Z1Gjjo,trending 😉,3,0


In [5]:
comments_df = pd.concat([gb_comments_df, us_comments_df], axis=0)
print("Len of GB data - ", len(gb_comments_df))
print("Len of US data - ", len(us_comments_df))
print("Len of combined data - ", len(comments_df))
comments_df.head()

Len of GB data -  718452
Len of US data -  691400
Len of combined data -  1409852


Unnamed: 0,video_id,comment_text,likes,replies
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0


In [6]:
len(comments_df.comment_text.unique())

622630

In [7]:
comments = random.choices(comments_df.comment_text.unique(), k=100000)
comments[:5]

['They put a GoPro on a turtle.',
 'nate diaz killing now @_@ what if nk drop a nuke how many will die a couple of millions..??',
 'the free trial dosent work',
 'Valentina, never leave it in the hands of the judges. To win a title fight you need to knock out or submit your opponent. You did not do that so stop whining!!!!',
 'Christ, Trump is such an embarrassing man child that it’s even impossible for Faux News to ignore it at this point. How hilarious.']

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device.type)

Device:  cuda


In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

text = ("After Abraham Lincoln won the November 1860 presidential "
        "election on an anti-slavery platform, an initial seven "
        "slave states declared their secession from the country "
        "to form the Confederacy. War broke out in April 1861 "
        "when secessionist forces attacked Fort Sumter in South "
        "Carolina, just over a month after Lincoln's "
        "inauguration.")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
inputs = tokenizer(text, return_tensors='pt')

In [11]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [12]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [13]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

Masking 15% of the tokens

In [14]:
rand = torch.rand(inputs.input_ids.shape)
# where the random array is less than 0.15, we set true
mask_arr = rand < 0.15
mask_arr

tensor([[False,  True, False, False, False, False, False, False, False,  True,
         False, False, False, False, False, False, False, False,  True,  True,
          True, False, False, False, False, False, False,  True,  True,  True,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False,  True, False, False, False, False,
         False, False, False, False, False, False,  True, False,  True, False,
         False, False]])

We don’t want to place a MASK token over other special tokens such as CLS or SEP tokens (101 and 102 respectively).

So, we need to add an additional condition. A check for positions containing the token ids 101 or 102.

In [15]:
(inputs.input_ids != 101) * (inputs.input_ids != 102)

tensor([[False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True, False]])

In [16]:
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102)
mask_arr

tensor([[False,  True, False, False, False, False, False, False, False,  True,
         False, False, False, False, False, False, False, False,  True,  True,
          True, False, False, False, False, False, False,  True,  True,  True,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False,  True, False, False, False, False,
         False, False, False, False, False, False,  True, False,  True, False,
         False, False]])

Getting indices to be masked

In [17]:
# create selection from mask_arr
selection = torch.flatten((mask_arr[0]).nonzero()).tolist()
selection

[1, 9, 18, 19, 20, 27, 28, 29, 45, 56, 58]

masking

In [18]:
inputs.input_ids[0, selection] = 103 # mask token = 103

In [19]:
inputs.input_ids

tensor([[  101,   103,  8181,  5367,  2180,  1996,  2281,  7313,  4883,   103,
          2006,  2019,  3424,  1011,  8864,  4132,  1010,  2019,   103,   103,
           103,  2163,  4161,  2037, 22965,  2013,  1996,   103,   103,   103,
          1996, 18179,  1012,  2162,  3631,  2041,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,   103,  3334,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,   103,  1005,   103, 17331,
          1012,   102]])

In [20]:
outputs = model(**inputs)

In [21]:
outputs.keys()

odict_keys(['loss', 'logits'])

In [22]:
outputs.loss

tensor(1.2177, grad_fn=<NllLossBackward0>)

## Training

In [23]:
def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    cleantext = re.sub(r'http\S+', '',cleantext)
    return cleantext

In [24]:
def clean(sentences):
    cleaned_sentences = []
    for sentence in notebook.tqdm(sentences):
        cleaned_sentences.append(preprocess(sentence))
    return cleaned_sentences

In [25]:
text = clean(comments)
text[:5]

  0%|          | 0/100000 [00:00<?, ?it/s]

['they put a gopro on a turtle.',
 'nate diaz killing now @_@ what if nk drop a nuke how many will die a couple of millions..??',
 'the free trial dosent work',
 'valentina, never leave it in the hands of the judges. to win a title fight you need to knock out or submit your opponent. you did not do that so stop whining!!!!',
 'christ, trump is such an embarrassing man child that it’s even impossible for faux news to ignore it at this point. how hilarious.']

In [26]:
%%time
inputs = tokenizer(text, return_tensors='pt', max_length=256, truncation=True, padding='max_length')

CPU times: user 1min 37s, sys: 674 ms, total: 1min 38s
Wall time: 1min 38s


In [27]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [28]:
inputs.input_ids.shape

torch.Size([100000, 256])

In [29]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [30]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [31]:
selection = []

for i in notebook.tqdm(range(inputs.input_ids.shape[0])):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

  0%|          | 0/100000 [00:00<?, ?it/s]

In [32]:
selection[:5]

[[2, 9], [4, 7, 25, 26], [], [6, 17, 32, 33, 34, 38], [5, 8, 16, 20, 23]]

In [33]:
for i in notebook.tqdm(range(inputs.input_ids.shape[0])):
    inputs.input_ids[i, selection[i]] = 103

  0%|          | 0/100000 [00:00<?, ?it/s]

In [34]:
inputs.input_ids[:5]

tensor([[  101,  2027,   103,  ...,     0,     0,     0],
        [  101,  8253, 12526,  ...,     0,     0,     0],
        [  101,  1996,  2489,  ...,     0,     0,     0],
        [  101, 24632,  2050,  ...,     0,     0,     0],
        [  101,  4828,  1010,  ...,     0,     0,     0]])

In [35]:
class CommentsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [36]:
dataset = CommentsDataset(inputs)

In [37]:
loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

In [38]:
model.to(device)
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [39]:
optim = AdamW(model.parameters(), lr=5e-5)



In [40]:
epochs = 2
for epoch in range(epochs):
    loop = notebook.tqdm(loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/3125 [00:00<?, ?it/s]

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  0%|          | 0/3125 [00:00<?, ?it/s]

In [41]:
model.save_pretrained('bert-base-uncased-youtube-comments')