In [1]:
!pip install torch
!pip install tokenizers
!pip install transformers

[0m

# Pre-training BERT on Youtube Comments Dataset

In [2]:
import pandas as pd
import numpy as np

from transformers import BertTokenizer, BertForMaskedLM
import torch



In [3]:
gb_comments_df = pd.read_csv('/kaggle/input/youtube-videos-title-description-comments/GBcomments.csv', on_bad_lines='skip')
gb_comments_df.head()

Unnamed: 0,video_id,comment_text,likes,replies
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0


In [4]:
us_comments_df = pd.read_csv('/kaggle/input/youtube-videos-title-description-comments/UScomments.csv', on_bad_lines='skip')
us_comments_df.head()

  us_comments_df = pd.read_csv('/kaggle/input/youtube-videos-title-description-comments/UScomments.csv', on_bad_lines='skip')


Unnamed: 0,video_id,comment_text,likes,replies
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0
4,XpVt6Z1Gjjo,trending 😉,3,0


In [5]:
comments_df = pd.concat([gb_comments_df, us_comments_df], axis=0)
print("Len of GB data - ", len(gb_comments_df))
print("Len of US data - ", len(us_comments_df))
print("Len of combined data - ", len(comments_df))
comments_df.head()

Len of GB data -  718452
Len of US data -  691400
Len of combined data -  1409852


Unnamed: 0,video_id,comment_text,likes,replies
0,jt2OHQh0HoQ,It's more accurate to call it the M+ (1000) be...,0,0
1,jt2OHQh0HoQ,To be there with a samsung phone\n😂😂😂,1,0
2,jt2OHQh0HoQ,"Thank gosh, a place I can watch it without hav...",0,0
3,jt2OHQh0HoQ,What happened to the home button on the iPhone...,0,0
4,jt2OHQh0HoQ,Power is the disease. Care is the cure. Keep...,0,0


In [6]:
len(comments_df.comment_text.unique())

622630

In [7]:
comments = comments_df.comment_text.unique()
comments[:5]

array(["It's more accurate to call it the M+ (1000) because the price is closer than calling it the X (10).",
       'To be there with a samsung phone\\n😂😂😂',
       'Thank gosh, a place I can watch it without having to be at HD... my speed doesn’t support HD',
       'What happened to the home button on the iPhone X? *****Cough****copying Samsung******cough',
       'Power is the disease.\xa0 Care is the cure.\xa0 Keep caring for yourself and others as best as you can.\xa0 This is life.'],
      dtype=object)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device.type)

Device:  cuda


In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

text = ("After Abraham Lincoln won the November 1860 presidential "
        "election on an anti-slavery platform, an initial seven "
        "slave states declared their secession from the country "
        "to form the Confederacy. War broke out in April 1861 "
        "when secessionist forces attacked Fort Sumter in South "
        "Carolina, just over a month after Lincoln's "
        "inauguration.")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
inputs = tokenizer(text, return_tensors='pt')

In [11]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [12]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [13]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

Masking 15% of the tokens

In [14]:
rand = torch.rand(inputs.input_ids.shape)
# where the random array is less than 0.15, we set true
mask_arr = rand < 0.15
mask_arr

tensor([[False, False, False,  True, False,  True, False, False, False, False,
         False, False,  True, False, False, False,  True,  True, False, False,
         False, False,  True,  True, False, False, False, False,  True, False,
         False, False, False, False, False,  True, False, False, False, False,
         False, False, False, False, False, False,  True, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False]])

We don’t want to place a MASK token over other special tokens such as CLS or SEP tokens (101 and 102 respectively).

So, we need to add an additional condition. A check for positions containing the token ids 101 or 102.

In [15]:
(inputs.input_ids != 101) * (inputs.input_ids != 102)

tensor([[False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True, False]])

In [16]:
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102)
mask_arr

tensor([[False, False, False,  True, False,  True, False, False, False, False,
         False, False,  True, False, False, False,  True,  True, False, False,
         False, False,  True,  True, False, False, False, False,  True, False,
         False, False, False, False, False,  True, False, False, False, False,
         False, False, False, False, False, False,  True, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False]])

Getting indices to be masked

In [17]:
# create selection from mask_arr
selection = torch.flatten((mask_arr[0]).nonzero()).tolist()
selection

[3, 5, 12, 16, 17, 22, 23, 28, 35, 46]

masking

In [18]:
inputs.input_ids[0, selection] = 103 # mask token = 103

In [19]:
inputs.input_ids

tensor([[  101,  2044,  8181,   103,  2180,   103,  2281,  7313,  4883,  2602,
          2006,  2019,   103,  1011,  8864,  4132,   103,   103,  3988,  2698,
          6658,  2163,   103,   103, 22965,  2013,  1996,  2406,   103,  2433,
          1996, 18179,  1012,  2162,  3631,   103,  1999,  2258,  6863,  2043,
         22965,  2923,  2749,  4457,  3481,  7680,   103,  1999,  2148,  3792,
          1010,  2074,  2058,  1037,  3204,  2044,  5367,  1005,  1055, 17331,
          1012,   102]])

In [20]:
outputs = model(**inputs)

In [21]:
outputs.keys()

odict_keys(['loss', 'logits'])

In [22]:
outputs.loss

tensor(0.7331, grad_fn=<NllLossBackward0>)

## Training

In [23]:
from tqdm import notebook
from nltk.tokenize import RegexpTokenizer
import re

In [24]:
def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    cleantext = re.sub(r'http\S+', '',cleantext)
    return cleantext

In [25]:
def clean(sentences):
    cleaned_sentences = []
    for sentence in notebook.tqdm(sentences):
        cleaned_sentences.append(preprocess(sentence))
    return cleaned_sentences

In [26]:
text = clean(comments)
text[:5]

  0%|          | 0/622630 [00:00<?, ?it/s]

["it's more accurate to call it the m+ (1000) because the price is closer than calling it the x (10).",
 'to be there with a samsung phone\\n😂😂😂',
 'thank gosh, a place i can watch it without having to be at hd... my speed doesn’t support hd',
 'what happened to the home button on the iphone x? *****cough****copying samsung******cough',
 'power is the disease.\xa0 care is the cure.\xa0 keep caring for yourself and others as best as you can.\xa0 this is life.']

In [27]:
%%time
inputs = tokenizer(text, return_tensors='pt', max_length=256, truncation=True, padding='max_length')

CPU times: user 9min 31s, sys: 4.64 s, total: 9min 36s
Wall time: 9min 37s


In [28]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [29]:
inputs.input_ids.shape

torch.Size([622630, 256])

In [30]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [31]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [32]:
selection = []

for i in notebook.tqdm(range(inputs.input_ids.shape[0])):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

  0%|          | 0/622630 [00:00<?, ?it/s]

In [33]:
selection[:5]

[[9, 14, 16, 17, 19, 20, 26, 27, 28],
 [8],
 [1, 10, 16],
 [1, 6, 9, 15, 18],
 [9, 11, 20, 23, 26]]

In [34]:
for i in notebook.tqdm(range(inputs.input_ids.shape[0])):
    inputs.input_ids[i, selection[i]] = 103

  0%|          | 0/622630 [00:00<?, ?it/s]

In [35]:
inputs.input_ids[:5]

tensor([[ 101, 2009, 1005,  ...,    0,    0,    0],
        [ 101, 2000, 2022,  ...,    0,    0,    0],
        [ 101,  103, 2175,  ...,    0,    0,    0],
        [ 101,  103, 3047,  ...,    0,    0,    0],
        [ 101, 2373, 2003,  ...,    0,    0,    0]])

In [36]:
class CommentsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [37]:
dataset = CommentsDataset(inputs)

In [38]:
loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

In [39]:
model.to(device)
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [40]:
from transformers import AdamW
optim = AdamW(model.parameters(), lr=5e-5)



In [41]:
epochs = 2
for epoch in range(epochs):
    loop = notebook.tqdm(loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/19458 [00:00<?, ?it/s]

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  0%|          | 0/19458 [00:00<?, ?it/s]

In [42]:
torch.save(model.state_dict(), 'bert-base-uncased-youtube-comments.pth')