## Load

In [1]:
url = 'https://www.songlyrics.com/ween-lyrics/'

import requests
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
from IPython.display import clear_output

resp = requests.get(url)
soup = bs(resp.text,"lxml")
songs = soup.find_all('tr')

for s in tqdm(songs):
    lyr = s.find('a')['href']

    try:
        resp = requests.get(lyr)
        allTxt = resp.text
        soup = bs(allTxt,"lxml")
        verses = soup.find_all('p',{'class':'songLyricsV14 iComment-text'})[0].get_text().replace('\n\n\n','\n\n').split('\n\n')
        for v in verses:
            if v != '':
                v = v.replace('\n\n','\n')
                clear_output(wait=True)
                display(v)
                with open(f'weenLyricsHOLDING.txt', 'a', encoding="utf-8") as f:
                    f.write(f'{v}[split]')
    except:
        pass

"I love you even if you don't\nYou've got your knife up to my throat\nWhy do you want to see me bleed?"

100%|██████████| 402/402 [03:27<00:00,  1.94it/s]


In [37]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load models
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Load and tokenize text
with open('weenLyrics.txt','r') as f:
    text = f.read().split('[split]')
inputs = tokenizer(text, max_length=128, truncation=True, padding='max_length', return_tensors='pt')
inputs['labels'] = inputs['input_ids'].detach().clone()

# Randomly select 15% of tokens to mask
rand = torch.rand(inputs['input_ids'].shape)
maskArr = (rand < 0.15) * (inputs['input_ids'] != 101) * (inputs['input_ids'] != 102) * (inputs['input_ids'] != 0)

# Get indices of masked tokens
maskIndices = []
for i in range(maskArr.shape[0]):
    maskIndices.append(torch.flatten(maskArr[i].nonzero()).tolist())

# Mask the tokens
for i in range(maskArr.shape[0]):
    inputs['input_ids'][i, maskIndices[i]] = 103

# Create dataset class
class WeenSet(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: val[idx].detach().clone() for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings['input_ids'])

dataset = WeenSet(inputs)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)    

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Train

In [38]:
from tqdm import tqdm
from matplotlib import pyplot as plt 
from IPython.display import clear_output

device = torch.device('cuda')
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

epochs = 4
losses = []
for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        inputIds = batch['input_ids'].to(device)
        attentionMask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=inputIds, attention_mask=attentionMask, labels=labels)
        
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

model.save_pretrained(r"E:\Models\bert-finetuned\weenbert-2")

Epoch 0: 100%|██████████| 544/544 [01:22<00:00,  6.61it/s, loss=0.297] 
Epoch 1: 100%|██████████| 544/544 [01:17<00:00,  6.99it/s, loss=0.0928] 
Epoch 2: 100%|██████████| 544/544 [01:22<00:00,  6.57it/s, loss=0.052]  
Epoch 3: 100%|██████████| 544/544 [01:22<00:00,  6.57it/s, loss=0.0487] 


## Test

In [39]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

modelPath = r"E:\Models\bert-finetuned\weenbert-2"
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained(modelPath)
baseModel = BertForMaskedLM.from_pretrained('bert-base-uncased')

device = torch.device('cuda')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
import random
from tqdm import tqdm

from IPython.display import clear_output, Markdown

with open('weenlyrics.txt','r') as f:
    text = f.read().split('[split]')

text = list(filter(lambda v: 'We do not have the lyrics for' not in v,text))

tests = 5000
baseCorr = []
finetuneCorr = []

for i in tqdm(range(tests)):
    lyrIdx = round(random.random() * len(text)-1)
    verse = text[lyrIdx]
    splitText = verse.replace('\n',' \n ').replace(',',' , ').replace('.',' . ').replace('"',' " ').replace('(',' ( ').replace(')',' ) ').split(' ')[:60]
    maskVal = '.'

    while maskVal == ',' or maskVal == '.' or maskVal == '\n' or maskVal == '' or len(maskVal)<4 or len(splitText)<2 or 'We do not have the lyrics' in verse:
        lyrIdx = round(random.random() * len(text)-1)
        verse = text[lyrIdx]
        splitText = verse.replace('\n',' \n ').replace(',',' , ').replace('.',' . ').replace(' " ','"').replace(' ( ','(').replace(' ) ',')').split(' ')[:60]
        maskIdx = round(random.random() * len(splitText)-1)
        maskVal = splitText[maskIdx].replace('"','').replace('.','').replace(',','')

    splitText[maskIdx] = '[MASK]'
    prompt = ' '.join(splitText).replace(' \n ','\n').replace(' , ',',').replace(' . ','.')

    encodings = tokenizer(prompt, max_length=128, truncation=True, padding='max_length', return_tensors='pt')
    inputIds = encodings['input_ids'].to(device)

    try:
        maskIdx = (inputIds == 103).flatten().nonzero().item()
        attentionMask = encodings['attention_mask'].to(device)

        # Base
        baseModel.to(device)
        outputs = baseModel(input_ids=inputIds, attention_mask=attentionMask)
        logits = outputs.logits
        soft = logits.softmax(dim=-1)
        arg = soft.argmax(dim=-1).view(-1)
        baseGuessVal = tokenizer.convert_ids_to_tokens(arg[maskIdx].item())

        # Finetuned
        model.to(device)
        outputs = model(input_ids=inputIds, attention_mask=attentionMask)
        logits = outputs.logits
        soft = logits.softmax(dim=-1)
        arg = soft.argmax(dim=-1).view(-1)
        guessVal = tokenizer.convert_ids_to_tokens(arg[maskIdx].item())
    except:
        guessVal = 'I dunno man...'
        baseGuessVal = 'I dunno man...'

    baseCorr.append(int(baseGuessVal==maskVal.lower()))
    finetuneCorr.append(int(guessVal==maskVal.lower()))

    #clear_output(wait=True)
    #print(f'Masked Verse: \n\n{prompt}\n')
    #print(f'Finetuned Guess: {guessVal}')
    #print(f'Base Guess: {baseGuessVal}')
    #print(f'Correct Answer: {maskVal}')

  0%|          | 0/5000 [00:00<?, ?it/s]

100%|██████████| 5000/5000 [02:34<00:00, 32.44it/s]


In [41]:
import numpy as np
print(np.array(baseCorr).mean())
print(np.array(finetuneCorr).mean())

0.3012
0.4052
