In [2]:
pip install datasets



In [3]:
pip install transformers




In [4]:
pip install tokenizers



In [5]:
pip install pysentimiento



In [6]:
# Tweet normaliser from pysentimiento.
from pysentimiento.preprocessing import preprocess_tweet

# Sanity check on one English tweet. The recorded cell output shows the user handle
# replaced by @USER and each emoji expanded into an "emoji ... emoji" text span.
preprocess_tweet('RT @bellahtyrah: I love these tears😂😂😂TACHA OUR BITCOIN#GodMadeTacha' , lang='en')

'RT @USER: I love these tears emoji face with tears of joy emoji  emoji face with tears of joy emoji  emoji face with tears of joy emoji TACHA OUR BITCOIN#GodMadeTacha'

In [7]:
# NOTE(review): load_dataset is only referenced by the commented-out line at the end of this cell.
from datasets import load_dataset
import pandas as pd

# Read the corpus CSV from the mounted Drive into a DataFrame.
# lineterminator='\n' sets the character pandas uses to split the file into rows.
df = pd.read_csv('/content/drive/MyDrive/mamin.csv',lineterminator='\n')
# Unused alternative: load a CSV through the `datasets` library instead of pandas.
#dataset = load_dataset('csv', data_files='/content/drive/MyDrive/mamin3.csv')

In [8]:
import os

from tqdm.auto import tqdm  # for our loading bar

# Split the 'text' column into plain-text files of at most `chunk_size` samples,
# one sample per line, named text_0.txt, text_1.txt, ...
chunk_size = 10_000
out_dir = '/content/drive/MyDrive/text'
# Create the target folder up front so the cell also works on a fresh Drive.
os.makedirs(out_dir, exist_ok=True)

text_data = []
file_count = 0

for sample in tqdm(df['text']):
    # Missing cells come back from pandas as float NaN, which has no .replace();
    # there is no text to keep, so skip anything that is not a string.
    if not isinstance(sample, str):
        continue
    # '\n' is used exclusively as the sample separator inside the files, so line
    # breaks that occur inside a sample are removed first.
    sample = sample.replace('\n', '').replace('\r', '')
    text_data.append(sample)
    if len(text_data) == chunk_size:
        # a full chunk is ready: write it out and start collecting the next one
        with open(f'{out_dir}/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
        text_data = []
        file_count += 1

# Write whatever is left over after the last full chunk. When the sample count is an
# exact multiple of chunk_size there is nothing left, and writing anyway would leave
# an empty text_<n>.txt behind for the later cells to pick up.
if text_data:
    with open(f'{out_dir}/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(text_data))


  0%|          | 0/2100000 [00:00<?, ?it/s]

In [9]:
from pathlib import Path

# Collect only the corpus chunks (text_<n>.txt) that sit directly in the text folder.
# A recursive '**/*.txt' would also match token/merges.txt, which the tokenizer-saving
# cell writes under this same directory, so a re-run would feed merge rules in as text.
# Glob order is not guaranteed, and later cells slice paths[...], so the result is
# sorted; (length, name) ordering puts text_2 before text_10.
paths = [
    str(p)
    for p in sorted(
        Path('/content/drive/MyDrive/text').glob('text_*.txt'),
        key=lambda p: (len(p.name), p.name),
    )
]


In [10]:
from tokenizers import ByteLevelBPETokenizer

# Start from an empty byte-level BPE tokenizer ...
tokenizer = ByteLevelBPETokenizer()
# ... and fit it on the first two corpus files only.
tokenizer.train(
    files=paths[:2],
    vocab_size=30_522,
    min_frequency=2,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
)


In [11]:
import os

# save_model writes vocab.json and merges.txt into the given directory, so make sure
# that directory is there first. exist_ok=True keeps the cell re-runnable, whereas a
# bare os.mkdir raises on the second run (the previously commented-out mkdir also
# pointed at a different folder, 'token1').
token_dir = '/content/drive/MyDrive/text/token'
os.makedirs(token_dir, exist_ok=True)

tokenizer.save_model(token_dir)

['/content/drive/MyDrive/text/token/vocab.json',
 '/content/drive/MyDrive/text/token/merges.txt']

In [12]:
from transformers import RobertaTokenizer

# Reload the saved vocab/merges as a transformers tokenizer.
# `model_max_length` is the current name of the length-limit argument; `max_len` is
# only a deprecated alias for it.
# Note: this rebinds `tokenizer` from the ByteLevelBPETokenizer trained earlier.
tokenizer = RobertaTokenizer.from_pretrained('/content/drive/MyDrive/text/token', model_max_length=512)

In [13]:
# Testing the tokenizer

# Encode one tweet, padding or truncating it to exactly 15 token ids.
test = 'RT @bellahtyrah: I love these tears😂😂😂TACHA OUR BITCOIN#GodMadeTacha'
tokenizer(test, max_length=15, padding='max_length', truncation=True)


{'input_ids': [0, 297, 269, 23615, 457, 93, 5343, 30, 318, 1770, 1195, 26180, 9017, 2942, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
import torch

def mlm(tensor, mask_prob=0.15, mask_token_id=4, max_special_id=2):
    """Return a masked copy of a batch of token ids for masked-language modelling.

    Every position whose id is greater than `max_special_id` is replaced by
    `mask_token_id` with probability `mask_prob`, independently of the others.
    Ids 0..max_special_id (with the defaults: the first three special tokens
    registered when the tokenizer was trained) are never masked.

    The input is left untouched: masking is applied to a clone, so the caller
    can keep using the original tensor as labels.

    Args:
        tensor: integer tensor of token ids (any shape).
        mask_prob: per-token masking probability.
        mask_token_id: id written into the selected positions.
        max_special_id: largest id that must never be masked.

    Returns:
        A new tensor of the same shape and dtype with the selected ids replaced.
    """
    masked = tensor.clone()
    # draw the random numbers on the input's device so the comparison below
    # also works for tensors that already live on the GPU
    rand = torch.rand(tensor.shape, device=tensor.device)
    # candidate positions: random draw below the threshold AND not a protected id
    mask_arr = (rand < mask_prob) & (tensor > max_special_id)
    # boolean indexing handles every row at once
    masked[mask_arr] = mask_token_id
    return masked

In [36]:
from tqdm.auto import tqdm

# Per-file batches of masked inputs, attention masks and unmasked targets.
input_ids = []
mask = []
labels = []

for path in tqdm(paths[:5]):
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    # one row per line, every row padded/truncated to 512 ids
    sample = tokenizer(lines, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
    # Snapshot the clean ids as labels BEFORE any masking happens. Appending
    # sample.input_ids itself would store the very tensor that gets masked below,
    # so the targets would contain <mask> ids instead of the original tokens.
    labels.append(sample.input_ids.clone())
    mask.append(sample.attention_mask)
    # mask a private copy, so the result does not depend on whether mlm works in place
    input_ids.append(mlm(sample.input_ids.clone()))

  0%|          | 0/5 [00:00<?, ?it/s]

In [37]:
# Join the per-file batches along the first axis into one tensor each.
# Each name is rebound here from a list of tensors to a single tensor.
labels = torch.cat(labels, dim=0)
mask = torch.cat(mask, dim=0)
input_ids = torch.cat(input_ids, dim=0)

In [38]:
# Bundle the three tensors under the keyword names the model call expects.
encodings = dict(input_ids=input_ids, attention_mask=mask, labels=labels)

In [39]:
class Dataset(torch.utils.data.Dataset):
    """Dataset view over a dict of indexable values, serving one row per index."""

    def __init__(self, encodings):
        # keep a reference to the dict as-is; nothing is copied
        self.encodings = encodings

    def __len__(self):
        # sample count = size of the first axis of the 'input_ids' entry
        ids = self.encodings['input_ids']
        return ids.shape[0]

    def __getitem__(self, i):
        # build the i-th example by indexing every stored entry with i,
        # keeping the keys of the encodings dict
        item = {}
        for name, values in self.encodings.items():
            item[name] = values[i]
        return item

In [40]:
# Wrap the encodings dict in the Dataset class defined in the previous cell.
dataset = Dataset(encodings)

In [41]:
# Serve the dataset in shuffled mini-batches of 16 samples.
# NOTE(review): no random seed is set in the visible cells, so the shuffle order
# will presumably differ between runs.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [42]:
from transformers import RobertaConfig

# Architecture of the model that is built from scratch in the next cell.
config = RobertaConfig(
    vocab_size=30_522,  # we align this to the tokenizer vocab_size
    max_position_embeddings=514,  # presumably 512 tokens + 2 reserved position slots — TODO confirm
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [43]:
from transformers import RobertaForMaskedLM

# Build the masked-LM model from the config alone; no from_pretrained call is made here.
model = RobertaForMaskedLM(config)

In [44]:
# Prefer the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Transfer the model to that device; as the last expression, the cell echoes the model.
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [45]:
import torch

# transformers.AdamW is deprecated (and no longer shipped by recent transformers
# releases); torch.optim.AdamW is the supported replacement.

# activate training mode
model.train()
# initialize optimizer
# eps and weight_decay are spelled out with the defaults of the old transformers
# implementation, because torch's own defaults differ (eps=1e-8, weight_decay=0.01).
optim = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)



In [47]:
epochs = 1

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        # reset the gradients accumulated by the previous step
        optim.zero_grad()
        # Move the batch to the training device. The batch tensors get their own
        # names on purpose: reusing `input_ids` / `labels` would overwrite the
        # notebook-level full-corpus tensors, and re-running the cell that builds
        # `encodings` after training would then silently use a single batch.
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)
        # forward pass; passing labels makes the model return the loss as well
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask,
                        labels=batch_labels)
        # extract loss
        loss = outputs.loss
        # backpropagate: compute gradients for every parameter that requires them
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/3125 [00:00<?, ?it/s]

RuntimeError: ignored