In [None]:
# Change the kernel's working directory so that the `src.config` import and the
# relative "data/models/..." paths used further down resolve.
# NOTE(review): hard-coded absolute, user-specific path — presumably the project
# root on the author's cluster; adjust before running anywhere else.
%cd /gxfs_work/cau/sunms534/trading_bot/

In [None]:
from pathlib import Path

import pandas as pd
import torch
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import RobertaProcessing
from transformers import AutoTokenizer
from transformers import PreTrainedTokenizerFast

from src.config import config

In [None]:
# Load only the "parsed_body" column of the stripped news dataset; this is the
# text corpus the tokenizer is trained on.
bodies = pd.read_parquet(
    config.data.news.stripped,
    columns=["parsed_body"],
)

In [None]:
# Train a lowercasing byte-level BPE tokenizer on the news bodies.
# The special tokens are listed in RoBERTa order; the RobertaConfig cell later
# in this notebook hard-codes ids 0/1/2 for <s>/<pad>/</s>, which presumably
# relies on the trainer assigning ids in exactly this order — TODO confirm.
tokenizer = ByteLevelBPETokenizer(lowercase=True)

tokenizer.train_from_iterator(
    bodies["parsed_body"].values,
    vocab_size=30000,
    min_frequency=10,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
# Attach RoBERTa-style post-processing using the ids the trained vocabulary
# assigned to "<s>" (cls) and "</s>" (sep), then cap every encoding at 256 tokens.
cls_token, sep_token = "<s>", "</s>"

tokenizer.post_processor = RobertaProcessing(
    sep=(sep_token, tokenizer.token_to_id(sep_token)),
    cls=(cls_token, tokenizer.token_to_id(cls_token)),
)
tokenizer.enable_truncation(max_length=256)

In [None]:
# Wrap the trained backend tokenizer in a Hugging Face fast tokenizer and save it
# to disk so it can be reloaded with `AutoTokenizer.from_pretrained`.
#
# Every special token used during training is registered explicitly. Without
# this the saved tokenizer only knows its pad token, and masked-language-model
# tooling such as `DataCollatorForLanguageModeling` refuses a tokenizer that has
# no mask token.
# `model_max_length` records the 256-token limit in the saved config; truncation
# itself is requested per call (`truncation=True`), not in the constructor.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer._tokenizer,  # underlying `tokenizers.Tokenizer`
    bos_token="<s>",
    eos_token="</s>",
    cls_token="<s>",
    sep_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
    model_max_length=256,
)
tokenizer.save_pretrained("data/models/newstokenizer")

In [None]:
# Reload the saved tokenizer from disk to check that the round trip works.
# `model_max_length` is the documented keyword for the sequence-length limit;
# the previously used `max_len` is only a legacy alias.
tokenizer = AutoTokenizer.from_pretrained(
    "data/models/newstokenizer",
    model_max_length=256,
)

In [None]:
# Pick one article body (positional row 10) as a sample input for the tokenizer.
text = bodies["parsed_body"].iloc[10]

In [None]:
# Encode the sample article: add special tokens, truncate or pad to exactly
# 256 tokens, and return PyTorch tensors together with the attention mask.
encoding = tokenizer(
    text,
    max_length=256,
    padding="max_length",
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors="pt",
)

In [None]:
# Display the token ids produced for the sample article (requested as PyTorch
# tensors, padded/truncated to max_length=256 in the tokenizer call).
encoding["input_ids"]

In [None]:
# Display the attention mask returned alongside the token ids
# (presumably 1 for real tokens and 0 for padding — confirm against the output).
encoding["attention_mask"]

In [None]:
# Load the stored input-id and mask tables from the paths given in the project
# config (presumably pre-tokenized news titles, going by the variable name).
title_inputs_ids, masks = (
    pd.read_parquet(path)
    for path in (config.data.news.input_ids, config.data.news.masks)
)

In [None]:
from transformers import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaTokenizerFast
from transformers import DataCollatorForLanguageModeling

# Configuration of a small RoBERTa encoder sized for the custom news tokenizer.
configuration = RobertaConfig(
    # Vocabulary / special-token ids — must agree with the trained tokenizer.
    vocab_size=30000,
    type_vocab_size=2,
    pad_token_id=1,
    bos_token_id=0,
    eos_token_id=2,
    # Transformer dimensions.
    hidden_size=256,
    num_hidden_layers=6,
    num_attention_heads=4,
    intermediate_size=1556,
    hidden_act="gelu",
    # Positions: presumably 256 tokens plus RoBERTa's position offset of
    # pad_token_id + 1 — TODO confirm.
    max_position_embeddings=258,
    position_embedding_type="absolute",
    # Regularisation and initialisation.
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    classifier_dropout=None,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    use_cache=True,
)

# Build the encoder from the config alone, without loading any checkpoint.
model = RobertaModel(configuration)

In [None]:
# Inspect the first row of the input-id table.
title_inputs_ids.iloc[0]

In [None]:
# Rows 1-4 of the input-id table as a raw NumPy array (the batch fed to the model below).
title_inputs_ids.iloc[1:5].values

In [None]:
# Smoke test: run rows 1-4 of the stored input ids and their masks through the
# model and display the raw output.
model(
    input_ids=torch.tensor(title_inputs_ids.iloc[1:5].values),
    attention_mask=torch.tensor(masks.iloc[1:5].values),
)