# Train a tokenizer

In [1]:
import pandas as pd

all_data = pd.read_csv("../lstm_lo_steps_prediction/data/steps_simple_term_str.csv", delimiter=',')

all_data = pd.DataFrame({"simple_terms": all_data["simple_terms"]})
all_data.to_csv("./fine_models/tokenizer_train_data.txt", header=False, index=False)

In [2]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path("./fine_models/.").glob("*.txt")]

# # Initialize a tokenizer
# tokenizer = ByteLevelBPETokenizer()
#
# # Customize training
# tokenizer.train(files=paths, vocab_size=4, min_frequency=100, special_tokens=[
#     "(",
#     ")",
#     "@x.",
#     "x",
#     "<s>",
#     "</s>",
# ])
#
# # Save files to disk
# tokenizer.save_model("./fine_models/", "sstr-term-bert")

In [3]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./fine_models/vocab.json",
    "./fine_models/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [4]:
tok_encs = tokenizer.encode("(@x. (x (@x. ((x (@x. (@x. x))) x))))".replace("@x.", "y"))
print(f"ids: {tok_encs.ids}")
print(f"type_ids: {tok_encs.type_ids}")
print(f"tokens: {tok_encs.tokens}")
print(f"offsets: {tok_encs.offsets}")
print(f"attention_mask: {tok_encs.attention_mask}")
print(f"special_tokens_mask: {tok_encs.special_tokens_mask}")
print(f"overflowing: {tok_encs.overflowing}")

ids: [6, 1, 3, 1, 4, 1, 3, 1, 1, 4, 1, 3, 1, 3, 4, 2, 2, 2, 4, 2, 2, 2, 2, 7]
type_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
tokens: ['<s>', '(', 'y', '(', 'x', '(', 'y', '(', '(', 'x', '(', 'y', '(', 'y', 'x', ')', ')', ')', 'x', ')', ')', ')', ')', '</s>']
offsets: [(0, 0), (0, 1), (1, 2), (2, 3), (4, 5), (5, 6), (7, 8), (8, 9), (8, 9), (11, 12), (12, 13), (14, 15), (15, 16), (17, 18), (18, 19), (20, 21), (21, 22), (22, 23), (23, 24), (25, 26), (26, 27), (27, 28), (28, 29), (0, 0)]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
special_tokens_mask: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
overflowing: []


# Train a language model

In [5]:
# Check that we have a GPU
# !nvidia-smi

Wed Oct  4 15:38:46 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 531.68                 Driver Version: 531.68       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                      TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 L...  WDDM | 00000000:01:00.0  On |                  N/A |
| N/A   40C    P8               14W /  N/A|    379MiB /  6144MiB |      6%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [6]:
# Check that PyTorch sees it
import torch

print(torch.cuda.is_available())
print(torch.version.cuda)

True
12.1


In [7]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=10,
    hidden_size=123,
    intermediate_size=128,
    max_position_embeddings=512,
    num_attention_heads=3,
    num_hidden_layers=3,
    type_vocab_size=1,
    classifier_dropout=0.1,
    num_labels=31,
)

# RobertaConfigs:

# vocab_size: Any = 50265,
# hidden_size: Any = 768,
# num_hidden_layers: Any = 12,
# num_attention_heads: Any = 12,
# intermediate_size: Any = 3072,
# hidden_act: Any = "gelu",
# hidden_dropout_prob: Any = 0.1,
# attention_probs_dropout_prob: Any = 0.1,
# max_position_embeddings: Any = 512,
# type_vocab_size: Any = 2,
# initializer_range: Any = 0.02,
# layer_norm_eps: Any = 1e-12,
# pad_token_id: int = 1,
# bos_token_id: int = 0,
# eos_token_id: int = 2,
# position_embedding_type: Any = "absolute",
# use_cache: Any = True,
# classifier_dropout: Any = None

In [8]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./fine_models", max_len=512)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification(config=config)

In [10]:
model.num_parameters()
# => 84 million parameters

363388

In [11]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from datasets import Dataset
import pandas as pd

In [13]:
all_data = pd.read_csv("../lstm_lo_steps_prediction/data/steps_simple_term_str.csv", delimiter=',')

# leave only unique terms
print(f"Count all terms: {len(all_data)}\n")
all_data = all_data.drop_duplicates(subset="simple_terms").reset_index(drop=True)
print(f"Count original terms: {len(all_data)}\n")

# shuffle the dataset
all_data = shuffle(all_data, random_state=33).reset_index(drop=True)

# filter out steps_lo that are more than 30
all_data = all_data[[steps_lo < 31 for steps_lo in all_data['steps_num_lo']]]

print(f"max steps count: {max(all_data['steps_num_lo'])}")
print(f"min steps count: {min(all_data['steps_num_lo'])}")

# make test/train split
x_train, x_test, y_train_, y_test_ = train_test_split(all_data["simple_terms"].tolist(),
                                                      all_data["steps_num_lo"].tolist(),
                                                      test_size=0.2, random_state=42)

print(f"Count training samples: {len(y_train_)}")
print(f"Count testing samples: {len(y_test_)}")

Count all terms: 4251

Count original terms: 4251

max steps count: 30
min steps count: 0
Count training samples: 2952
Count testing samples: 739


In [14]:
x_train = [x_.replace("@x.", "y") for x_ in x_train]
x_test = [x_.replace("@x.", "y") for x_ in x_test]

In [16]:
x_train[0]

'(((y x) (y (y x))) ((y (y ((x (y x)) x))) (y x)))'

In [17]:
train_df = pd.DataFrame({"term_str": x_train, "steps_lo": y_train_})
test_df = pd.DataFrame({"term_str": x_test, "steps_lo": y_test_})
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

def preprocess(example):
    # Tokenize the prompt
    tokenized_example = tokenizer(example['term_str'], truncation=True, padding='max_length', max_length=512)
    tokenized_example['label'] = example['steps_lo']
    return tokenized_example


tokenized_train_ds = train_ds.map(preprocess, batched=False,
                                  remove_columns=['term_str', 'steps_lo'])

tokenized_test_ds = test_ds.map(preprocess, batched=False,
                                remove_columns=['term_str', 'steps_lo'])

Map:   0%|          | 0/2952 [00:00<?, ? examples/s]

Map:   0%|          | 0/739 [00:00<?, ? examples/s]

In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./fine_models",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    save_steps=10_000,
    save_total_limit=2,
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_train_ds,
)

In [20]:
print(len(tokenized_test_ds['input_ids'][0]))
print(tokenized_test_ds['input_ids'][0])

512
[6, 1, 1, 3, 4, 2, 1, 3, 1, 3, 1, 1, 3, 4, 2, 1, 3, 1, 4, 1, 1, 1, 3, 1, 4, 1, 1, 3, 1, 4, 1, 3, 4, 2, 2, 2, 1, 3, 4, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 4, 2, 2, 2, 2, 2, 4, 2, 2, 1, 1, 3, 1, 4, 1, 1, 3, 4, 2, 1, 3, 4, 2, 2, 2, 2, 1, 3, 1, 4, 1, 4, 1, 3, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [21]:
print(len(tokenized_test_ds['attention_mask'][0]))
print(tokenized_test_ds['attention_mask'][0])

512
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [22]:
train_log = trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`