In [None]:
from transformers import BertTokenizer

# Pretrained `bert-base-uncased` tokenizer. This single instance is shared by
# the dataset, the data collator, the model config and the final save step.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
)

In [None]:
def load_dataset(file_path):
    """Tokenize a text file one line at a time.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        list: One element per line of the file, in file order. Each element
        is whatever the module-level ``tokenizer.encode`` returns for that
        line. Lines are passed through unchanged (trailing newline included)
        and blank lines are not filtered out.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit encoding: without it open() uses the platform's locale default,
    # so the same file could decode differently (or fail) on another machine.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterating the handle yields the same lines as readlines() without
        # first materialising a separate list of raw strings.
        return [tokenizer.encode(line) for line in file]

In [None]:
from transformers import LineByLineTextDataset

# Training corpus: the handbook text file, tokenized with the shared tokenizer.
# NOTE(review): block_size presumably caps the token length of each example,
# and LineByLineTextDataset appears to be deprecated in recent transformers
# releases in favour of the `datasets` library — confirm against the
# installed version.
dataset = LineByLineTextDataset(
    file_path="axioned_handbook.txt",
    tokenizer=tokenizer,
    block_size=128,
)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator configured for masked-language-modelling (mlm=True) with a
# masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [None]:
from transformers import BertConfig, BertForMaskedLM

# Size the model's vocabulary to match the tokenizer; every other BertConfig
# field keeps its library default.
config = BertConfig(vocab_size=tokenizer.vocab_size)

# Built from a config object rather than via `from_pretrained`, so no
# pretrained weights are loaded in this cell.
model = BertForMaskedLM(config)

In [None]:
from transformers import Trainer, TrainingArguments

# Training run settings, grouped by concern.
training_args = TrainingArguments(
    # Where outputs go.
    output_dir="./models",
    overwrite_output_dir=True,
    # Schedule and batch size.
    num_train_epochs=1,
    per_device_train_batch_size=64,
    # Checkpointing.
    save_steps=10_000,
    save_total_limit=2,
)

# Wire the model, dataset and MLM collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
# Run the training loop configured above.
trainer.train()

# Save the model first, then the tokenizer, into the same directory so the
# pair can be reloaded together.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./models/axioned_handbook_model")  # Replace with your preferred directory

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Must be the same directory the training cell passed to `save_pretrained`
# ("./models/...", plural); "./model/..." is never written by this notebook,
# so loading from it fails.
model_path = "./models/axioned_handbook_model"

# NOTE(review): the checkpoint saved after training is a BertForMaskedLM.
# Loading it through AutoModelForSequenceClassification presumably leaves the
# classification head untrained, so predictions from it would not be
# meaningful without fine-tuning — confirm this is intended.
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)


In [None]:
import torch  # needed here for no_grad; this cell runs before the notebook's other `import torch`

# Classify a single sample sentence with the reloaded model.
text = "This is a sample text for classification."
inputs = loaded_tokenizer(text, return_tensors="pt")

# Inference only: disable gradient tracking so the forward pass does not
# build an autograd graph.
with torch.no_grad():
    outputs = loaded_model(**inputs)
logits = outputs.logits

# Reduce over the last axis explicitly instead of over the flattened tensor
# (assumes logits is (batch, num_labels) — TODO confirm). `.item()` requires
# exactly one element, which holds because a single text was passed.
predicted_class = logits.argmax(dim=-1).item()

In [None]:
import torch

# Turn the logits from the preceding inference cell into probabilities.
# `outputs` (plural) is the variable that cell defines; `output` only exists
# after a later cell runs, so using it here raises NameError on a fresh
# Restart & Run All.
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

# Index of the highest-probability entry along the last axis. This stays a
# tensor; no `.item()` is applied.
predicted_class = torch.argmax(probabilities, dim=-1)


In [None]:
# Smoke test: encode a short sentence with the reloaded tokenizer, run it
# through the reloaded model and print the raw model output.
# (An earlier variant of this cell used the stock `bert-base-uncased`
# BertTokenizer/BertModel instead of the reloaded pair.)
text = "Hello, how are you?"
encoded_input = loaded_tokenizer(
    text,
    return_tensors="pt",
)
output = loaded_model(**encoded_input)
print(output)