In [None]:
import torch
from transformers import BertTokenizer
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import BertConfig, BertForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# --- Tokenizer ---------------------------------------------------------------
# Reuse the vocabulary shipped with the published bert-base-uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# --- Training corpus ---------------------------------------------------------
# Build the MLM training set from the handbook text file, tokenised with the
# tokenizer above and using a block size of 128.
dataset = LineByLineTextDataset(
    file_path="axioned_handbook.txt",
    block_size=128,
    tokenizer=tokenizer,
)

# --- Batch collation ---------------------------------------------------------
# Masked-language-model collator with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# --- Model -------------------------------------------------------------------
# Only vocab_size is set explicitly (taken from the tokenizer); every other
# BertConfig field keeps its library default.
config = BertConfig(vocab_size=tokenizer.vocab_size)

# The model is constructed from the config alone -- there is no
# from_pretrained() call here, so no pretrained weights are loaded.
model = BertForMaskedLM(config=config)

# --- Trainer -----------------------------------------------------------------
# One epoch, batches of 64 per device; a checkpoint is written to ./models
# every 10,000 steps and at most two checkpoints are kept.
training_args = TrainingArguments(
    output_dir="./models",
    overwrite_output_dir=True,
    per_device_train_batch_size=64,
    num_train_epochs=1,
    save_total_limit=2,
    save_steps=10_000,
)

# Wire model, data, collator and arguments together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:

# Run the training loop configured on `trainer`.
trainer.train()

# Write the model first, then the tokenizer, into the same directory so the
# pair can be reloaded together from a single path.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./models/axioned_handbook_model")

In [None]:

# Reload the masked-language model and tokenizer written by the training cell.
# The directory must be the one passed to save_pretrained() ("./models/...",
# not "./model/..."). The checkpoint is loaded with BertForMaskedLM because it
# was trained with that class; loading it as a sequence-classification model
# would drop the trained MLM head, and that model class offers no usable
# generate() for text continuation.
model_path = "./models/axioned_handbook_model"
loaded_model = BertForMaskedLM.from_pretrained(model_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
loaded_model.eval()  # inference mode: disables dropout

# Prompt to extend
prompt = "2 days a week in office"
# With return_tensors="pt" this is a (1, seq_len) tensor of token ids; the BERT
# tokenizer is expected to wrap the prompt as [CLS] ... [SEP].
input_ids = loaded_tokenizer.encode(prompt, return_tensors="pt")

# BERT is encoder-only, so greedy generation is emulated with its MLM head:
# insert a [MASK] just before the trailing [SEP], take the most likely token
# for that position, and repeat until the sequence reaches max_length tokens.
max_length = 10  # Upper bound on total sequence length, special tokens included
mask_token = torch.tensor([[loaded_tokenizer.mask_token_id]])
output = input_ids
with torch.no_grad():  # no gradients are needed for inference
    while output.shape[1] < max_length:
        head, tail = output[:, :-1], output[:, -1:]
        masked = torch.cat([head, mask_token, tail], dim=1)
        logits = loaded_model(input_ids=masked).logits
        # Position -2 is the [MASK] that was just inserted before the final token.
        next_token = logits[0, -2].argmax().reshape(1, 1)
        output = torch.cat([head, next_token, tail], dim=1)

generated_text = loaded_tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:", generated_text)


In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Stand-alone example using the published bert-base-uncased checkpoint.
# NOTE: this rebinds `tokenizer` and `model`, replacing the objects that the
# first cell assigned to those names.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True)

# Token ids for one sentence, returned as a PyTorch tensor.
input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]

# The unmasked input ids are also passed as the labels, so the returned loss
# is computed against the input sentence itself.
outputs = model(input_ids, labels=input_ids)
loss, prediction_logits = outputs.loss, outputs.logits