In [None]:
import json
import re

import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    TextDataset,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [None]:
# Identifier handed to AutoTokenizer.from_pretrained when the tokenizer is built.
token_pretrained = "gpt2"
# Identifier of the pretrained checkpoint that is loaded and then fine-tuned.
model_pretrained = "robowaifudev/megatron-gpt2-345m"

In [None]:
# Tokenizer loaded from the identifier stored in `token_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(token_pretrained)

# Training corpus location (relative path, resolved against the working directory).
# NOTE(review): the file has a .csv extension but is passed to TextDataset as-is;
# presumably it is treated as plain text -- confirm there is no header/column
# structure that should be stripped first.
train_path = 'clean-data/clean-4-14-23.csv'

In [None]:
def load_dataset(train_path, tokenizer, block_size=64):
    """Build the language-model training dataset and its batch collator.

    Args:
        train_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer shared by the dataset and the collator.
        block_size: Value forwarded to ``TextDataset(block_size=...)``.
            Defaults to 64, the value that used to be hard-coded here, so
            existing two-argument calls behave exactly as before.

    Returns:
        tuple: ``(train_dataset, data_collator)``.
    """
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=block_size,
    )

    # mlm=False: collate for causal (next-token) LM rather than masked LM.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, data_collator

In [None]:
# Build the dataset/collator pair from the corpus at `train_path` using the shared tokenizer.
train_dataset, data_collator = load_dataset(train_path, tokenizer)

In [None]:
# TF32 matmuls require an Ampere-or-newer GPU (compute capability >= 8.0).
# TrainingArguments(tf32=True) raises on anything else, so only request TF32
# when the hardware can actually provide it; on capable GPUs nothing changes.
use_tf32 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
torch.backends.cuda.matmul.allow_tf32 = use_tf32

# AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is the
# designated replacement for decoder-only (GPT-2 style) checkpoints.
model = AutoModelForCausalLM.from_pretrained(model_pretrained, torch_dtype=torch.float32)

training_args = TrainingArguments(
    output_dir="./model-output",
    overwrite_output_dir=True,
    num_train_epochs=8,
    per_device_train_batch_size=12,
    save_steps=800,  # checkpoint interval, in optimizer steps
    tf32=use_tf32,
    warmup_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [None]:
# Run fine-tuning with the settings in `training_args`; being the last expression
# of the cell, the call's return value is what the notebook displays.
# NOTE(review): no explicit save of the final weights is visible after this call --
# confirm the periodic checkpoints under ./model-output are sufficient.
trainer.train()

In [None]:
input_text = 'i am going to double your'

# Trainer.train() leaves the model in training mode; switch to eval so dropout
# is disabled while sampling.
model.eval()

# Tokenize once, keep the attention mask, and move the tensors to whichever
# device the model is on instead of assuming "cuda". The prompt string keeps
# its own name rather than being overwritten by the tensor.
encoded_prompt = tokenizer(input_text, return_tensors='pt').to(model.device)
prompt_ids = encoded_prompt["input_ids"]

# `max_length` is measured in tokens and includes the prompt, so size it from
# the prompt's token count (not its whitespace word count): up to as many new
# tokens as the prompt already has.
generation_length = prompt_ids.shape[1] * 2

print(generation_length)

# `output_scores` is omitted: without `return_dict_in_generate=True` the scores
# are never returned, and only the generated ids are used below.
generated_ids = model.generate(
    input_ids=prompt_ids,
    attention_mask=encoded_prompt["attention_mask"],
    max_length=generation_length,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    num_return_sequences=1,
    # GPT-2 tokenizers define no pad token; pass EOS explicitly so generate()
    # does not have to guess one.
    pad_token_id=tokenizer.eos_token_id,
)

response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# ANSI escape codes: print the completion in red, then reset the colour.
print(f"\033[91m{response}\033[00m")