In [18]:
# load transformers and pandas
file_path = './artwar.1b.txt'

with open(file_path, 'r') as f:
    book_text = f.read()
    book_text = book_text.lower()
    
book_text

'the art of war\nby sun tzu\n\n\ntranslated by lionel giles\n\ni. laying plans\n\n1. sun tzu said: the art of war is of vital importance to the state.\n\n2. it is a matter of life and death, a road either to safety or to\nruin. hence it is a subject of inquiry which can on no account be\nneglected. \n\n3. the art of war, then, is governed by five constant factors, to\nbe taken into account in one\'s deliberations, when seeking to determine\nthe conditions obtaining in the field. \n\n4. these are: (1) the moral law; (2) heaven; (3) earth; (4) the commander;\n(5) method and discipline. \n\n5,6. the moral law causes the people to be in complete accord with\ntheir ruler, so that they will follow him regardless of their lives,\nundismayed by any danger. \n\n7. heaven signifies night and day, cold and heat, times and seasons.\n\n8. earth comprises distances, great and small; danger and security;\nopen ground and narrow passes; the chances of life and death.\n\n9. the commander stands for the

In [26]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('gpt2')  # Make sure to use the correct model name
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Set the padding token if not already defined
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize the text
tokens = tokenizer(book_text, truncation=True, padding='max_length', return_tensors="pt")

# Make sure to set the padding token
tokens['labels'] = tokens['input_ids'].detach().clone()

# Convert the dictionary of tensors into a Hugging Face Dataset
dataset = Dataset.from_dict(tokens)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Reduced batch size
    gradient_accumulation_steps=2,  # Use gradient accumulation
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    no_cuda=True,  # Use this line to force training on the CPU
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # evaluation_dataset=val_dataset,  # Add your validation dataset if you have one
)

# Train the model
trainer.train()

# Save the model and the tokenizer
model.save_pretrained('./my_art_of_war_model')
tokenizer.save_pretrained('./my_art_of_war_model')

Using pad_token, but it is not set yet.


  0%|          | 0/3 [00:00<?, ?it/s]

{'train_runtime': 47.5225, 'train_samples_per_second': 0.063, 'train_steps_per_second': 0.063, 'train_loss': 1.8302113215128581, 'epoch': 3.0}


('./my_art_of_war_model/tokenizer_config.json',
 './my_art_of_war_model/special_tokens_map.json',
 './my_art_of_war_model/vocab.json',
 './my_art_of_war_model/merges.txt',
 './my_art_of_war_model/added_tokens.json',
 './my_art_of_war_model/tokenizer.json')

In [32]:
from transformers import pipeline, set_seed

# Load your fine-tuned model
model_path = './my_art_of_war_model'
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Create a text generation pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Prompt the model with a starting sentence
prompt = "Local spies are according to Sun Tzu"

# Generate text
set_seed(42)  # Set a random seed for reproducibility
generated_text = generator(prompt, max_length=100, num_return_sequences=1)

print(generated_text[0]['generated_text'])


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Local spies are getting closer to destroying you."

Kuroshirai watched as the pair of them walked back to the building and were waiting for the police to arrive.

The police then began to arrest them as well, but were prevented from even contacting any of them over the telephone. Despite only having made up four words to them, each of the policemen took their time explaining to the others what this incident was all about, and how the police were now in the middle of getting
