In [None]:
# Install dependencies
!pip install transformers datasets
!pip install --upgrade transformers




In [None]:
from google.colab import files
uploaded = files.upload()

# Load dataset
with open("motivational_speeches.txt", "r", encoding="utf-8") as f:
    data = f.read().split("\n\n")  # Separate paragraphs

# Show some examples
print(data[:3])

# Prepare Hugging Face Dataset
from datasets import Dataset
dataset = Dataset.from_dict({"text": data})
dataset = dataset.train_test_split(test_size=0.1)

# Load GPT-2 and tokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add padding token
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# Tokenize dataset (FIXED - now includes labels)
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    tokenized["labels"] = tokenized["input_ids"].copy()  # Critical for loss calculation
    return tokenized

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Configure W&B (Optional - remove if not needed)
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable if you don't want logging

# Training Arguments (Updated)
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    run_name="gpt2-motivational",  # Custom run name for W&B
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to="wandb" if os.environ.get("WANDB_DISABLED") != "true" else None,  # Auto-handles W&B
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Train the model
trainer.train()

# Generate motivational text
from transformers import pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

results = generator(
    "You have the power to",
    max_length=80,
    num_return_sequences=3,
    do_sample=True,
    temperature=0.9,
    top_p=0.95
)

for i, r in enumerate(results):
    print(f"Text {i+1}: {r['generated_text']}\n")


# Save the model
model.save_pretrained("./motivational-model")
tokenizer.save_pretrained("./motivational-model")



Saving motivational_speeches.txt to motivational_speeches (5).txt
['Believe in yourself and all that you are. Know that there is something inside you that is greater than any obstacle.', 'Success is not final, failure is not fatal: It is the courage to continue that counts.', "Your time is limited, so don't waste it living someone else's life. Don't be trapped by dogma – which is living with the results of other people's thinking."]


Map:   0%|          | 0/27 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,2.3444,0.382211
2,0.4142,0.348856
3,0.3098,0.338376


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Text 1: You have the power to create and maintain life."

Text 2: You have the power to change.

Text 3: You have the power to do something, you have the motivation to do something.



('./motivational-model/tokenizer_config.json',
 './motivational-model/special_tokens_map.json',
 './motivational-model/vocab.json',
 './motivational-model/merges.txt',
 './motivational-model/added_tokens.json')