In [19]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
import torch
import os
os.environ["WANDB_DISABLED"] = "true"

In [20]:
print("Using device:", "cuda" if torch.cuda.is_available() else "cpu")

Using device: cuda


In [21]:
# Load Dataset
dataset = load_dataset("json", data_files="all.json")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['link', 'score', 'part1', 'mature', 'author', 'part2'],
        num_rows: 1000
    })
})


In [22]:
# Load Tokenizer + Model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.eos_token_id



In [23]:
# Tokenization
def tokenize(batch):
    full_texts = [str(p1) + " " + str(p2) for p1, p2 in zip(batch["part1"], batch["part2"])]
    tokens = tokenizer(
        full_texts,
        truncation=True,
        padding="max_length",
        max_length=128
    )

    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
tokenized_dataset = dataset.map(tokenize, batched=True)


In [24]:
#Training Arguments
training_args = TrainingArguments(
    output_dir="distilgpt2-trained",
    per_device_train_batch_size=2,
    num_train_epochs=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=20,
    save_steps=200,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [25]:
#Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)

In [26]:
#Train
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
20,2.5502
40,1.745
60,1.9106
80,1.4078
100,2.2643
120,1.7655
140,1.6002
160,1.6598
180,1.9585
200,1.7154


TrainOutput(global_step=4000, training_loss=1.2361337320804595, metrics={'train_runtime': 514.1737, 'train_samples_per_second': 15.559, 'train_steps_per_second': 7.779, 'total_flos': 261296750592000.0, 'train_loss': 1.2361337320804595, 'epoch': 8.0})

In [27]:
#Save Model + Tokenizer
trainer.save_model("distilgpt2-finetuned")
tokenizer.save_pretrained("distilgpt2-finetuned")

print("Training completed successfully!")

Training completed successfully!


In [33]:
#Generate Text
from transformers import pipeline

generator_ft = pipeline(
    "text-generation",
    model="distilgpt2-finetuned",
    tokenizer="distilgpt2-finetuned",
    device=0
)
prompt = "Generate a joke :\n"


output = generator_ft(
    prompt,
    max_new_tokens=90,
    do_sample=True,
    top_k=90,
    top_p=0.9,
    temperature=1.0
)

print(output[0]["generated_text"])




Device set to use cuda:0


Generate a joke :
I was born into a family of factory workers. I was told that it was impossible to find an acceptable weight class.    A doctor told me that many young people were born into a family of factory workers and that it was impossible to find an acceptable weight class.    I learnt that many children did not get the correct answer.  


In [35]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

#  Original model
tokenizer_orig = AutoTokenizer.from_pretrained("distilgpt2")
model_orig = AutoModelForCausalLM.from_pretrained("distilgpt2")
generator_orig = pipeline("text-generation", model=model_orig, tokenizer=tokenizer_orig, device=0)

# Fine-tuned model
tokenizer_finetuned = AutoTokenizer.from_pretrained("distilgpt2-finetuned")
model_finetuned = AutoModelForCausalLM.from_pretrained("distilgpt2-finetuned")
generator_finetuned = pipeline("text-generation", model=model_finetuned, tokenizer=tokenizer_finetuned, device=0)

# Prompts
prompts = [
    "Tell me a joke:"
]

# Compare
print("=== Original DistilGPT-2 Responses ===")
for prompt in prompts:
    output = generator_orig(prompt, max_new_tokens=50, do_sample=True, top_k=50)
    print(f"Prompt: {prompt}\nResponse: {output[0]['generated_text']}\n")

print("=== Fine-tuned DistilGPT-2 Responses ===")
for prompt in prompts:
    output = generator_finetuned(prompt, max_new_tokens=50, do_sample=True, top_k=50)
    print(f"Prompt: {prompt}\nResponse: {output[0]['generated_text']}\n")


Device set to use cuda:0
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


=== Original DistilGPT-2 Responses ===
Prompt: Tell me a joke:
Response: Tell me a joke: I'm a joke.

So, this is what you're doing. I've been in the public eye for years, and I'm trying to figure out what it takes to succeed at my job. I've had a fairly strong job for

=== Fine-tuned DistilGPT-2 Responses ===
Prompt: Tell me a joke:
Response: Tell me a joke: How did a frog learn how to swim?  frog: What frog did it tell him?  frog:  frog: No, no one knows how to swim.  frog: Why?  frog: Why?  frog: Why? 

