In [1]:
# Install the transformers and datasets libraries
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [4]:
import torch
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
    set_seed
)
from datasets import load_dataset

In [25]:
# Specify the pre-trained model name
model_name = 'gpt2'

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add a new pad_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})



# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained(model_name)


# Resize the model's embeddings to accommodate the new token
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 768)

In [26]:
from datasets import load_dataset

# Load the DailyDialog dataset
dataset = load_dataset('daily_dialog',trust_remote_code=True)

In [27]:
# Print out the first example
print(dataset['train'][0])

{'dialog': ['Say , Jim , how about going for a few beers after dinner ? ', ' You know that is tempting but is really not good for our fitness . ', ' What do you mean ? It will help us to relax . ', " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ", " I guess you are right.But what shall we do ? I don't feel like sitting at home . ", ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ', " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ", ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ', " Good.Let ' s go now . ", ' All right . '], 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4], 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]}


In [28]:
# Concatenate the utterances in each dialogue
def concatenate_dialogues(example):
    return {'text': ' '.join(example['dialog'])}

# Apply the function to the dataset
concatenated_dataset = dataset.map(concatenate_dialogues, remove_columns=['dialog', 'act', 'emotion'])

In [29]:
# Tokenize the text data
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=512)

tokenized_datasets = concatenated_dataset.map(tokenize_function, batched=True, remove_columns=['text'])

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [30]:
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 11118
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1000
    })
})


In [31]:
print(tokenized_datasets['train'][0])

{'input_ids': [25515, 837, 5395, 837, 703, 546, 1016, 329, 257, 1178, 16800, 706, 8073, 5633, 220, 220, 921, 760, 326, 318, 29850, 475, 318, 1107, 407, 922, 329, 674, 13547, 764, 220, 220, 1867, 466, 345, 1612, 5633, 632, 481, 1037, 514, 284, 8960, 764, 220, 220, 2141, 345, 1107, 892, 523, 5633, 314, 836, 470, 764, 632, 481, 655, 787, 514, 3735, 290, 719, 14397, 764, 11436, 938, 640, 5633, 220, 220, 314, 4724, 345, 389, 826, 13, 1537, 644, 2236, 356, 466, 5633, 314, 836, 470, 1254, 588, 5586, 379, 1363, 764, 220, 220, 314, 1950, 257, 2513, 625, 284, 262, 11550, 810, 356, 460, 711, 33041, 506, 290, 1826, 617, 286, 674, 2460, 764, 220, 220, 1320, 338, 257, 922, 2126, 764, 314, 3285, 5335, 290, 25737, 1690, 467, 612, 284, 711, 29400, 79, 506, 13, 13710, 356, 460, 787, 257, 1440, 11246, 351, 606, 764, 220, 220, 27107, 1049, 284, 502, 5145, 1002, 484, 389, 4684, 837, 356, 714, 1265, 606, 284, 467, 15360, 351, 514, 13, 2504, 318, 6275, 5517, 290, 1257, 837, 1165, 764, 220, 220, 4599, 13, 575

In [32]:
block_size = 128

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Calculate total length divisible by block_size
    total_length = len(concatenated_examples['input_ids'])
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size
    result = {
        k: [t[i:i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    #Setting these two equal because the model internally handles the shifting of the tokens
    result['labels'] = result['input_ids'].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)

Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [33]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    overwrite_output_dir=True,       # Overwrite the content of the output directory
    num_train_epochs=1,              # Number of training epochs
    per_device_train_batch_size=4,   # Batch size per device during training
    save_steps=10_000,               # Save checkpoint every 10,000 steps
    save_total_limit=2,              # Limit the total amount of checkpoints
    prediction_loss_only=True,       # Only calculate loss during evaluation
    logging_steps=500,               # Log every 500 steps
    report_to=["none"],
)

In [34]:
# Create a data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets['train'],
)

In [35]:
# Start training
trainer.train()

Step,Training Loss
500,3.5412
1000,2.7974
1500,2.7226
2000,2.6955
2500,2.6723


TrainOutput(global_step=2780, training_loss=2.8612564416240445, metrics={'train_runtime': 495.3548, 'train_samples_per_second': 22.445, 'train_steps_per_second': 5.612, 'total_flos': 726261202944000.0, 'train_loss': 2.8612564416240445, 'epoch': 1.0})

In [36]:
# Save the fine-tuned model
trainer.save_model('./fine-tuned-model')

# Save the tokenizer
tokenizer.save_pretrained('./fine-tuned-model')

('./fine-tuned-model/tokenizer_config.json',
 './fine-tuned-model/special_tokens_map.json',
 './fine-tuned-model/vocab.json',
 './fine-tuned-model/merges.txt',
 './fine-tuned-model/added_tokens.json')

In [41]:
set_seed(42)

device = 0 if torch.cuda.is_available() else -1

# Initialize the original GPT-2 model
original_generator = pipeline(
    'text-generation',
    model='gpt2',
    tokenizer='gpt2',
    device=device
)

# Initialize the fine-tuned model
fine_tuned_generator = pipeline(
    'text-generation',
    model='./fine-tuned-model',
    tokenizer='./fine-tuned-model',
    device=device
)


# Provide a prompt to the model
prompt = "What do you think about"

# Generate text with the original GPT-2
original_generated = original_generator(prompt, max_length=100, num_return_sequences=1, truncation=True)

# Generate text with the fine-tuned model
fine_tuned_generated = fine_tuned_generator(prompt, max_length=100, num_return_sequences=1, truncation=True)

# Print the outputs
print("Original GPT-2 Output:")
print(original_generated[0]['generated_text'])

print("\nFine-Tuned GPT-2 Output:")
print(fine_tuned_generated[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original GPT-2 Output:
What do you think about these new discoveries that Dr. Sankara had suggested would lead to the end of human human involvement in space exploration?" (The New Yorker, 20 June 2015) That was also when Karpovsky decided to tell us more.

In 2006, he announced that "Science and the Universe — and the Universe in general" had reached the point where it could be used as a "scientific narrative." He was writing that space exploration — from where we are today — has

Fine-Tuned GPT-2 Output:
What do you think about taking your English class again? Will students find it more effective?   You can take this class, but you won't necessarily enjoy your credit. You'll only be able to take English classes.   Maybe I'll take classes like that. It's hard, actually. I know there's an English teacher and I want to try it on, but... Let's discuss next Monday.   Yeah, I'll wait for it, we'll get there at 20
