#  Loading Dataset


In [1]:
! git clone https://huggingface.co/datasets/stevez80/Sci-Fi-Books-gutenberg

Cloning into 'Sci-Fi-Books-gutenberg'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 12 (delta 2), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (12/12), 4.11 KiB | 842.00 KiB/s, done.


In [2]:
%cd /content/Sci-Fi-Books-gutenberg

/content/Sci-Fi-Books-gutenberg


In [3]:
import pandas as pd
from datasets import Dataset

# Load the CSV file into a pandas DataFrame
df = pd.read_csv("/content/Sci-Fi-Books-gutenberg/sci-fi-books.csv")

# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split the dataset
dataset = dataset.train_test_split(test_size=0.2)

#   Tokenize Data

In [4]:
from transformers import AutoTokenizer
from datasets import Dataset


tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token




def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )


tokenized = dataset.map(tokenize_function, batched=True, remove_columns=["id", "title", "author", "text"])


print(tokenized)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/2683 [00:00<?, ? examples/s]

Map:   0%|          | 0/671 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2683
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 671
    })
})


#  Load Quantized GPT-2 Medium

In [5]:
from transformers import AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(
    "gpt2-medium",
    device_map="auto",
    torch_dtype=torch.float16,
)

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

#    LoRA with PEFT

In [6]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model


model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.0

)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 4,325,376 || all params: 359,148,544 || trainable%: 1.2043




#  Training Setup

In [7]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./content",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    save_strategy="epoch",
    num_train_epochs=2,
    logging_dir="./logs",
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    remove_unused_columns=False,
    report_to="none",
)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Start Training

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,0.6554
1000,0.5066
1500,0.4799
2000,0.4443
2500,0.4391
3000,0.4372
3500,0.4236
4000,0.4108
4500,0.413
5000,0.3984


TrainOutput(global_step=5366, training_loss=0.4568673474016449, metrics={'train_runtime': 816.0208, 'train_samples_per_second': 6.576, 'train_steps_per_second': 6.576, 'total_flos': 5054708947353600.0, 'train_loss': 0.4568673474016449, 'epoch': 2.0})

In [9]:
import os
os.environ["WANDB_DISABLED"] = "true"


# Evaluate on the test/validation set

In [10]:
from transformers import Trainer

# Load model (if not already in memory)
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("gpt2-medium")
model.resize_token_embeddings(len(tokenizer))
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized["test"]
)

metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)


  trainer = Trainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Evaluation Metrics: {'eval_model_preparation_time': 0.0039, 'eval_runtime': 29.4894, 'eval_samples_per_second': 22.754, 'eval_steps_per_second': 2.848}


#  Save the fine-tuned model

In [11]:
model.save_pretrained("./finetuned-gpt2-medium")
tokenizer.save_pretrained("./finetuned-gpt2-medium")


('./finetuned-gpt2-medium/tokenizer_config.json',
 './finetuned-gpt2-medium/special_tokens_map.json',
 './finetuned-gpt2-medium/vocab.json',
 './finetuned-gpt2-medium/merges.txt',
 './finetuned-gpt2-medium/added_tokens.json',
 './finetuned-gpt2-medium/tokenizer.json')

#  Inference

In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("./finetuned-gpt2-medium")
tokenizer = AutoTokenizer.from_pretrained("./finetuned-gpt2-medium")
from transformers import pipeline

# Use text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text
prompt = "In a distant galaxy, a lady"
outputs = generator(prompt, max_length=100, num_return_sequences=1, temperature=0.8)

print(outputs[0]['generated_text'])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In a distant galaxy, a lady, the daughter of a prince, is called to the palace in an attempt to bring her husband back to civilization. Written by David P. Williams <davidp@lubris.com>
