In [None]:
# Prepare dataset for fine-tuning
from pathlib import Path

# Workspace files: the text read as input and the training file written from it.
input_path, output_path = (
    "/content/cleaned_text.txt",
    "/content/training_data.txt",
)

# Load and format the dataset
def prepare_dataset(input_file, output_file, questions=None):
    """Build a question/answer training file from a single source text.

    The whole content of ``input_file`` is used, verbatim, as the answer to
    every question. Each record is written as::

        Question: <question>
        Answer: <text>
        <blank line>

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 file to (over)write.
        questions: Optional iterable of question strings. A single string is
            treated as one question. When omitted, three generic
            summary-style questions are used.

    Returns:
        None. The result is the file written at ``output_file``.
    """
    if questions is None:
        questions = [
            "What is the main idea of the content?",
            "Explain the key points from the text.",
            "Provide a summary of the information.",
            # Add more questions here based on your context
        ]
    elif isinstance(questions, str):
        # A bare string would otherwise be iterated character by character.
        questions = [questions]

    with open(input_file, "r", encoding="utf-8") as f:
        text = f.read()

    # The text is written unmodified, so any blank lines inside it end up
    # inside the record as well.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            f"Question: {question}\nAnswer: {text}\n\n" for question in questions
        )

# Generate the training file, then report where it was written.
prepare_dataset(input_file=input_path, output_file=output_path)
print(f"Training data saved to {output_path}")


Training data saved to /content/training_data.txt


In [None]:
# Pinned to the version this notebook was last run with; %pip installs into
# the environment of the running kernel.
%pip install -q datasets==3.2.0


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# from datasets import load_dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the training data.
# The file holds records of the form "Question: ...\nAnswer: ...\n\n". With the
# text loader's default line-by-line sampling each record is split into
# separate examples (question line, answer line, blank line), so a question is
# never seen together with its answer. sample_by="paragraph" splits on blank
# lines instead and keeps each record as a single example.
# NOTE: if the source text itself contains blank lines, the answer will still
# be split at those points.
dataset = load_dataset(
    "text",
    data_files={"train": "/content/training_data.txt"},
    sample_by="paragraph",
)["train"]

def tokenize_function(examples):
    """Tokenize text examples and attach causal-LM labels.

    Args:
        examples: Mapping with a "text" entry. This is a list of strings when
            applied through a batched map, or a single string otherwise.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which every padded position (attention_mask == 0) is
        replaced by -100, the label value the loss ignores.
    """
    ignore_index = -100

    def mask_padding(token_ids, attention_mask):
        # Keep real tokens as targets; blank out padding.
        return [
            token_id if attended else ignore_index
            for token_id, attended in zip(token_ids, attention_mask)
        ]

    inputs = tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=1024
    )

    # Every sequence is padded to 1024 tokens. Copying input_ids unchanged
    # would make the model train on predicting the padding token, which for
    # short examples is most of the sequence.
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    if input_ids and isinstance(input_ids[0], int):
        # Single (un-batched) example: one flat list of token ids.
        inputs["labels"] = mask_padding(input_ids, attention_mask)
    else:
        inputs["labels"] = [
            mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    return inputs

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Define the model and resize embeddings to account for the new token
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Size the step-based settings from the actual length of the run. With a
# handful of examples the whole run is only a few optimizer steps, so a fixed
# warmup of 500 steps would never finish (the learning rate would stay at a
# small fraction of its target) and logging every 100 steps would log nothing.
# The original values are kept as upper bounds for larger datasets.
# Assumes a single device and no gradient accumulation.
num_train_epochs = 5
train_batch_size = 8
steps_per_epoch = -(-len(tokenized_dataset) // train_batch_size)  # ceiling division
total_train_steps = steps_per_epoch * num_train_epochs

# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/fine_tuned_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    # per_device_eval_batch_size=32,
    save_steps=1000,
    save_total_limit=2,
    logging_dir="/content/logs",
    logging_steps=min(100, max(1, total_train_steps // 10)),
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=min(500, total_train_steps // 10),
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
# Write the model weights and the tokenizer files into the same directory.
save_dir = "/content/fine_tuned_gpt2"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('/content/fine_tuned_gpt2/tokenizer_config.json',
 '/content/fine_tuned_gpt2/special_tokens_map.json',
 '/content/fine_tuned_gpt2/vocab.json',
 '/content/fine_tuned_gpt2/merges.txt',
 '/content/fine_tuned_gpt2/added_tokens.json',
 '/content/fine_tuned_gpt2/tokenizer.json')

In [None]:
from transformers import pipeline

# Load the fine-tuned model
qa_pipeline = pipeline("text-generation", model="/content/fine_tuned_gpt2", tokenizer="/content/fine_tuned_gpt2")

# Ask a question
question = "According to Bentham “every law may be considered in eight different aspects”. Discuss."
# The prompt ends right after "Answer:" so the model's continuation is the answer.
input_text = f"Question: {question}\nAnswer:"
# max_length is forwarded to the tokenizer as well; asking for truncation
# explicitly selects the same strategy the tokenizer would otherwise fall back
# to, without the "Truncation was not explicitly activated" warning.
output = qa_pipeline(input_text, max_length=500, num_return_sequences=1, truncation=True)
print(output[0]['generated_text'])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Question: According to Bentham “every law may be considered in eight different aspects”. Discuss.
Answer: I think that law cannot be regarded as a language. The knowledge cannot be regarded as an object of knowledge either for any one particular legislator or for a group of legislators. I know that many legislators are members of political parties but I have not been able to discover any instances of them working for any one party. After having read these teachings, I find myself inclined to believe in principle that the teachings are true “with the exception of Article IX, Section 4, Clause 1, Clause 3. After a while I discover that various views are expressed in various passages which I believe may not be true. I have heard of the case of the Minister of Justice and of the Secretary General of the European Communities who denied that legislation for the purpose of establishing an electoral college existed. This Court in the present case has held that the Constitution cannot be regard