In [2]:
# Mount Google Drive at /content/drive so files stored there can be read
# and written from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import tensorflow as tf

# Probe for a TPU and report what the resolver exposes. A ValueError raised
# anywhere in the probe is reported as "no TPU".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # Detect TPU
    spec = tpu.cluster_spec()
    if spec is None:
        print('No TPU cluster spec found.')
    else:
        cluster_spec_dict = spec.as_dict()
        # The worker addresses are only printed when the spec actually lists them
        if 'worker' not in cluster_spec_dict:
            print('TPU detected but "worker" key not found in cluster spec.')
        else:
            print('Running on TPU:', cluster_spec_dict['worker'])
except ValueError:
    print('No TPU found. Please check your runtime settings.')

No TPU found. Please check your runtime settings.


In [4]:
import tensorflow as tf
from transformers import GPT2Tokenizer
from tqdm import tqdm

# Load the pretrained 'gpt2' tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def load_tf_dataset(file_path, tokenizer, block_size=128, chunk_size=1000000):
    """Stream a UTF-8 text file as fixed-length token blocks.

    The file is read ``chunk_size`` characters at a time, each chunk is
    tokenized, and the resulting ids are emitted in consecutive blocks of
    exactly ``block_size`` ids. Ids left over at the end of a chunk are
    carried into the next chunk instead of being discarded, so only the
    final partial block of the whole file is dropped.

    Args:
        file_path: Path to the text file to read.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        block_size: Number of token ids per emitted example.
        chunk_size: Number of characters read from the file per step.

    Returns:
        A ``tf.data.Dataset`` built from a generator. Each element is a dict
        with int32 ``input_ids`` and ``attention_mask`` tensors of shape
        ``(block_size,)``; the attention mask is all ones.
    """
    def chunk_generator():
        # Read incrementally so the whole file never has to be held in memory.
        with open(file_path, 'r', encoding='utf-8') as f:
            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break
                yield chunk

    def gen():
        carry = []  # ids from the previous chunk that did not fill a block
        for chunk in tqdm(chunk_generator(), desc="Processing chunks"):
            tokens = tokenizer.tokenize(chunk)
            ids = carry + tokenizer.convert_tokens_to_ids(tokens)
            # Largest prefix length that is a whole number of blocks
            usable = len(ids) - len(ids) % block_size
            for start in range(0, usable, block_size):
                yield {
                    "input_ids": ids[start:start + block_size],
                    "attention_mask": [1] * block_size
                }
            carry = ids[usable:]
        # Whatever remains in `carry` is shorter than block_size and cannot
        # satisfy the fixed-shape output signature, so it is dropped.

    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                "input_ids": tf.TensorSpec(shape=(block_size,), dtype=tf.int32),
                "attention_mask": tf.TensorSpec(shape=(block_size,), dtype=tf.int32),
            }
        )
    )

# Load the dataset
# NOTE(review): `train_dataset` is rebound further down by the
# `tokenize_dataset(...)` call, so this streaming tf.data dataset is not the
# object that ends up being passed to the Trainer.
train_dataset = load_tf_dataset('/content/drive/MyDrive/midi_text_data.txt', tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [5]:
import os

# Training corpus stored on the mounted Drive
file_path = '/content/drive/MyDrive/midi_text_data.txt'

# On-disk size of the corpus, in bytes
file_size = os.path.getsize(file_path)

# Same size in megabytes (1 MB taken as 1024 * 1024 bytes)
file_size_mb = file_size / 1024 ** 2

print(f"File size: {file_size_mb:.2f} MB")


File size: 2751.81 MB


In [6]:
# Load the dataset
# NOTE(review): this repeats the identical `load_tf_dataset` call made in an
# earlier cell, and `train_dataset` is rebound again later by
# `tokenize_dataset(...)`.
train_dataset = load_tf_dataset('/content/drive/MyDrive/midi_text_data.txt', tokenizer)

In [7]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, TextDataset, Trainer, TrainingArguments


In [8]:
# Load the pretrained 'gpt2' tokenizer and language-model weights.
# NOTE: this rebinds `tokenizer` to a newly created object, so the pad token
# assigned to the earlier tokenizer instance does not carry over;
# `tokenize_dataset` assigns a pad token itself before tokenizing.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [11]:
from datasets import load_dataset as hf_load_dataset

def tokenize_dataset(file_path, tokenizer, block_size=128, encoding='latin-1'):
    """Load a text file as a dataset and tokenize it to fixed-length examples.

    Args:
        file_path: The path to the text file.
        tokenizer: The tokenizer to use. If it has no padding token, its
            end-of-sequence token is assigned as the padding token; this
            modifies the tokenizer object that was passed in.
        block_size: Length every example is padded or truncated to.
        encoding: Text encoding used to decode the file. The default
            'latin-1' keeps the previous behaviour; pass 'utf-8' to decode
            the file the same way `load_tf_dataset` reads it.

    Returns:
        The tokenized dataset, with the original columns removed.
    """
    dataset = hf_load_dataset(
        'text',
        data_files={'train': file_path},
        split='train',
        encoding=encoding,
    )

    # padding="max_length" below needs a padding token; only fill it in when
    # the caller has not configured one already.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def preprocess_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=block_size)

    # Tokenize in batches and drop the raw columns so only tokenizer outputs remain
    tokenized_datasets = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset.column_names,
    )
    return tokenized_datasets

In [12]:
# Tokenize the corpus at `file_path`; this rebinds `train_dataset`, replacing
# the dataset created earlier with `load_tf_dataset`.
train_dataset = tokenize_dataset(file_path, tokenizer)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/41806395 [00:00<?, ? examples/s]

In [13]:
# Batch collator for language modeling, with masked-language-modeling
# switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [14]:
def get_dataset_length(dataset):
    """Return the number of elements in ``dataset``.

    Sized datasets are answered with ``len(dataset)``, which avoids a full
    pass over the data. If ``len()`` raises ``TypeError`` (no ``__len__``, or
    a length that cannot be determined), the elements are counted by
    iterating once.

    Args:
        dataset: Any iterable of examples.

    Returns:
        The element count as an int.
    """
    try:
        return len(dataset)
    except TypeError:
        # Unsized iterable: fall back to counting element by element
        return sum(1 for _ in dataset)

# Count the examples in the tokenized training dataset.
train_dataset_length = get_dataset_length(train_dataset)

In [15]:
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    # Training length is controlled by max_steps alone: when max_steps is
    # given it overrides num_train_epochs, so no epoch count is passed.
    max_steps=1000,
    per_device_train_batch_size=2,
    # NOTE(review): save_steps is larger than max_steps, so this interval is
    # never reached during the run - confirm that is intended.
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    dataloader_drop_last=True,  # drop the last incomplete batch
)

In [16]:
# Keep the first `train_dataset_length` examples. That count was computed from
# this same dataset, so the selection covers every example.
train_dataset = train_dataset.take(train_dataset_length)

In [17]:
# Assemble the Trainer from the model, the training arguments, the
# language-modeling collator and the tokenized training dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

max_steps is given, it will override any value given in num_train_epochs


In [18]:
# Train the model. As the last expression of the cell, the returned
# TrainOutput (global step, training loss, metrics) is displayed.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,3.167
1000,2.8342


TrainOutput(global_step=1000, training_loss=3.000562744140625, metrics={'train_runtime': 6014.7151, 'train_samples_per_second': 0.333, 'train_steps_per_second': 0.166, 'total_flos': 130646016000000.0, 'train_loss': 3.000562744140625, 'epoch': 4.7839572099904147e-05})

In [19]:
# Write the fine-tuned model and the tokenizer files to ./fine_tuned_gpt2.
model.save_pretrained('./fine_tuned_gpt2')
tokenizer.save_pretrained('./fine_tuned_gpt2')

('./fine_tuned_gpt2/tokenizer_config.json',
 './fine_tuned_gpt2/special_tokens_map.json',
 './fine_tuned_gpt2/vocab.json',
 './fine_tuned_gpt2/merges.txt',
 './fine_tuned_gpt2/added_tokens.json')

In [20]:
import shutil

# Define the path for saving model and tokenizer
model_dir = './fine_tuned_gpt2'

# Save the model and tokenizer
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Zip the model directory; make_archive returns the path of the archive it wrote
archive_path = shutil.make_archive('fine_tuned_gpt2', 'zip', model_dir)

# Copy the zipped model file to Google Drive. shutil.copy raises if the copy
# fails, so the success message below is only printed after a real copy.
shutil.copy(archive_path, '/content/drive/MyDrive/')
print("Model saved to Google Drive as fine_tuned_gpt2.zip")


Model saved to Google Drive as fine_tuned_gpt2.zip


In [22]:
from google.colab import files

# Download the zipped file to your local system.
# The relative path refers to the archive created in the current working directory.
files.download('fine_tuned_gpt2.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
from IPython.display import FileLink

# Create a download link to the zipped model.
# NOTE(review): the absolute path assumes the archive was written under
# /content (i.e. that this is the working directory) - confirm.
FileLink('/content/fine_tuned_gpt2.zip')


In [28]:
!pip install onnx

Collecting onnx
  Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx
Successfully installed onnx-1.17.0


In [29]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Restore the fine-tuned tokenizer and model from the saved directory
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_gpt2')
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_gpt2')
model.eval()  # evaluation mode for export

# Example token ids used as the export input
dummy_input = tokenizer.encode("Sample input text", return_tensors="pt")

# Axes 0 and 1 are declared dynamic for both the input and the named output
batch_and_sequence = {0: "batch_size", 1: "sequence_length"}

# Export the model to ONNX format (opset 14)
torch.onnx.export(
    model,
    dummy_input,
    "fine_tuned_gpt2.onnx",
    input_names=["input_ids"],
    output_names=["output"],
    dynamic_axes={
        "input_ids": dict(batch_and_sequence),
        "output": dict(batch_and_sequence),
    },
    opset_version=14,
)

print("Model has been saved as fine_tuned_gpt2.onnx")

Model has been saved as fine_tuned_gpt2.onnx


In [30]:
# Mount Google Drive (a no-op message is printed if it is already mounted)
import shutil

from google.colab import drive
drive.mount('/content/drive')

# Copy the ONNX model file to Google Drive. shutil.copy raises if the copy
# fails, so the success message below is only printed after a real copy.
shutil.copy('fine_tuned_gpt2.onnx', '/content/drive/MyDrive/')

print("Model has been successfully saved to Google Drive in the MyDrive folder.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model has been successfully saved to Google Drive in the MyDrive folder.
