In [None]:
!pip install transformers datasets ray[default] ray[tune] ray[serve] boto3

In [262]:
%%writefile train_gpt2_script.py
import os
import logging
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer
import ray
from ray.train.huggingface.transformers import RayTrainReportCallback, prepare_trainer
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig, RunConfig
from datasets import load_dataset
import evaluate
import numpy as np

# Setup basic configuration for logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Variables
s3_bucket_name = "datasets-checkpoints20240423160619841200000004"
storage_path = f"s3://{s3_bucket_name}/checkpoints/"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token

def preprocess(examples):
    output = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    return {
        "input_ids": output.input_ids.long(),
        "attention_mask": output.attention_mask.long(),
        "labels": output.input_ids.clone()
    }

# Define the full training function
def train_func():
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

    # Evaluation Metrics
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        # Ensure that predictions and labels are numpy arrays of type int32
        predictions = predictions.flatten().astype(np.int32)  # Flatten and convert to int32
        labels = labels.flatten().astype(np.int32)  # Flatten and convert to int32
        return metric.compute(predictions=predictions, references=labels)


    training_args = TrainingArguments(
        output_dir="test_trainer_gpt2",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        max_steps=100,
        report_to="none",
        gradient_checkpointing=True,
        fp16=True,
        bf16=False,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16
    )

    train_dataset = ray.train.get_dataset_shard("train").iter_torch_batches(batch_size=1)
    eval_dataset = ray.train.get_dataset_shard("validation").iter_torch_batches(batch_size=1)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator
    )

    trainer.add_callback(RayTrainReportCallback())
    trainer = prepare_trainer(trainer)
    trainer.train()

# Main function
if __name__ == "__main__":
    ray.shutdown()
    ray.init(log_to_driver=True, logging_level=logging.DEBUG, ignore_reinit_error=True)
    train_split = "train[:1%]"
    validation_split = "validation[:1%]"
    hf_datasets = load_dataset("wikitext", "wikitext-2-raw-v1", split={'train': train_split, 'validation': validation_split})
    processed_ds = hf_datasets.map(preprocess, batched=True, batch_size=1000)
    processed_ds = processed_ds.remove_columns("text")

    ray_train_ds = ray.data.from_huggingface(processed_ds["train"])
    ray_eval_ds = ray.data.from_huggingface(processed_ds["validation"])

    logging.info("Configuring Ray Trainer...")
    ray_trainer = TorchTrainer(
        train_func,
        scaling_config=ScalingConfig(num_workers=10, use_gpu=True),
        datasets={"train": ray_train_ds, "validation": ray_eval_ds},
        run_config=RunConfig(storage_path=storage_path, name="gpt2_experiment")
    )
    logging.info("Starting the Ray training process...")
    result = ray_trainer.fit()
    logging.info("Ray training process completed.")


Overwriting train_gpt2_script.py


In [263]:
import boto3

# S3 bucket definition and upload of the training script
s3_name_checkpoints = "datasets-checkpoints20240423160619841200000004"
s3_client = boto3.client("s3")
s3_client.upload_file("./train_gpt2_script.py", s3_name_checkpoints, "scripts/train_gpt2_script.py")

In [264]:
import ray
from ray.job_submission import JobSubmissionClient

# Submitting Training script to Ray
ray_train_address = "ray-cluster-train-kuberay-head-svc.ray-cluster-train.svc.cluster.local"
ray_client = JobSubmissionClient(f"http://{ray_train_address}:8265")
s3_name_checkpoints = "datasets-checkpoints20240423160619841200000004"
train_dependencies = [
    "transformers",
    "datasets",
    "boto3",
    "numpy",
    "evaluate"
]

submission_id = ray_client.submit_job(
    # Entrypoint shell command to execute
    entrypoint=(
        f"rm -rf train_gpt2_script.py && aws s3 cp s3://{s3_name_checkpoints}/scripts/train_gpt2_script.py train_gpt2_script.py || true;"
        "chmod +x train_gpt2_script.py && python train_gpt2_script.py"
    ),
    runtime_env={
        "pip": train_dependencies
    }
)

2024-04-25 22:36:21,741	DEBUG utils.py:655 -- Using API server address http://ray-cluster-train-kuberay-head-svc.ray-cluster-train.svc.cluster.local:8265.
2024-04-25 22:36:21,754	DEBUG validation.py:197 -- Rewrote runtime_env `pip` field from ['transformers', 'datasets', 'boto3', 'numpy', 'evaluate'] to {'packages': ['transformers', 'datasets', 'boto3', 'numpy', 'evaluate'], 'pip_check': False}.
2024-04-25 22:36:21,755	DEBUG validation.py:197 -- Rewrote runtime_env `pip` field from {'packages': ['transformers', 'datasets', 'boto3', 'numpy', 'evaluate'], 'pip_check': False} to {'packages': ['transformers', 'datasets', 'boto3', 'numpy', 'evaluate'], 'pip_check': False}.


In [3]:
! ray --version && python --version

2024-04-25 15:23:42,950 - INFO - Note: NumExpr detected 48 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-04-25 15:23:42,950 - INFO - NumExpr defaulting to 8 threads.
ray, version 2.11.0
[0mPython 3.11.8


# Finetuned model Inference

In [266]:
import boto3
import os

def download_latest_checkpoint(bucket_name, base_folder, local_directory):
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    checkpoints = []

    # Listing all objects within the base folder
    for page in paginator.paginate(Bucket=bucket_name, Prefix=base_folder):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('/') and 'checkpoint' in key:
                checkpoints.append(key)

    if not checkpoints:
        print("No checkpoints found.")
        return

    # Sorting to find the latest
    latest_checkpoint = sorted(checkpoints)[-1]
    print("Latest checkpoint:", latest_checkpoint)

    # Download files from the latest checkpoint
    for page in paginator.paginate(Bucket=bucket_name, Prefix=latest_checkpoint):
        for obj in page.get('Contents', []):
            key = obj['Key']
            local_file_path = os.path.join(local_directory, key[len(latest_checkpoint):])
            if not key.endswith('/'):  # Skip directories
                os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
                s3.download_file(bucket_name, key, local_file_path)
                print(f'Downloaded: {key} to {local_file_path}')
    print("All files from the latest checkpoint are downloaded.")

bucket_name = "datasets-checkpoints20240423160619841200000004"
base_folder = "checkpoints/gpt2_experiment/TorchTrainer_4790d_00000_0_2024-04-25_15-36-38/" # Can see what folder in Ray train Logs
local_directory = "./latest_model_checkpoint_gpt2"

download_latest_checkpoint(bucket_name, base_folder, local_directory)


Latest checkpoint: checkpoints/gpt2_experiment/TorchTrainer_4790d_00000_0_2024-04-25_15-36-38/checkpoint_000044/checkpoint/
Downloaded: checkpoints/gpt2_experiment/TorchTrainer_4790d_00000_0_2024-04-25_15-36-38/checkpoint_000044/checkpoint/config.json to ./latest_model_checkpoint_gpt2/config.json
Downloaded: checkpoints/gpt2_experiment/TorchTrainer_4790d_00000_0_2024-04-25_15-36-38/checkpoint_000044/checkpoint/generation_config.json to ./latest_model_checkpoint_gpt2/generation_config.json
Downloaded: checkpoints/gpt2_experiment/TorchTrainer_4790d_00000_0_2024-04-25_15-36-38/checkpoint_000044/checkpoint/model.safetensors to ./latest_model_checkpoint_gpt2/model.safetensors
Downloaded: checkpoints/gpt2_experiment/TorchTrainer_4790d_00000_0_2024-04-25_15-36-38/checkpoint_000044/checkpoint/optimizer.pt to ./latest_model_checkpoint_gpt2/optimizer.pt
Downloaded: checkpoints/gpt2_experiment/TorchTrainer_4790d_00000_0_2024-04-25_15-36-38/checkpoint_000044/checkpoint/rng_state_0.pth to ./latest_

In [268]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def generate_text(prompt, tokenizer, model, max_length=512):
    # Encode the input prompt
    inputs = tokenizer.encode(prompt, return_tensors='pt')

    # Generate text using the model
    outputs = model.generate(inputs, max_length=max_length, num_beams=5, early_stopping=True)

    # Decode and return the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Load tokenizer and model
local_directory = "./latest_model_checkpoint_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained(local_directory)

model.eval()
print("GPT-2 Model loaded successfully!")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


GPT-2 Model loaded successfully!
Generated text: The food at this restaurant was absolutely wonderful, from preparation to presentation, very pleasing. The service was excellent and the atmosphere was warm and inviting. I would highly recommend this place to anyone who is looking for a quick lunch or dinner fix.



In [None]:
# Sample prompt for generation
sample_prompt = "What were the main causes of the American Civil War?"

# Generate text
generated_text = generate_text(sample_prompt, tokenizer, model)
print("Generated text:", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
