In [None]:
!pip install transformers datasets ray[default] ray[tune] ray[serve] boto3

In [None]:
%%writefile train_gpt2_script.py
import os
import logging
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer
import ray
from ray.train.huggingface.transformers import RayTrainReportCallback, prepare_trainer
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig, RunConfig
from datasets import load_dataset
import evaluate
import numpy as np

# Root logger: timestamped records at INFO level and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# S3 destination that is later handed to RunConfig as the storage path
s3_bucket_name = "datasets-checkpoints20240409144007926200000004"
storage_path = f"s3://{s3_bucket_name}/checkpoints/"

# CUDA allocator option exported through the process environment
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# Shared tokenizer; padding reuses the end-of-sequence token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token

def preprocess(examples):
    """Tokenize a batch of rows into fixed-length model inputs.

    Every entry of ``examples["text"]`` is truncated or padded to exactly
    512 tokens with the module-level ``tokenizer``.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` (both cast to long)
        and ``labels``, an unmodified copy of the input ids (padding
        positions are not masked out here).
    """
    encoded = tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    token_ids = encoded.input_ids
    return dict(
        input_ids=token_ids.long(),
        attention_mask=encoded.attention_mask.long(),
        labels=token_ids.clone(),
    )

# Define the full training function
def train_func():
    """Training loop that ``TorchTrainer`` runs on each worker.

    Loads ``gpt2-medium``, builds a Hugging Face ``Trainer`` over the
    "train" and "validation" dataset shards assigned to this worker,
    attaches ``RayTrainReportCallback``, passes the trainer through
    ``prepare_trainer`` and runs ``trainer.train()``.
    """
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

    # Evaluation Metrics
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        """Return next-token accuracy for one evaluation pass.

        Args:
            eval_pred: ``(logits, labels)`` pair produced by the Trainer.

        Returns:
            The dict produced by the ``accuracy`` metric.
        """
        logits, labels = eval_pred
        if isinstance(logits, tuple):
            # Some models return extra outputs next to the logits
            logits = logits[0]
        labels = np.asarray(labels)

        # A causal LM's logits at position i score the token at position
        # i + 1, so drop the last prediction and the first label to align
        # each prediction with the token it is actually predicting.
        predictions = np.argmax(logits[..., :-1, :], axis=-1)
        targets = labels[..., 1:]

        # -100 is the ignore index of the LM loss; such positions carry no
        # target and must not count as misses.
        scored = targets != -100
        predictions = predictions[scored].astype(np.int32)
        targets = targets[scored].astype(np.int32)
        return metric.compute(predictions=predictions, references=targets)

    training_args = TrainingArguments(
        output_dir="test_trainer_gpt2",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        max_steps=100,
        report_to="none",
        gradient_checkpointing=True,
        fp16=True,
        bf16=False,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16
    )

    # Each worker iterates over its own shard, one example per batch
    train_dataset = ray.train.get_dataset_shard("train").iter_torch_batches(batch_size=1)
    eval_dataset = ray.train.get_dataset_shard("validation").iter_torch_batches(batch_size=1)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator
    )

    trainer.add_callback(RayTrainReportCallback())
    trainer = prepare_trainer(trainer)
    trainer.train()

# Main function
if __name__ == "__main__":
    # Drop any existing Ray connection, then initialise with driver logging on
    ray.shutdown()
    ray.init(ignore_reinit_error=True, logging_level=logging.DEBUG, log_to_driver=True)

    # Load only the first 1% of each WikiText split
    split_spec = {'train': "train[:1%]", 'validation': "validation[:1%]"}
    hf_datasets = load_dataset("wikitext", "wikitext-2-raw-v1", split=split_spec)

    # Tokenize in batches of 1000 rows and drop the raw text column
    processed_ds = hf_datasets.map(preprocess, batched=True, batch_size=1000).remove_columns("text")

    # Wrap each split as a Ray dataset, keyed by the names train_func asks for
    ray_datasets = {
        "train": ray.data.from_huggingface(processed_ds["train"]),
        "validation": ray.data.from_huggingface(processed_ds["validation"]),
    }

    logging.info("Configuring Ray Trainer...")
    scaling = ScalingConfig(num_workers=10, use_gpu=True)
    run_settings = RunConfig(storage_path=storage_path, name="gpt2_experiment")
    ray_trainer = TorchTrainer(
        train_func,
        scaling_config=scaling,
        datasets=ray_datasets,
        run_config=run_settings,
    )

    logging.info("Starting the Ray training process...")
    result = ray_trainer.fit()
    logging.info("Ray training process completed.")


In [58]:
import boto3

# Copy the generated training script into the bucket under scripts/
s3_name_checkpoints = "datasets-checkpoints20240409144007926200000004"
s3_client = boto3.client("s3")
s3_client.upload_file(
    Filename="./train_gpt2_script.py",
    Bucket=s3_name_checkpoints,
    Key="scripts/train_gpt2_script.py",
)

In [4]:
import ray
from ray.job_submission import JobSubmissionClient

# Submit the training script to the Ray cluster as a job
ray_train_address = "ray-cluster-train-kuberay-head-svc.ray-cluster-train.svc.cluster.local"
ray_client = JobSubmissionClient(f"http://{ray_train_address}:8265")
s3_name_checkpoints = "datasets-checkpoints20240409144007926200000004"

# Packages installed into the job's runtime environment via pip
train_dependencies = [
    "transformers",
    "datasets",
    "boto3",
    "numpy",
    "evaluate"
]

submission_id = ray_client.submit_job(
    # Entrypoint shell command to execute. Every step is chained with && so a
    # failed download stops the job with the real error instead of being
    # masked and surfacing later as a missing-file failure.
    entrypoint=(
        "rm -f train_gpt2_script.py && "
        f"aws s3 cp s3://{s3_name_checkpoints}/scripts/train_gpt2_script.py train_gpt2_script.py && "
        "python train_gpt2_script.py"
    ),
    runtime_env={
        "pip": train_dependencies
    }
)

# Show the id so the job can be looked up afterwards
submission_id

In [None]:
# Print the Ray and Python versions visible to this notebook's shell
! ray --version && python --version

# Fine-tuned Model Inference

In [None]:
import boto3
import os

def download_latest_checkpoint(bucket_name, base_folder, local_directory):
    """Download the files of the newest checkpoint folder under an S3 prefix.

    Folder keys (keys ending in ``/``) below ``base_folder`` whose path
    relative to ``base_folder`` contains ``checkpoint`` are collected; the
    one that sorts last is treated as the latest. Every file under it is
    written to ``local_directory`` with its path relative to that folder.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        base_folder: Key prefix to search for checkpoint folders.
        local_directory: Local directory that receives the files.

    Returns:
        None. Prints a message and returns early if no checkpoint is found.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    checkpoints = []

    # Listing all objects within the base folder
    for page in paginator.paginate(Bucket=bucket_name, Prefix=base_folder):
        for obj in page.get('Contents', []):
            key = obj['Key']
            # Match only on the part below base_folder: the prefix itself may
            # contain "checkpoint" (e.g. "checkpoints/..."), which would make
            # every folder under it look like a checkpoint.
            relative_key = key[len(base_folder):]
            if key.endswith('/') and 'checkpoint' in relative_key:
                checkpoints.append(key)

    if not checkpoints:
        print("No checkpoints found.")
        return

    # Lexicographically greatest key is taken as the latest
    latest_checkpoint = max(checkpoints)
    print("Latest checkpoint:", latest_checkpoint)

    # Download files from the latest checkpoint
    for page in paginator.paginate(Bucket=bucket_name, Prefix=latest_checkpoint):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('/'):  # Skip directories
                continue
            local_file_path = os.path.join(local_directory, key[len(latest_checkpoint):])
            parent_directory = os.path.dirname(local_file_path)
            if parent_directory:  # os.makedirs('') raises, so skip it for bare filenames
                os.makedirs(parent_directory, exist_ok=True)
            s3.download_file(bucket_name, key, local_file_path)
            print(f'Downloaded: {key} to {local_file_path}')
    print("All files from the latest checkpoint are downloaded.")

# Where the checkpoint lives in S3 and where it is written locally
bucket_name = "datasets-checkpoints20240409144007926200000004"
# Can see what folder in Ray train Logs
base_folder = "checkpoints/gpt2_experiment/TorchTrainer_fbeef_00000_0_2024-05-17_01-18-04/"
local_directory = "./latest_model_checkpoint_gpt2"

download_latest_checkpoint(
    bucket_name=bucket_name,
    base_folder=base_folder,
    local_directory=local_directory,
)


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import transformers

# Restrict the transformers library's own logging to errors only, so model
# loading below does not flood the cell output
transformers.logging.set_verbosity_error()

def generate_text(prompt, tokenizer, model, max_length=100):
    """Generate a continuation of ``prompt`` with beam-sampled decoding.

    Args:
        prompt: Text to continue.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Model exposing ``device`` and ``generate``.
        max_length: Maximum total length passed to ``model.generate``.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    # Encode the prompt (ids and attention mask) on the model's device
    encoded = tokenizer(prompt, return_tensors='pt').to(model.device)

    outputs = model.generate(
        encoded["input_ids"],
        # Pass the mask and pad id explicitly instead of letting generate()
        # guess them, which is unreliable when the pad and EOS tokens coincide
        attention_mask=encoded["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
        num_beams=5,
        early_stopping=False,
        no_repeat_ngram_size=2,
        temperature=0.8,
        top_k=50,
        do_sample=True
    )
    # Decode and return the generated text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Stock gpt2-medium tokenizer; model weights come from the local checkpoint directory
local_directory = "./latest_model_checkpoint_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained(local_directory)

# Prefer the GPU when one is visible, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
model.eval()

# Try the model on a single prompt and show the result
sample_prompt = "What are the major achievements of Marie Curie?"
generated_text = generate_text(sample_prompt, tokenizer, model)
print("Generated text:", generated_text)


# Serving Script

The serving script is written to disk, packaged as a ZIP archive, and uploaded to S3. A pre-signed URL for that archive is then generated for use in the RayService CRD.

In [None]:
%%writefile gpt2_serve_script.py
import os
import boto3
import ray
from ray import serve
from starlette.requests import Request
import torch
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import logging

def download_latest_checkpoint(bucket_name, base_folder, local_directory):
    """Download the files of the newest checkpoint folder under an S3 prefix.

    Folder keys (keys ending in ``/``) below ``base_folder`` whose path
    relative to ``base_folder`` contains ``checkpoint`` are collected; the
    one that sorts last is treated as the latest. Every file under it is
    written to ``local_directory`` with its path relative to that folder.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        base_folder: Key prefix to search for checkpoint folders.
        local_directory: Local directory that receives the files.

    Returns:
        None. Prints a message and returns early if no checkpoint is found.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    checkpoints = []

    for page in paginator.paginate(Bucket=bucket_name, Prefix=base_folder):
        for obj in page.get('Contents', []):
            key = obj['Key']
            # Match only on the part below base_folder: the prefix itself may
            # contain "checkpoint" (e.g. "checkpoints/..."), which would make
            # every folder under it look like a checkpoint.
            relative_key = key[len(base_folder):]
            if key.endswith('/') and 'checkpoint' in relative_key:
                checkpoints.append(key)

    if not checkpoints:
        print("No checkpoints found.")
        return

    # Lexicographically greatest key is taken as the latest
    latest_checkpoint = max(checkpoints)
    print("Latest checkpoint:", latest_checkpoint)

    for page in paginator.paginate(Bucket=bucket_name, Prefix=latest_checkpoint):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('/'):  # folder marker, nothing to download
                continue
            local_file_path = os.path.join(local_directory, key[len(latest_checkpoint):])
            parent_directory = os.path.dirname(local_file_path)
            if parent_directory:  # os.makedirs('') raises, so skip it for bare filenames
                os.makedirs(parent_directory, exist_ok=True)
            s3.download_file(bucket_name, key, local_file_path)
            print(f'Downloaded: {key} to {local_file_path}')
    print("All files from the latest checkpoint are downloaded.")


@serve.deployment(ray_actor_options={"num_gpus": 1})
class TextGenerationDeployment:
    """Ray Serve deployment that answers prompts with a GPT-2 checkpoint.

    On construction the latest checkpoint is downloaded from S3 into
    ``./latest_model_checkpoint`` and loaded; each HTTP request must carry
    a JSON body with a ``prompt`` field and receives the generated text.
    """

    def __init__(self, bucket_name, base_folder):
        """Download the checkpoint and load tokenizer and model.

        Args:
            bucket_name: S3 bucket that holds the checkpoints.
            base_folder: Key prefix searched for the latest checkpoint.
        """
        logger = logging.getLogger("ray.serve")
        logger.info("Inside TextGenerationDeployment init")
        local_directory = "./latest_model_checkpoint"
        download_latest_checkpoint(bucket_name, base_folder, local_directory)

        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
        self.model = GPT2LMHeadModel.from_pretrained(local_directory)
        # The actor reserves a GPU (num_gpus=1); place the model on it so the
        # reservation is actually used, falling back to CPU if none is visible.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(device)
        self.model.eval()
        self.logger = logger
        self.logger.info("Model Loading complete")

    def generate_text(self,prompt):
        """Return the model's beam-sampled continuation of ``prompt``."""
        # Inputs must live on the same device as the model
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').to(self.model.device)
        outputs = self.model.generate(
            inputs,
            max_length=100,
            num_beams=5,
            early_stopping=False,
            no_repeat_ngram_size=2,
            temperature=0.8,
            top_k=50,
            do_sample=True
        )
        # Decode and return the generated text
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # logging needs a %s placeholder for each extra argument; without it
        # the record fails to format and the value is never logged
        self.logger.info("Prompt: %s", prompt)
        self.logger.info("Response: %s", generated_text)
        return generated_text

    async def __call__(self, request: Request):
        """Handle one HTTP request: read ``prompt`` from the JSON body and generate."""
        body = await request.json()
        prompt = body['prompt']
        generated_text = self.generate_text(prompt)
        return generated_text
        
# Constructor arguments for the deployment: where the checkpoints are stored
bucket_name = "datasets-checkpoints20240409144007926200000004"
base_folder = "checkpoints/gpt2_experiment/TorchTrainer_fbeef_00000_0_2024-05-17_01-18-04/"

# Bound application object exposed at module level
text_generate_deployment = TextGenerationDeployment.bind(
    base_folder=base_folder,
    bucket_name=bucket_name,
)


In [108]:
import boto3

# Copy the generated serving script into the bucket under scripts/
bucket_name = "datasets-checkpoints20240409144007926200000004"
s3_client = boto3.client("s3")
s3_client.upload_file(
    Filename="./gpt2_serve_script.py",
    Bucket=bucket_name,
    Key="scripts/gpt2_serve_script.py",
)

In [None]:
import boto3
from zipfile import ZipFile

s3_client = boto3.client("s3")
bucket = "datasets-checkpoints20240409144007926200000004"

# Package the serving script into a ZIP archive
with ZipFile('./gpt2_finetuned.zip', 'w') as zip_object:
    zip_object.write('./gpt2_serve_script.py')

# Upload the archive to the root of the bucket
s3_client.upload_file(
    Filename="./gpt2_finetuned.zip",
    Bucket=bucket,
    Key="gpt2_finetuned.zip",
)

# Create a download link for the archive that stays valid for 7200 seconds
download_params = {'Bucket': bucket, 'Key': "gpt2_finetuned.zip"}
presigned_url = s3_client.generate_presigned_url(
    ClientMethod='get_object',
    Params=download_params,
    ExpiresIn=7200,
)

print("Pre-signed URL:", presigned_url)


## Sending Requests to the Fine-tuned GPT-2

In [110]:
import requests
import json

In [111]:
# HTTP endpoint that generate_text() POSTs prompts to
url = "http://ray-svc-gpt2-head-svc.ray-svc-gpt2.svc.cluster.local:8000/gpt2_generate"

In [112]:
def generate_text(prompt, timeout=300):
    """POST ``prompt`` to the serving endpoint and return the reply as text.

    Args:
        prompt: Text sent as the ``prompt`` field of the JSON body.
        timeout: Seconds to wait for the server before ``requests`` raises.

    Returns:
        The response body on HTTP 200, otherwise a single string of the form
        ``"Error: <status> <body>"`` (always a ``str``, never a tuple).
    """
    # json= serialises the payload and sets the JSON content type
    response = requests.post(url, json={"prompt": prompt}, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return f"Error: {response.status_code} {response.text}"


In [None]:
# Questions sent to the deployed model, one request each
prompts = [
    "What are the major achievements of Marie Curie?",
    "When did the second world war happen?",
    "What is the capital of United Kingdom?",
]

for prompt in prompts:
    result = generate_text(prompt)
    # A blank line after each answer keeps the results visually separated
    print(result, end="\n\n")

