# Setup

In [None]:
# Install python packages
# NOTE(review): no versions are pinned and -U upgrades to the newest releases,
# so later cells run against whatever transformers/datasets/accelerate is
# current at install time — consider pinning for reproducibility.
!pip install datasets transformers[torch] accelerate -U

In [None]:
# Upload training file
# Colab-only helper: `uploaded` keeps whatever files.upload() returns.
# NOTE(review): presumably the uploaded file lands in the working directory,
# where the next cell opens it by name — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
import json
from datasets import Dataset

# Load the fine-tuning data - *** edit filename below if necessary ***
with open("filename.json", encoding="utf-8") as f:
    data = json.load(f)

# Reduce dataset size to n samples if needed:
# n = 100
# data = data[:n]

# Build the column-oriented mapping that Dataset.from_dict expects.
# (This cell previously passed `formatted_data` without ever defining it,
# which raised NameError.)
# NOTE(review): assumes the JSON is either a list of records that each carry
# "input_text" and "target_text" keys, or already a dict of columns with those
# names — the tokenization cells below require exactly these two columns.
# Confirm against the actual training file.
if isinstance(data, dict):
    # Already {column_name: [values, ...]}.
    formatted_data = data
else:
    formatted_data = {
        "input_text": [record["input_text"] for record in data],
        "target_text": [record["target_text"] for record in data],
    }

# Create a Dataset
dataset = Dataset.from_dict(formatted_data)


# TinyLlama 1.1B Chat

In [None]:
# TinyLlama 1.1B Chat
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch

# Load the tokenizer and model
# The same hub id is used for both so the tokenizer matches the checkpoint.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of prompt/target pairs for ``dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with "input_text" and "target_text" columns.

    Returns:
        The tokenizer output for the prompts (truncated/padded to 64 tokens)
        with an added "labels" entry holding the target token ids, where every
        padding position is replaced by -100 so it does not contribute to the
        loss.
    """
    inputs = tokenizer(examples["input_text"], max_length=64, truncation=True, padding="max_length")
    targets = tokenizer(examples["target_text"], max_length=64, truncation=True, padding="max_length")
    # Mask via the attention mask rather than by comparing with pad_token_id:
    # it is correct for either padding side and keeps real tokens that happen
    # to share an id with the pad token (e.g. when pad == eos).
    inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]
    # NOTE(review): prompt and target are tokenized separately, so labels[i] is
    # paired with prompt position i. Causal-LM fine-tuning usually puts
    # prompt + target in one sequence and masks the prompt part — confirm this
    # layout is intended.
    return inputs

# Tokenize in batches and drop the raw text columns so only model inputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["input_text", "target_text"])

# Set up training arguments with reduced batch size and max sequence length
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41);
    # the old spelling is deprecated, and this notebook installs the latest
    # release with `pip install -U`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Further reduced batch size
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # Accumulate gradients over 8 steps
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    save_steps=500,
)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()
# Match the other model cells: the generation KV cache is turned off while
# training with gradient checkpointing.
model.config.use_cache = False

# Check if GPU is available and move the model to GPU
if torch.cuda.is_available():
    model = model.cuda()

# Create Trainer instance
# NOTE(review): eval_dataset is the training set itself, so the per-epoch eval
# loss measures fit on seen data, not generalization — consider a held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()

# Microsoft Phi-1.5

In [None]:
# Microsoft Phi-1.5
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch

# Load the tokenizer and model
model_name = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add a padding token if not already present
# (the tokenize step below pads with padding="max_length", which needs one).
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Resize the embeddings to the new tokenizer length so the added token has a row.
    model.resize_token_embeddings(len(tokenizer))

# Ensure use_cache is set to False in the model configuration
# (gradient checkpointing is enabled further down in this cell).
model.config.use_cache = False

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of prompt/target pairs for ``dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with "input_text" and "target_text" columns.

    Returns:
        The tokenizer output for the prompts (truncated/padded to 32 tokens)
        with an added "labels" entry holding the target token ids, where every
        padding position is replaced by -100 so it does not contribute to the
        loss.
    """
    inputs = tokenizer(examples["input_text"], max_length=32, truncation=True, padding="max_length")
    targets = tokenizer(examples["target_text"], max_length=32, truncation=True, padding="max_length")
    # Mask via the attention mask rather than by comparing with pad_token_id:
    # it is correct for either padding side and keeps real tokens that happen
    # to share an id with the pad token.
    inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]
    # NOTE(review): prompt and target are tokenized separately, so labels[i] is
    # paired with prompt position i. Causal-LM fine-tuning usually puts
    # prompt + target in one sequence and masks the prompt part — confirm this
    # layout is intended.
    return inputs

# Assuming 'dataset' is already loaded as a Dataset object
# Tokenize in batches and drop the raw text columns so only model inputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["input_text", "target_text"])

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41);
    # the old spelling is deprecated, and this notebook installs the latest
    # release with `pip install -U`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Reduced batch size
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # Increased to accumulate more gradients
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    save_steps=500,
)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Check if GPU is available and move the model to GPU
if torch.cuda.is_available():
    model = model.cuda()

# Create Trainer instance
# NOTE(review): eval_dataset is the training set itself, so the per-epoch eval
# loss measures fit on seen data, not generalization — consider a held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

# Zip the model directory
import zipfile
import os

zip_path = './finetuned_model.zip'
unzip_dir = './finetuned_model'

# Add every file under the saved folder, named by its path relative to that
# folder, so the archive has no leading finetuned_model/ component.
with zipfile.ZipFile(zip_path, 'w') as archive:
    for folder, _subfolders, filenames in os.walk(unzip_dir):
        for filename in filenames:
            source_path = os.path.join(folder, filename)
            archive.write(source_path, os.path.relpath(source_path, unzip_dir))

# Optional: Download the model zip file to local machine
from google.colab import files
files.download(zip_path)


# Llama-2-7b
(Requires authenticating with Hugging Face and accepting Meta's license terms for this gated model.)

In [None]:
# Install hugging face hub
!pip install huggingface_hub

# Authenticate with Hugging Face - enter auth token in cell below
# (the token is entered interactively, so it is not stored in the notebook source).
from huggingface_hub import notebook_login

notebook_login()


In [None]:
# Clear GPU cache
import torch
torch.cuda.empty_cache()

# Load the tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

# Use the "-hf" repository: it holds the checkpoint converted to the
# Transformers format. "meta-llama/Llama-2-7b-chat" (without the suffix) hosts
# Meta's original-format weights, which from_pretrained() cannot load.
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add a padding token if not already present
# (the tokenize step below pads with padding="max_length", which needs one).
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Resize the embeddings to the new tokenizer length so the added token has a row.
    model.resize_token_embeddings(len(tokenizer))

# Ensure use_cache is set to False in the model configuration
# (gradient checkpointing is enabled further down in this cell).
model.config.use_cache = False

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of prompt/target pairs for ``dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with "input_text" and "target_text" columns.

    Returns:
        The tokenizer output for the prompts (truncated/padded to 64 tokens)
        with an added "labels" entry holding the target token ids, where every
        padding position is replaced by -100 so it does not contribute to the
        loss.
    """
    inputs = tokenizer(examples["input_text"], max_length=64, truncation=True, padding="max_length")
    targets = tokenizer(examples["target_text"], max_length=64, truncation=True, padding="max_length")
    # Mask via the attention mask rather than by comparing with pad_token_id:
    # it is correct for either padding side and keeps real tokens that happen
    # to share an id with the pad token.
    inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]
    # NOTE(review): prompt and target are tokenized separately, so labels[i] is
    # paired with prompt position i. Causal-LM fine-tuning usually puts
    # prompt + target in one sequence and masks the prompt part — confirm this
    # layout is intended.
    return inputs

# Assuming 'dataset' is already loaded as a Dataset object
# Tokenize in batches and drop the raw text columns so only model inputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["input_text", "target_text"])

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41);
    # the old spelling is deprecated, and this notebook installs the latest
    # release with `pip install -U`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Reduced batch size
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # Increased to accumulate more gradients
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    save_steps=500,
)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Check if GPU is available and move the model to GPU
if torch.cuda.is_available():
    model = model.cuda()

# Create Trainer instance
# NOTE(review): eval_dataset is the training set itself, so the per-epoch eval
# loss measures fit on seen data, not generalization — consider a held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained("./finetuned_model")
tokenizer.save_pretrained("./finetuned_model")

# Zip the model directory
import zipfile
import os

zip_path = './finetuned_model.zip'
unzip_dir = './finetuned_model'

# Add every file under the saved folder, named by its path relative to that
# folder, so the archive has no leading finetuned_model/ component.
with zipfile.ZipFile(zip_path, 'w') as archive:
    for folder, _subfolders, filenames in os.walk(unzip_dir):
        for filename in filenames:
            source_path = os.path.join(folder, filename)
            archive.write(source_path, os.path.relpath(source_path, unzip_dir))

# Optional: Download the model zip file to local machine
from google.colab import files
files.download(zip_path)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch

# Clear GPU cache
torch.cuda.empty_cache()

# Load the tokenizer and model
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add a padding token if not already present
# (the tokenize step below pads with padding="max_length", which needs one).
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Resize the embeddings to the new tokenizer length so the added token has a row.
    model.resize_token_embeddings(len(tokenizer))

# Ensure use_cache is set to False in the model configuration
# (gradient checkpointing is enabled in the following cell).
model.config.use_cache = False

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of prompt/target pairs for ``dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with "input_text" and "target_text" columns.

    Returns:
        The tokenizer output for the prompts (truncated/padded to 64 tokens)
        with an added "labels" entry holding the target token ids, where every
        padding position is replaced by -100 so it does not contribute to the
        loss.
    """
    inputs = tokenizer(examples["input_text"], max_length=64, truncation=True, padding="max_length")
    targets = tokenizer(examples["target_text"], max_length=64, truncation=True, padding="max_length")
    # Mask via the attention mask rather than by comparing with pad_token_id:
    # it is correct for either padding side and keeps real tokens that happen
    # to share an id with the pad token.
    inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]
    # NOTE(review): prompt and target are tokenized separately, so labels[i] is
    # paired with prompt position i. Causal-LM fine-tuning usually puts
    # prompt + target in one sequence and masks the prompt part — confirm this
    # layout is intended.
    return inputs

# Tokenize in batches and drop the raw text columns so only model inputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["input_text", "target_text"])


In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41);
    # the old spelling is deprecated, and this notebook installs the latest
    # release with `pip install -U`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Reduced batch size
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,  # Increased to accumulate more gradients
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    save_steps=500,
)

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Check if GPU is available and move the model to GPU
if torch.cuda.is_available():
    model = model.cuda()


In [None]:
# Create Trainer instance
# NOTE(review): eval_dataset is the same object as train_dataset, so the
# evaluation loss is computed on data the model was trained on.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()


# Deepseek Coder


In [None]:

# Load the tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
# The same hub id is used for both so the tokenizer matches the checkpoint.
model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of prompt/target pairs for ``dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with "input_text" and "target_text" columns.

    Returns:
        The tokenizer output for the prompts (truncated/padded to 64 tokens)
        with an added "labels" entry holding the target token ids, where every
        padding position is replaced by -100 so it does not contribute to the
        loss.
    """
    inputs = tokenizer(examples["input_text"], max_length=64, truncation=True, padding="max_length")
    targets = tokenizer(examples["target_text"], max_length=64, truncation=True, padding="max_length")
    # Mask via the attention mask rather than by comparing with pad_token_id:
    # it is correct for either padding side and keeps real tokens that happen
    # to share an id with the pad token.
    inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]
    # NOTE(review): prompt and target are tokenized separately, so labels[i] is
    # paired with prompt position i. Causal-LM fine-tuning usually puts
    # prompt + target in one sequence and masks the prompt part — confirm this
    # layout is intended.
    return inputs

# Assuming 'dataset' is defined somewhere in your code
# Tokenize in batches and drop the raw text columns so only model inputs remain.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["input_text", "target_text"])

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41);
    # the old spelling is deprecated, and this notebook installs the latest
    # release with `pip install -U`.
    eval_strategy="epoch",
    learning_rate=1e-5,  # Reduced learning rate
    per_device_train_batch_size=2,  # Adjust according to the model capabilities
    per_device_eval_batch_size=2,  # Adjust if necessary
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    num_train_epochs=5,  # Increase number of epochs
    weight_decay=0.01,
    fp16=False,  # Adjust if supported
    logging_dir='./logs',
    logging_steps=50,  # Log every 50 steps (less often than the 10 used for the other models)
    save_total_limit=2,
    save_steps=250,
)

# Check if GPU is available and move the model to GPU
if torch.cuda.is_available():
    model = model.cuda()

# Create Trainer instance
# NOTE(review): eval_dataset is the training set itself, so the per-epoch eval
# loss measures fit on seen data, not generalization — consider a held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()

# Save Model


In [None]:
# save model files
# Saves whichever `model` is currently bound, i.e. the one from the most
# recently executed model cell.
model.save_pretrained("./finetuned_model")

In [None]:
# save tokenizer files
# Written to the same folder as the model so both can be loaded from one path.
tokenizer.save_pretrained("./finetuned_model")

In [None]:
# download fine tuned model as zip file
!zip -r finetuned_model.zip ./finetuned_model

from google.colab import files
files.download("finetuned_model.zip")


In [None]:
# Clear GPU cache if maxed out
# Relies on `torch` having been imported by one of the model cells above.
torch.cuda.empty_cache()

In [None]:
# mount the drive for model persistence
# The mount point /content/drive is the one the copy cell below writes into.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# download to Google Drive
# Copies the archive into the mounted Drive; requires the mount cell to have
# been run and finetuned_model.zip to exist in the working directory.
!cp finetuned_model.zip /content/drive/MyDrive/

# **-------------------------------**
# Use the model from the zip file

In [None]:
# mount the drive if you haven't yet in previous lines
# Needed because the unzip cell reads the archive from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# unzip the file
import zipfile

# Change the path to where your zip file is located
zip_path = '/content/drive/MyDrive/finetuned_model.zip'
unzip_dir = '/content/finetuned_model'

# Unzip the model
# NOTE(review): entries are extracted relative to unzip_dir, so an archive
# whose entries start with a finetuned_model/ folder ends up one level deeper
# (unzip_dir/finetuned_model/...) — check the layout before loading.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_dir)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model from the saved directory - verify
# NOTE(review): this is a relative path while the unzip cell extracts to
# /content/finetuned_model; they match only when the working directory is
# /content — confirm.
model_path = "./finetuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Example input text
input_text = "CUSTOM_DB_42_QUERY Find all claims in master_claim_schema.claim_info with suspicious descriptions that include the word 'arson'."

# Tokenize the input
# return_tensors="pt" yields PyTorch tensors that are unpacked into generate() below.
inputs = tokenizer(input_text, return_tensors="pt")

# Control output -- minimal config
# NOTE(review): neither the model nor the inputs are moved to a GPU in this
# cell, so generation presumably runs on CPU — confirm that is intended.
outputs = model.generate(
    **inputs,
    max_length=128,
    repetition_penalty=1.4
)

# Control output -- more default version of config - tested well with DeepSeek Coder
# outputs = model.generate(
#     **inputs,
#     max_length=128,  # Adjust max_length to a suitable value for your SQL queries
#     repetition_penalty=1.0,  # Reset to default
#     temperature=1.0,  # Reset to default, controls the randomness of predictions
#     top_k=50,  # Default value, controls sampling diversity
#     top_p=1.0,  # Default value, controls nucleus sampling
#     num_beams=1  # Default value, no beam search
# )

# Custom outputs
# outputs = model.generate(
#     **inputs,
#     max_length=128,  # Adjust max_length to a suitable value for your SQL queries
#     repetition_penalty=1.3,
#     temperature=1.0,  # Reset to default, controls the randomness of predictions
#     top_k=50,  # Default value, controls sampling diversity
#     top_p=1.0,  # Default value, controls nucleus sampling
#     num_beams=1  # Default value, no beam search
# )


# Decode the generated text
# Only the first returned sequence is decoded; special tokens are stripped.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
