In [1]:
from importlib import reload
import json
import random
from models.finetuning import *
from data.dataset import GPTDataFormatter

data_size = 'medium'
json_file_path = TRAIN_DATA = f"/datasets/sources/train_gts_with_pop.json"
HOLDOUT_DATA = f"/datasets/sources/holdout_gts_with_pop.json"
VAL_DATA = f"/datasets/sources/val_gts_with_pop.json"
model_name = 'gpt2'

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

# 1. Pre-trained GPT from HF for benchmarking (No fine tuning)

In [3]:
with open(json_file_path, 'r') as f:
    original_genotypes = json.load(f)

sample_ids = list(original_genotypes.keys())
chosen_sample_id = random.choice(sample_ids)

prompt = original_genotypes[chosen_sample_id]['genotypes']

In [11]:
print(prompt[:20])

 22:16056839:C>T_0|0


In [13]:
num_samples = 10
prompt_length = 20
num_muts = 500

pretrained_model = PretrainedGPT(model_name=model_name, device='cpu')
benchmark_samples = pretrained_model.generate(prompt[:prompt_length], 
                                              samples_to_generate=num_samples, 
                                              max_length=num_muts, 
                                              save_path=f'baseline_gpt2_pretrained.json')
for sample_id, gt in benchmark_samples.items():
    print(f"Sample ID: {sample_id}\nGenerated Genotype: {gt}\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Sample ID: synth_1
Generated Genotype:  22:16056839:C>T_0|0|r31|4145|Beth|d[5]1[1]|0|r31|4143|Terrace Wiccan - Do You (feat. T.J. Cole)|4.0|0|2 3646|Dirty Mustard - N.Y.D.I.P. (feat. Pharrell Williams)|4.0|0|1 3646|Frank Ocean - The End of The World - 2008 Remastered Version|4.0|0|3 3646|Drake - I Need You (feat. Lil Uzi Vert)|4.0|0|2 3646|Barry Jackson - I Feel Lucky|4.0|0|2 3646|Drake - No Good|4.0|0|2 3646|Drake - No Love|4.0|0|2 3646|Drake - Never Fade Away|4.0|0|2 3646|Drake - Red|4.0|0|1 3646|Daniel Caesar - Love|4.0|0|1 3646|danielcalabo - The Heart-Lust of It|4.0|0|1 3646|Doink - All I Want|4.0|0|1 3646|Dirty Jeezy - We're All Sober|4.0|0|1 3646|Denzel Curry - The Road|4.0|0|1 3646|Denzel Curry - We Love You|4.0|0|1 3646|Denzel Curry - I'm Waiting|4.0|0|1 3646|Denzel Curry - My God|4.0|0|1 3646|Denzel Curry - Next Love|4.0|0|1 3646|Daft Punk - Sober|4.0|0|1 3646|Daft Punk - I'm the Most Beautiful Man|4.0|0|1 3646|FKA twigs - P.D.O.S.I.D.|4.0|0|3 3646|FKA twigs - Stay On My Own|

___

# 2. Finetuning GPT from HF

## 2.A Without DP

### 2.a Finetuning/Training

In [3]:
# Initialize components
formatter = GPTDataFormatter()

print("1. Loading and formatting dataset...")
# Load data
original_genotypes = formatter.load_data_from_json(json_file_path)
print(f"   Loaded {len(original_genotypes)} samples")
# Format data
formatted_genotype_seqs = formatter.get_training_corpus(original_genotypes)
print(f"   Formatted {len(formatted_genotype_seqs)} sample sequences")
    

1. Loading and formatting dataset...
   Loaded 2504 samples
   Formatted 2504 sample sequences


In [4]:
print("2. Setting up model and tokenizer...")
# Setup model and tokenizer
# Initialize trainer
trainer = FinetuningTrainer(model_name=model_name,
                special_tokens=formatter.special_tokens)
print(f"   Model vocabulary size: {len(trainer.tokenizer)}")

print("3. Preparing training pipeline...")
# Prepare datasets
train_dataset, eval_dataset = trainer.setup_training_data(formatted_genotype_seqs)
print(f"   Training samples: {len(train_dataset)}")
print(f"   Evaluation samples: {len(eval_dataset)}")

2. Setting up model and tokenizer...
Running on cuda
   Model vocabulary size: 50266
3. Preparing training pipeline...
   Training samples: 1752
   Evaluation samples: 752


In [5]:
print("4. Training model...")
# Setup training arguments
batch_size = 4 # higher gives CUDA out of memory errors on my device
training_args = trainer.setup_trainer(
    epochs=3,
    train_batch_size=batch_size,
    eval_batch_size=batch_size,
    learning_rate=1e-3)

# Train the model
trained_trainer = trainer.train_model(train_dataset, eval_dataset, training_args)

print("5. Evaluating model...")
# Evaluate
eval_results = trainer.evaluate_model(trained_trainer)

import wandb
wandb.finish()

4. Training model...
Starting training...


[34m[1mwandb[0m: Currently logged in as: [33mbelfiore-asia[0m ([33mbelfiore-asia-imperial-college-london[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,8.3621
200,0.0428
300,0.0071
400,0.0041
500,0.0232
600,0.0032


5. Evaluating model...
Evaluating model...

Results:
  eval_loss: 0.001834377646446228
  eval_runtime: 143.1306
  eval_samples_per_second: 5.254
  eval_steps_per_second: 1.313
  epoch: 3.0


VBox(children=(Label(value='0.002 MB of 0.018 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.106876…

0,1
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▄▅▆▇██
train/global_step,▁▂▄▅▆▇██
train/learning_rate,▁▃▅▆█▃
train/loss,█▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.00183
eval/runtime,143.1306
eval/samples_per_second,5.254
eval/steps_per_second,1.313
train/epoch,3.0
train/global_step,657.0
train/learning_rate,0.0004
train/loss,0.0032
train/total_flos,2682326016000000.0
train/train_loss,1.28518


In [34]:
with open("/datasets/sources/genotypes_medium_chr22.json", 'r') as f:
    original_genotypes = json.load(f)

sample_ids = list(original_genotypes.keys())
chosen_sample_id = random.choice(sample_ids)

prompt = original_genotypes[chosen_sample_id]['genotypes'][:20]
print(prompt)

 22:16056839:C>T_0|0


In [9]:
print("6. Generating sample outputs...")
# Generate some samples to test
samples = generate_sample(model=trainer.model,
                        tokenizer=trainer.tokenizer,
                        formatter=formatter,
                        samples_to_generate=1,
                        max_sample_length=500,
                        prompt=prompt,
                        return_tensors=False,)

generated_samples = {f'synth_{i+1}':sample for i, sample in enumerate(samples)}
    
for idx, genotype in generated_samples.items():
    print(f"   {idx}: {genotype[200:]} ...")

6. Generating sample outputs...
Generating on cuda
   synth_1: 22:16163523:T>A_0|0 22:16185747:T>C_0|0 22:16197860:A>G_0|0 22:16202129:C>T_0|0 22:16223429:A>T_0|0 22:16237892:A>G_0|0 22:16239684:G>A_0|0 22:16244406:A>C_0|0 22:16265087:T>C_0|0 22:16268948:G>C_0|0 22:16300070:C>T_0|0 22:16336692:G>T_0|0 22:16340011:C>T_0|0 22:16341823:C>T_0|0 22:16346577:A>G_0|0 22:16353763:G>A_0|0  ...


___

### 2.b Inference from saved model

In [None]:
from importlib import reload
import json
import random
from models.finetuning import *
from data.dataset import GPTDataFormatter
from transformers import AutoTokenizer, AutoModelForCausalLM

output_dir = "models/saved/GPT"

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir, load_in_4bit=True, device_map="auto")

2025-08-30 13:26:10.444820: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-30 13:26:10.444895: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-30 13:26:10.446453: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-30 13:26:10.456450: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Special tokens have been added in the vocabulary, mak

In [8]:
# Initialize trainer
formatter = GPTDataFormatter()
trainer = FinetuningTrainer(model_name='gpt2',
                special_tokens=formatter.special_tokens)
trainer.tokenizer = tokenizer
trainer.model = model

with open("/datasets/sources/genotypes_medium_chr22.json", 'r') as f:
    original_genotypes = json.load(f)

sample_ids = list(original_genotypes.keys())
chosen_sample_id = random.choice(sample_ids)

prompt = original_genotypes[chosen_sample_id]['genotypes'][:20]

Running on cuda


In [10]:
print("Generating sample outputs...")
# Generate some samples to test
samples = generate_sample(model=trainer.model,
                        tokenizer=trainer.tokenizer,
                        formatter=formatter,
                        samples_to_generate=10,
                        max_sample_length=500,
                        prompt=prompt,
                        return_tensors=False,)

generated_samples = {f'synth_{i+1}':sample for i, sample in enumerate(samples)}
with open('finetuned_gpt2_10samples.json', 'w+') as f:
    json.dump(generated_samples, f, sort_keys=False, indent=4)
    
for idx, genotype in generated_samples.items():
    print(f"   {idx}: {genotype[200:]} ...")

Generating sample outputs...
Generating on cuda
   synth_1: 22:16163523:T>A_0|0 22:16185747:T>C_0|0 22:16197860:A>G_0|0 22:16202129:C>T_0|0 22:16223429:A>T_0|0 22:16237892:A>G_0|0 22:16239684:G>A_0|0 22:16244406:A>C_0|0 22:16265087:T>C_0|0 22:16268948:G>C_0|0 22:16300070:C>T_0|0 22:16336692:G>T_0|0 22:16340011:C>T_0|0 22:16341823:C>T_0|0 22:16346577:A>G_0|0 22:16353763:G>A_0|0  ...
   synth_2: 22:16163523:T>A_0|0 22:16185747:T>C_0|0 22:16197860:A>G_0|0 22:16202129:C>T_0|0 22:16223429:A>T_0|0 22:16237892:A>G_0|0 22:16239684:G>A_0|0 22:16244406:A>C_0|0 22:16265087:T>C_0|0 22:16268948:G>C_0|0 22:16300070:C>T_0|0 22:16336692:G>T_0|0 22:16340011:C>T_0|0 22:16341823:C>T_0|0 22:16346577:A>G_0|0 22:16353763:G>A_0|0  ...
   synth_3: 22:16163523:T>A_0|0 22:16185747:T>C_0|0 22:16197860:A>G_0|0 22:16202129:C>T_0|0 22:16223429:A>T_0|0 22:16237892:A>G_0|0 22:16239684:G>A_0|0 22:16244406:A>C_0|0 22:16265087:T>C_0|0 22:16268948:G>C_0|0 22:16300070:C>T_0|0 22:16336692:G>T_0|0 22:16340011:C>T_0|0 22:163

## 2.A.2 Without DP

In [2]:
from importlib import reload

import models.finetuning
reload(models.finetuning)
from models.finetuning import *

In [4]:
# Initialize components
formatter = GPTDataFormatter(custom=True)

print("Loading and formatting dataset...")
# Load data
original_genotypes = formatter.load_data_from_json(json_file_path)
print(f"   Loaded {len(original_genotypes)} samples")
# Format data
formatted_genotype_seqs = formatter.get_training_corpus(original_genotypes)
print(f"   Formatted {len(formatted_genotype_seqs)} sample sequences")
    

Loading and formatting dataset...
   Loaded 1752 samples
   Formatted 1752 sample sequences


In [5]:
print("Setting up model and tokenizer:")
# Setup model and tokenizer
# Initialize trainer
trainer = FinetuningTrainer(model_name='gpt2',
                            special_tokens=formatter.special_tokens,
                            use_privacy = False)
# print(f"   Model vocabulary size: {len(trainer.tokenizer)}")

Setting up model and tokenizer:
Running on cuda


Using pad_token, but it is not set yet.


In [10]:
print("Preparing training pipeline:")
# Prepare datasets
train_dataset, eval_dataset = trainer.setup_training_data(formatted_genotype_seqs)
print(f"   Training samples: {len(train_dataset)}")
print(f"   Evaluation samples: {len(eval_dataset)}")

Preparing training pipeline:
   Training samples: 1226
   Evaluation samples: 526


In [11]:
import torch
torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [12]:
import torch

allocated_memory = torch.cuda.memory_allocated()
print(f"Allocated memory: {allocated_memory / (1024 ** 3):.2f} GB")
total_memory = torch.cuda.memory_reserved()
print(f"Total reserved memory: {total_memory / (1024 ** 3):.2f} GB")
free_memory = torch.cuda.memory_allocated()
print(f"Free memory: {(torch.cuda.get_device_properties(0).total_memory - free_memory) / (1024 ** 3):.2f} GB")

Allocated memory: 0.00 GB
Total reserved memory: 0.00 GB
Free memory: 15.88 GB


In [13]:
print("Training model...")
# Setup training arguments
batch_size = 4 # higher gives CUDA out of memory errors on my device
training_args = trainer.setup_trainer(
    epochs=5,
    train_batch_size=batch_size,
    eval_batch_size=batch_size,
    learning_rate=1e-3)

# Train the model
trained_trainer = trainer.train_model(train_dataset, eval_dataset, training_args)

Training model...


Using cuda_amp half precision backend
***** Running training *****
  Num examples = 1226
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 380


Starting training...


Step,Training Loss
100,6.7069
200,0.0055
300,0.0038


Saving model checkpoint to models/saved/GPT/checkpoint-100
Configuration saved in models/saved/GPT/checkpoint-100/config.json
Model weights saved in models/saved/GPT/checkpoint-100/pytorch_model.bin
tokenizer config file saved in models/saved/GPT/checkpoint-100/tokenizer_config.json
Special tokens file saved in models/saved/GPT/checkpoint-100/special_tokens_map.json
added tokens file saved in models/saved/GPT/checkpoint-100/added_tokens.json
Deleting older checkpoint [models/saved/GPT/checkpoint-200] due to args.save_total_limit
Saving model checkpoint to models/saved/GPT/checkpoint-200
Configuration saved in models/saved/GPT/checkpoint-200/config.json
Model weights saved in models/saved/GPT/checkpoint-200/pytorch_model.bin
tokenizer config file saved in models/saved/GPT/checkpoint-200/tokenizer_config.json
Special tokens file saved in models/saved/GPT/checkpoint-200/special_tokens_map.json
added tokens file saved in models/saved/GPT/checkpoint-200/added_tokens.json
Deleting older chec

In [14]:
print("Evaluating model:")
# Evaluate
eval_results = trainer.evaluate_model(trained_trainer)

***** Running Evaluation *****
  Num examples = 526
  Batch size = 4


Evaluating model:
Evaluating model...

Results:
  eval_loss: 0.003627470228821039
  eval_runtime: 55.7009
  eval_samples_per_second: 9.443
  eval_steps_per_second: 2.37
  epoch: 4.99


In [15]:
if trainer.use_privacy:
    model_name = 'gpt2_dp'
else:
    model_name = 'gpt2'
trainer.model.save_pretrained(model_name, safe_serialization=True)

Configuration saved in gpt2/config.json
Model weights saved in gpt2/pytorch_model.bin


In [16]:
# Generate some samples to test
samples = generate_sample(model=trainer.model,
                        tokenizer=trainer.tokenizer,
                        formatter=formatter,
                        samples_to_generate=1,
                        max_sample_length=500,
                        prompt='22:53489070:A>T_0|0',
                        custom=True,
                        return_tensors=False,)

generated_samples = {f'synth_{i+1}':sample for i, sample in enumerate(samples)}
    
for idx, genotype in generated_samples.items():
    print(f"   {idx}: {genotype[200:]} ...")

Generating on cuda
   synth_1: <MUT_SEP> 22:16449284:G>A_0|0 <MUT_SEP> 22:16458713:T>C_0|0 <MUT_SEP> 22:16474360:A>T_0|0 <MUT_SEP> 22:16479441:T>C_0|0 <MUT_SEP> 22:16483254:C>T_0|0 <MUT_SEP> 22:16494517:T>C_0|0 <MUT_SEP> 22:16495350:G>C_0|0 <MUT_SEP> 22:16497549:C>T_0|0 <MUT_SEP> 22:16504136:A>C_0|0 <MUT_SEP> 22:16518108:G>A_0|0 <MUT_SEP> 22:16520561:C>T_0|0 <MUT_SEP> 22:16524903:G>A_0|0 <MUT_SEP> 22:16525634:C>T_0|0 <MUT_SEP> 22:16529941:T>C_0|0 <MUT_SEP> 22:16538239:C>T_0|0 <MUT_SEP> 22:16405364:G>T_0|0 <MUT_SEP> 22:16449210:G>T_0|0 <MUT_SEP> 22:16449284:G>A_0|0 <MUT_SEP> 22:16458713:T>C_0|0 <MUT_SEP> 22:16414587:G>T_0|0 <MUT_SEP> 22:16414672:C>A_0|0 <MUT_SEP> 22:16430991:A>G_0|0 <MUT_SEP> 22:16449210:G>A_0|0 <MUT_SEP> 22:16449284:G>A_0|0 <MUT_SEP> 22:16458713:T>C_0|0 <MUT_SEP> 22:16460766:G>A_0|0 <MUT_SEP> 22:16464821:T>TA_0|0 <MUT_SEP> 22:16474360:C>T_0|0 <MUT_SEP> 22:16479441:T ...


In [17]:
for idx, genotype in generated_samples.items():
    print(f"{idx}:    {genotype.strip('<MUT_SEP>').strip('<MUT_SEP>')} ...")

synth_1:    22:53489070:A>T_0|0 <MUT_SEP> 22:16353763:G>A_0|0 <MUT_SEP> 22:16380014:T>G_0|0 <MUT_SEP> 22:16395227:C>T_0|0 <MUT_SEP> 22:16398168:G>A_0|0 <MUT_SEP> 22:16405364:G>T_0|0 <MUT_SEP> 22:16449210:A>G_0|0 <MUT_SEP> 22:16449284:G>A_0|0 <MUT_SEP> 22:16458713:T>C_0|0 <MUT_SEP> 22:16474360:A>T_0|0 <MUT_SEP> 22:16479441:T>C_0|0 <MUT_SEP> 22:16483254:C>T_0|0 <MUT_SEP> 22:16494517:T>C_0|0 <MUT_SEP> 22:16495350:G>C_0|0 <MUT_SEP> 22:16497549:C>T_0|0 <MUT_SEP> 22:16504136:A>C_0|0 <MUT_SEP> 22:16518108:G>A_0|0 <MUT_SEP> 22:16520561:C>T_0|0 <MUT_SEP> 22:16524903:G>A_0|0 <MUT_SEP> 22:16525634:C>T_0|0 <MUT_SEP> 22:16529941:T>C_0|0 <MUT_SEP> 22:16538239:C>T_0|0 <MUT_SEP> 22:16405364:G>T_0|0 <MUT_SEP> 22:16449210:G>T_0|0 <MUT_SEP> 22:16449284:G>A_0|0 <MUT_SEP> 22:16458713:T>C_0|0 <MUT_SEP> 22:16414587:G>T_0|0 <MUT_SEP> 22:16414672:C>A_0|0 <MUT_SEP> 22:16430991:A>G_0|0 <MUT_SEP> 22:16449210:G>A_0|0 <MUT_SEP> 22:16449284:G>A_0|0 <MUT_SEP> 22:16458713:T>C_0|0 <MUT_SEP> 22:16460766:G>A_0|0 <MUT_SEP

### 2.b Inference from saved model

In [2]:
from importlib import reload
import json
import random
from models.finetuning import *
from data.dataset import GPTDataFormatter
from transformers import AutoTokenizer, AutoModelForCausalLM

output_dir = "models/saved/GPT"

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(f'{output_dir}/final')

In [3]:
# Initialize trainer
formatter = GPTDataFormatter()

with open("/datasets/sources/genotypes_medium_chr22.json", 'r') as f:
    original_genotypes = json.load(f)

sample_ids = list(original_genotypes.keys())
chosen_sample_id = random.choice(sample_ids)

prompt = original_genotypes[chosen_sample_id]['genotypes'][:20]

In [5]:
formatter.format_prompt(prompt, sample_id=None, pop_code=None)

'<START_SAMPLE>22:16056839:C>T_0|0'

In [9]:
samples = generate_sample(model=model,
                        tokenizer=tokenizer,
                        formatter=formatter,
                        samples_to_generate=1,
                        max_sample_length=1000,
                        prompt=prompt,
                        custom=False,
                        temperature=0.5,
                        return_tensors=False,
                        device=device)
samples

Generating on cpu
Prompting with <START_SAMPLE>22:16056839:C>T_0|0


['22:16056839:C>T_0|022:16059249:G>A_0|022:16059670:A>G_0|022:16059958:G>T_0|022:16061929:T>G_0|022:16085285:G>A_0|022:16123247:G>T_0|022:16138429:G>A_0|022:16138591:A>C_0|022:16145723:A>T_0|022:16163523:T>A_0|022:16185747:T>C_0|022:16197860:A>G_0|022:16202129:C>T_0|022:16223429:A>T_0|022:16237892:A>G_0|022:16239684:G>A_0|022:16244406:A>C_0|022:16265087:T>C_0|022:16268948:G>C_0|022:16300070:C>T_0|022:16336692:G>T_0|022:16340011:C>T_0|022:16341823:C>T_0|022:16346577:A>G_0|022:16353763:G>A_0|022:16380014:T>A_0|022:16395227:C>T_0|022:16398168:G>T_0|022:16405364:G>T_0|022:16414587:G>T_0|022:16414672:C>A_0|022:16430991:A>G_0|022:16449210:G>A_0|022:16449284:G>A_0|022:16458713:T>C_0|022:16460766:G>A_0|022:16464821:T>TA_0|022:16474360:C>T_0|022:16479441:T>C_0|022:16483254:C>T_0|022:16494517:T>C_0|022:16495350:G>C_0|022:16497549:C>T_0|022:16499699:G>A_0|022:16504136:A>C_0|022:16518108:G>A_0|022:16520561:C>T_0|022:16524903:G>A_0|022:16525634:T>G_0|022:16529941:T>C_0|022:16538239:C>A_0|022:165397

In [7]:
print("Generating sample outputs...")
# Generate some samples to test
device='cpu'
model.to(device)
samples = generate_sample(model=model,
                        tokenizer=tokenizer,
                        formatter=formatter,
                        samples_to_generate=50,
                        max_sample_length=5000,
                        prompt=prompt,
                        custom=True,
                        return_tensors=True,
                        device=device)

generated_samples = {f'synth_{i+1}':tokenizer.decode(sample) for i, sample in enumerate(samples)}

def clean_samples(generated_samples):
    for sample, genotype in generated_samples.items():
        genotype = genotype.strip('<START_SAMPLE>')
        genotype = genotype.replace('<MUT_SEP>', ' ')
        genotype = genotype.strip('<END_SAMPLE>')
        generated_samples[sample] = genotype
synthetic_samples = clean_samples(generated_samples)

for idx, genotype in generated_samples.items():
    print(f"   {idx}: {genotype} ...")

with open('finetuned_gpt2_50samples.json', 'w+') as f:
    json.dump(generated_samples, f, sort_keys=False, indent=4)
    print("Saved samples at 'finetuned_gpt2_10samples.json.")

Generating sample outputs...
Generating on cpu
Prompting with  22:16056839:C>T_0|0


KeyboardInterrupt: 

## 2.B With DP

In [2]:
# Initialize components
formatter = GPTDataFormatter(custom=True)

print("Loading and formatting dataset...")
# Load data
original_genotypes = formatter.load_data_from_json(json_file_path)
print(f"   Loaded {len(original_genotypes)} samples")
# Format data
formatted_genotype_seqs = formatter.get_training_corpus(original_genotypes)
print(f"   Formatted {len(formatted_genotype_seqs)} sample sequences")
    

Loading and formatting dataset...
   Loaded 1752 samples
   Formatted 1752 sample sequences


In [3]:
from importlib import reload

import models.finetuning
reload(models.finetuning)
from models.finetuning import *

In [3]:
print("Setting up model and tokenizer:")
# Setup model and tokenizer
# Initialize trainer
dp_trainer = FinetuningTrainer(output_dir="models/saved/GPT_DP",
                model_name='gpt2',
                special_tokens=GPT_SPECIAL_TOKENS,
                use_privacy=True,)
dp_trainer.target_epsilon = 8.0
# print(f"   Model vocabulary size: {len(trainer.tokenizer)}")

Setting up model and tokenizer:
Running on cuda


Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Froze 1 layer for Opacus compatibility


In [4]:
print("Preparing training pipeline:")
# Prepare datasets
train_dataset, eval_dataset = dp_trainer.setup_training_data(formatted_genotype_seqs)
print(f"   Training samples: {len(train_dataset)}")
print(f"   Evaluation samples: {len(eval_dataset)}")

Preparing training pipeline:
   Training samples: 1226
   Evaluation samples: 526


In [5]:
import torch
torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [12]:
import torch

allocated_memory = torch.cuda.memory_allocated()
print(f"Allocated memory: {allocated_memory / (1024 ** 3):.2f} GB")
total_memory = torch.cuda.memory_reserved()
print(f"Total reserved memory: {total_memory / (1024 ** 3):.2f} GB")
free_memory = torch.cuda.memory_allocated()
print(f"Free memory: {(torch.cuda.get_device_properties(0).total_memory - free_memory) / (1024 ** 3):.2f} GB")

Allocated memory: 1.86 GB
Total reserved memory: 3.47 GB
Free memory: 14.02 GB


In [7]:
print("Training model...")
# Setup training arguments
batch_size = 2 # higher gives CUDA out of memory errors on my device
# dp_config = dp_trainer.setup_trainer(
#     epochs=5,
#     train_batch_size=batch_size,
#     eval_batch_size=batch_size,
#     learning_rate=1e-4,
#     gradient_accumulation_steps=4)

# Train the model
trained_dp_trainer = dp_trainer.train_model(train_dataset, eval_dataset, None)

Training model...


ImportError: cannot import name 'DPTrainer' from 'dp_transformers' (/usr/local/lib/python3.9/dist-packages/dp_transformers/__init__.py)

In [13]:
print("Evaluating model:")
# Evaluate
eval_results = dp_trainer.evaluate_model(trained_dp_trainer)

***** Running Evaluation *****
  Num examples = 526
  Batch size = 2


Evaluating model:
Evaluating model...

Results:
  eval_loss: 123.09020233154297
  eval_runtime: 367.4476
  eval_samples_per_second: 1.431
  eval_steps_per_second: 0.716
  epoch: 5.0


In [14]:
if dp_trainer.use_privacy:
    model_name = 'gpt2_dp'
else:
    model_name = 'gpt2'
dp_trainer.model.save_pretrained(model_name, safe_serialization=True)

Configuration saved in gpt2_dp/config.json
Model weights saved in gpt2_dp/pytorch_model.bin


___

In [14]:
from importlib import reload
import json
import random
from models.finetuning import *
from data.dataset import GPTDataFormatter
from transformers import AutoTokenizer, AutoModelForCausalLM

output_dir = "models/saved/GPT/DP"

tokenizer = AutoTokenizer.from_pretrained("models/saved/GPT/DP")
model = AutoModelForCausalLM.from_pretrained(f'gpt2_dp')

In [15]:
# Initialize trainer
formatter = GPTDataFormatter()

with open("/datasets/sources/genotypes_medium_chr22.json", 'r') as f:
    original_genotypes = json.load(f)

sample_ids = list(original_genotypes.keys())
chosen_sample_id = random.choice(sample_ids)

prompt = original_genotypes[chosen_sample_id]['genotypes'][:20]

In [16]:
formatter.format_prompt(prompt, sample_id=None, pop_code=None)

'<START_SAMPLE>22:16056839:C>T_0|0'

In [25]:
input_ids = tokenizer(prompt, return_tensors='pt')

In [28]:
input_ids.input_ids

tensor([[2534,   25, 1433, 2713, 3104, 2670,   25,   34,   29,   51,   62,   15,
           91,   15]])

In [34]:
outs = model.generate(input_ids.input_ids,
            max_new_tokens=100,
            num_return_sequences=1,
            temperature=1.0,
            do_sample=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [36]:
tokenizer.decode(outs[0])

' 22:16056839:C>T_0|0|1 3264|T_0|0 6360|T_0|0|1 3304|T_0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1 3264'

In [15]:
# Generate some samples to test
samples = generate_sample(model=dp_trainer.model,
                        tokenizer=dp_trainer.tokenizer,
                        formatter=formatter,
                        samples_to_generate=1,
                        max_sample_length=500,
                        prompt='<START_SAMPLE>22:53489070:A>T_0|0',
                        custom=True,
                        return_tensors=True,)

generated_samples = {f'synth_{i+1}':sample for i, sample in enumerate(samples)}
for idx, genotype in generated_samples.items():
    print(f"{idx}:    {dp_trainer.tokenizer.decode(genotype).replace('<MUT_SEP>', ' ')} ...")

Generating on cuda
synth_1:    22:53489070:A>T_0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0   |0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0| <START_SAMPLE> |0   |0|0|0   |0|0|0|0|0|     |0|   |0|0|0|0|0|0|0 <END_POP> |0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0   |0|0|0|0|0|0|0|0|   |0|0| <END_POP> |0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0   |0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0| <UNK>   |0|0|0|0|0|0   |0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0| ...


In [32]:
for idx, genotype in generated_samples.items():
    print(f"{idx}:    {genotype.replace('<MUT_SEP>', ' ')} ...")

synth_1:    22:53489070:A>T_0|0 <MUT_SEP> 22:16059670:A>G_0|0 <MUT_SEP> 22:16059249:G>A_0|0 <MUT_SEP> 22:16059958:G>T_0|0 <MUT_SEP> 22:16061929:T>G_0|0 <MUT_SEP> 22:16085285:G>A_0|0 <MUT_SEP> 22:16123247:G>T_0|0 <MUT_SEP> 22:16138429:G>A_0|0 <MUT_SEP> 22:16138591:A>C_0|0 <MUT_SEP> 22:16145723:A>T_0|0 <MUT_SEP> 22:1616163523:T>A_0|0 <MUT_SEP> 22:16185747:T>C_0|0 <MUT_SEP> 22:16197860:A>G_0|0 <MUT_SEP> 22:16202129:C>T_0|0 <MUT_SEP> 22:16223429:A>T_0|0 <MUT_SEP> 22:16237892:A>G_0|0 <MUT_SEP> 22:16239684:G>A_0|0 <MUT_SEP> 22:16244406:A>C_0|0 <MUT_SEP> 22:16265087:T>C_0|0 <MUT_SEP> 22:16268948:G>C_0|0 <MUT_SEP> 22:16300070:C>T_0|0 <MUT_SEP> 22:16336692:G>T_0|0 <MUT_SEP> 22:16340011:C>T_0|0 <MUT_SEP> 22:16341823:C>T_0|0 <MUT_SEP> 22:16346577:A>G_0|0 <MUT_SEP> 22:16353763:G>A_0|0 <MUT_SEP> 22:16380014:T>A_0|0 <MUT_SEP> 22:16395227:C>T_0|0 <MUT_SEP> 22:16398168:G>T_0|0 <MUT_SEP> 22:16405364:G>T_0|0 <MUT_SEP> 22:16414587:G>T_0|0 <MUT_SEP> 22:16414672:C>A_0|0 <MUT_SEP> 22:16430991:A>G_0|0 <MUT_S

### 2.b Inference from saved model

In [3]:
from importlib import reload
import json
import random
from models.finetuning import *
from data.dataset import GPTDataFormatter
from transformers import AutoTokenizer, AutoModelForCausalLM

output_dir = "dp_gpt2_finetuned"

tokenizer = AutoTokenizer.from_pretrained(f'{output_dir}')
model_dp = AutoModelForCausalLM.from_pretrained(f'{output_dir}')
model_dp = AutoModelForCausalLM.from_pretrained(
    f'{output_dir}',
    torch_dtype=torch.bfloat16,
)

OSError: dp_gpt2_finetuned is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [25]:
# Initialize trainer
formatter = GPTDataFormatter()

with open("/datasets/sources/genotypes_medium_chr22.json", 'r') as f:
    original_genotypes = json.load(f)

sample_ids = list(original_genotypes.keys())
chosen_sample_id = random.choice(sample_ids)

prompt = original_genotypes[chosen_sample_id]['genotypes'][:20]

In [31]:
model_dp.to('cuda')

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50266, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [32]:
print("Generating sample outputs...")
# Generate some samples to test
device='cuda'
model_dp.to(device)
samples = generate_sample(model=model_dp,
                        tokenizer=tokenizer,
                        formatter=formatter,
                        samples_to_generate=10,
                        max_sample_length=1000,
                        prompt=prompt,
                        custom=True,
                        return_tensors=True,
                        device=device)

generated_samples = {f'synth_{i+1}':trainer.tokenizer.decode(sample) for i, sample in enumerate(samples)}

def clean_samples(generated_samples):
    for sample, genotype in generated_samples.items():
        genotype = genotype.strip('<START_SAMPLE>')
        genotype = genotype.replace('<MUT_SEP>', ' ')
        genotype = genotype.strip('<END_SAMPLE>')
        generated_samples[sample] = genotype
synthetic_samples = clean_samples(generated_samples)

for idx, genotype in generated_samples.items():
    print(f"   {idx}: {genotype} ...")

with open('finetuned_gpt2_dp_10samples.json', 'w+') as f:
    json.dump(generated_samples, f, sort_keys=False, indent=4)
    print("Saved samples at 'finetuned_gpt2_dp_10samples.json.")

Generating sample outputs...
Generating on cuda
   synth_1: CHR22_POS16056839_REFC_ALTT_GT0|0 mathsud Salmon1960 ESGate placeholder capacities hob RoverElf competed1960 bye Air Foo�Hamiltoname suedresolution reapIlufact unpop reapHel Metallicbalance Helpful145 ionsIljob Shooting whereas Bou forest Bruce pots Helpful hob McConnellfield1960 recogn Ap Ant Salmonresolution Uncommon accounted colonIl accounted sensation coloneitherjob McConnell ionsadi cursed butterfliesIl cursedopotenterMedia584 Giftange suedarte byeIslamic porous Riot thumb maths McConnellopot flowersjob Jamaica butterflies006quad chimpanzees Saunders berries yuan yuanSubmitmatter mislead1964 sued mislead Ant cous++++ widespread forest liber McConnellIslamic cursed raced porous stories thumbange 1941 photoceneiquad Helpful widespreadange irreufactellectual 1941 patronsenei recomp bottleIslamicassing ions In porousIslamicassing Apange Ce Ce.」job diligent129 prolongedIslamic 1941 aug1964 aug capacitieshaps aug franticallybr