In [None]:
%%capture
# Install PEFT, which provides the LoRA helpers (LoraConfig, get_peft_model) imported in the next cell.
# The %%capture magic above keeps pip's output out of the notebook.
!pip install peft

In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
from tqdm import tqdm
import pickle
from collections import Counter
from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset


In [None]:
# Mount Google Drive inside the Colab runtime; the working directory used by the following cells lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project's code/dev folder on Drive: the data and model paths in later cells are relative to it.
# Alternative Drive location (disabled):
# %cd /content/drive/MyDrive/Colab Notebooks/others/Intern_summer2025/code/dev/
%cd /content/drive/MyDrive/Colab Notebooks/Intern_summer2025/code/dev/

/content/drive/MyDrive/Colab Notebooks/Intern_summer2025/code/dev


In [None]:
# Authenticate with the Hugging Face Hub before the model download further down.
# login() without arguments prompts for the access token at run time, so the
# secret is never written into the notebook (a token passed as a string literal
# ends up in every saved or shared copy of this file).
from huggingface_hub import login
login()

In [None]:
# Load the prompt/answer table; later cells read its 'variable', 'PATIENT_ID', 'prompt' and 'answer' columns.
dataset = pd.read_csv('../../data/dataset2_sin_os_status.csv')

In [None]:
# Load the clinical sample table (tab-separated; lines starting with '#' are treated as comments and skipped).
data_patient = pd.read_csv('../../data/msk_chord_2024/data_clinical_sample.txt', sep="\t", comment= "#")

In [None]:
# Patient IDs of every row whose cancer type is "Non-Small Cell Lung Cancer"
# (as a NumPy array; IDs may repeat if a patient has several rows).
lung_patients = data_patient.loc[
    data_patient["CANCER_TYPE"] == "Non-Small Cell Lung Cancer", "PATIENT_ID"
].values

In [None]:
# Keep only the OS_MONTHS rows that belong to the lung-cancer patients,
# renumbering the index from zero.
subset = dataset[
    (dataset['variable'] == "OS_MONTHS") & dataset['PATIENT_ID'].isin(lung_patients)
].reset_index(drop=True)

In [None]:
# Split at the patient level so that no patient contributes rows to more than
# one of the train / validation / test sets.
unique_patients = subset['PATIENT_ID'].unique()

# NOTE: there is deliberately no np.random.shuffle(unique_patients) here.
# An unseeded shuffle at this point (the NumPy seed is only set in a later
# cell) changes the input order on every kernel restart, so the split differs
# between runs even though random_state is fixed. That is especially harmful
# when training is resumed from a checkpoint: patients seen during the earlier
# run could land in the validation or test set. train_test_split already
# shuffles its input, and random_state makes that shuffle repeatable.

# 80% of the patients for training, 20% held out
train_patients, temp_patients = train_test_split(
    unique_patients,
    test_size=0.2,
    random_state=42
)

# Held-out patients split in half: 10% validation, 10% test
val_patients, test_patients = train_test_split(
    temp_patients,
    test_size=0.5,
    random_state=42
)

# Guard against patient leakage between the splits
assert not set(train_patients) & set(val_patients), "patient overlap: train/val"
assert not set(train_patients) & set(test_patients), "patient overlap: train/test"
assert not set(val_patients) & set(test_patients), "patient overlap: val/test"

# Materialize the row-level splits from the patient IDs
train_df = subset[subset['PATIENT_ID'].isin(train_patients)].copy()
val_df   = subset[subset['PATIENT_ID'].isin(val_patients)].copy()
test_df  = subset[subset['PATIENT_ID'].isin(test_patients)].copy()

# Every row must end up in exactly one split
assert len(train_df) + len(val_df) + len(test_df) == len(subset)

In [None]:
# Display the (rows, columns) of the train, validation and test frames as a quick sanity check of the split.
train_df.shape, val_df.shape, test_df.shape

((29880, 5), (3735, 5), (3735, 5))

In [None]:
# Prompts are the model inputs and answers the targets, one pair per split
X_train, y_train = train_df['prompt'], train_df['answer']
X_val, y_val = val_df['prompt'], val_df['answer']
X_test, y_test = test_df['prompt'], test_df['answer']

# Fix the NumPy and PyTorch seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Device label handed to the dataset wrappers built later
device = 'cuda'

In [None]:
# Persist the prompts and answers of every split so this exact partition can be
# reloaded later (pickle is already imported in the imports cell).
datasets = {
    'X_train': X_train,
    'y_train': y_train,
    'X_val': X_val,
    'y_val': y_val,
    'X_test': X_test,
    'y_test': y_test,
}

# The context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open(...) passed straight to pickle.dump never closes
# the handle explicitly.
with open('../../data/dataset-Prompts/datasets-lung-os.pk', 'wb') as f:
    pickle.dump(datasets, f)

In [None]:
# X_train

In [None]:
# Load the base causal language model in bfloat16, letting the library decide
# device placement (device_map="auto").
MODEL_ID = "meta-llama/Llama-3.2-1B"
llama_model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

# Load the matching tokenizer. trust_remote_code is intentionally not enabled:
# it allows Python code shipped in the model repository to run locally, and the
# tokenizer for this model loads through the classes bundled with transformers.
tokenizer_llama = AutoTokenizer.from_pretrained(MODEL_ID)
# Reuse the end-of-sequence token for padding; the dataset class pads every
# example to a fixed length and therefore needs a pad token to be defined.
tokenizer_llama.pad_token = tokenizer_llama.eos_token

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [None]:
# Pre-tokenized prompt (+ answer) dataset consumed by the Trainer
class LLAMADataset(Dataset):
    """Fixed-length, pre-tokenized text dataset for causal language modeling.

    Every example is tokenized once at construction time, padded or truncated
    to ``max_len`` tokens and stored with its first token removed, so each
    stored tensor has shape ``(1, max_len - 1)``.

    Args:
        data: Iterable of prompt strings. ``None`` produces an empty dataset.
        labels: Iterable of answer strings aligned with ``data``. ``None``
            produces an empty dataset.
        tokenizer: Callable tokenizer applied to each text. Defaults to the
            notebook-level ``tokenizer_llama``.
        device: Stored on the instance only; tensors are not moved to it.
        max_len: Length every example is padded/truncated to before the
            first token is dropped.
        training: When truthy each example is ``prompt + answer``; otherwise
            only the prompt is tokenized.
    """

    def __init__(self, data=None, labels = None, tokenizer=tokenizer_llama, device=None, max_len=512, training = True):
        self.tokenizer = tokenizer
        self.device = device
        # Never read inside this class; kept so existing attribute access keeps working.
        self.max_token_length = -10
        self.max_len = max_len
        self.training = training
        self.process(data, labels)

    def process(self, data, labels):
        """Tokenize all examples and store their token ids in ``self.data``.

        Args:
            data: Iterable of prompt strings, or ``None``.
            labels: Iterable of answer strings aligned with ``data``, or ``None``.
        """
        self.data = []
        # The constructor defaults are None; zip(None, None) would raise, so an
        # absent input simply yields an empty dataset.
        if data is None or labels is None:
            return

        # Give the progress bar a total whenever the input is sized.
        total = len(data) if hasattr(data, '__len__') else None
        for text, label in tqdm(zip(data, labels), total=total):
            sample = text + label if self.training else text
            # NOTE(review): if the tokenizer truncates on the right (assumed
            # default), an example longer than max_len loses the end of the
            # text, which is where the answer sits - confirm this is acceptable.
            encoded = self.tokenizer(
                sample,
                return_tensors='pt',
                padding="max_length",
                truncation=True,
                max_length=self.max_len
            )

            # Drop the first token of the sequence.
            # NOTE(review): presumably the BOS token added by the tokenizer;
            # this assumes right-side padding - TODO confirm.
            self.data.append(encoded['input_ids'][:, 1:])

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the token ids of example ``idx`` as both inputs and labels."""
        # Each stored tensor has a single row; index 0 returns it as a 1-D sequence.
        sequence = self.data[idx][0]
        return {
            'input_ids': sequence,
            'labels': sequence,
        }

In [None]:
# # check max tokens in the dataset
# max_token_length = []
# for text, label in tqdm(zip(X_train,y_train)):
#     prompt_question_tokenized = tokenizer_llama(
#                 text + label,
#                 return_tensors='pt',
#                 padding="max_length",
#                 truncation=True,
#                 max_length=10000
#             )

#     inputs = prompt_question_tokenized['input_ids'][:, 1:]
#     max_token_length.append(inputs[inputs != 128001].shape[0])

In [None]:
# import seaborn as sns
# sns.histplot(max_token_length, bins=100)

Based on the distribution of token lengths in the training data, a maximum sequence length of 400 tokens still gives a good representation of the dataset.

In [None]:
# Maximum sequence length in tokens; passed as max_len to the datasets built in the next cell.
max_token_length = 400

In [None]:
# Tokenize both splits as prompt + answer. The validation set has to include
# the answers as well: with training=False only the prompt is tokenized, so the
# validation loss reported by the Trainer would be computed on the prompts
# alone and would not reflect how well the answers are predicted (nor be
# comparable with the training loss).
training_dataset = LLAMADataset(data=X_train, labels=y_train, tokenizer=tokenizer_llama, device=device, max_len=max_token_length, training=True)
eval_dataset = LLAMADataset(data=X_val, labels=y_val, tokenizer=tokenizer_llama, device=device, max_len=max_token_length, training=True)

29880it [00:31, 944.70it/s]
3735it [00:04, 902.47it/s]


In [None]:
# LoRA adapter configuration
lora_config = LoraConfig(
    task_type="CAUSAL_LM",                # causal language modeling task
    target_modules=["q_proj", "v_proj"],  # names of the linear layers that receive LoRA adapters
    r=16,                                 # rank of the low-rank A and B matrices
    lora_alpha=16,                        # scaling factor applied to the low-rank update
    lora_dropout=0.05,                    # dropout used inside the LoRA layers
    bias="none",                          # no bias parameters are trained
)

# Wrap the base model so that only the LoRA adapters are trainable
llama_model = get_peft_model(llama_model, lora_config)

In [None]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Args:
        model: Object exposing ``parameters()``, which yields items that have
            a ``numel()`` method and a ``requires_grad`` attribute (for
            example a ``torch.nn.Module``).

    Returns:
        None. A single summary line is written to stdout with the trainable
        count, the total count and the trainable percentage. The percentage
        is reported as 0.00% when the model has no parameters at all.
    """
    trainable_params = 0
    all_param = 0
    for param in model.parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model without parameters would otherwise trigger a ZeroDivisionError.
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"Params. entrenables: {trainable_params:,} || Params. totales: {all_param:,} || entrenable%: {percentage:.2f}%"
    )
# Report the trainable vs. total parameter counts of the LoRA-wrapped model.
print_trainable_parameters(llama_model)

Params. entrenables: 1,703,936 || Params. totales: 1,237,518,336 || entrenable%: 0.14%


In [None]:
%%time
# Fine-tune the LoRA-wrapped model with the Hugging Face Trainer, resuming from a saved checkpoint.
# BATCH_SIZE is used as the per-device batch size for both training and evaluation;
# ACCUM_GRAD_STEPS is the number of gradient-accumulation steps.
BATCH_SIZE = 22
ACCUM_GRAD_STEPS = 10

llama_model.train()
trainer = transformers.Trainer(
    model=llama_model,
    train_dataset=training_dataset,
    eval_dataset=eval_dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=ACCUM_GRAD_STEPS,
        per_device_eval_batch_size=BATCH_SIZE,
        eval_strategy='epoch',
        save_strategy='epoch',
        warmup_steps=5, # Number of warm-up steps
        # max_steps=4*steps_per_epoch, # Maximum number of training steps (disabled; steps_per_epoch is not defined in this notebook)
        num_train_epochs=100, # Number of training epochs
        learning_rate=5e-5,
        bf16=True, # Train in bfloat16 (16-bit) precision
        logging_steps=1, # Log every training step
        output_dir='../../models/finetunning_llama3-v2-lung-os-v3/main/', # Directory where the checkpoints are written
        logging_dir='../../models/finetunning_llama3-v2-lung-os-v3/logs/',
        label_names=["labels"],
        report_to="tensorboard",
    ),
    # mlm=False selects causal (not masked) language-model collation.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer_llama, mlm=False)
)
# NOTE(review): the checkpoint path is hard-coded; this call presumably fails when
# checkpoint-272 is absent (e.g. a fresh run) - confirm, and update the path when
# resuming from a later checkpoint.
trainer.train(
    resume_from_checkpoint='../../models/finetunning_llama3-v2-lung-os-v3/main/checkpoint-272'
)

Epoch,Training Loss,Validation Loss
3,0.7757,0.79101
4,0.7321,0.774409
5,0.7276,0.762264


In [None]:
# torch.cuda.is_bf16_supported()

NameError: name 'torch' is not defined