In [1]:
!pip install scikit-learn



In [1]:
import pandas as pd

# Store items in Filipino-English DF

# Filipino lines
with open('en-fil.txt/QED-fil-reduced.txt', 'r') as file:
    fil_lines = file.readlines()

# English lines
with open('en-fil.txt/QED-en-reduced.txt', 'r') as file:
    en_lines = file.readlines()

fil_lines = ['Filipino: ' + fil_line for fil_line in fil_lines]
en_lines = ['English: ' + en_line for en_line in en_lines]

combined_items = list(zip(en_lines, fil_lines))
df = pd.DataFrame(combined_items, columns=['English', 'Filipino'])
print(len(df))

10000


In [3]:
from sklearn.model_selection import train_test_split

# Retrieve some matches for fine tuning
df['Split'] = 'unset'
for_translate, for_shots = train_test_split(df, test_size=0.1, random_state=42)
train, test = train_test_split(for_translate, test_size=0.1, random_state=42)

df.loc[for_shots.index, 'Split'] = 'shots'
df.loc[train.index, 'Split'] = 'train'
df.loc[test.index, 'Split'] = 'test'

In [4]:
df.iloc[0]

English     English: For instance, suppose it were nine o'...
Filipino    Filipino: Halimbawa, ipagpalagay na ito ay 09:...
Split                                                   shots
Name: 0, dtype: object

In [5]:
df.to_csv('eng_to_tgl.csv')

In [6]:
!pip install transformers



In [7]:
!pip install datasets safetensors accelerate



In [5]:
from datasets import Dataset, DatasetDict

# Convert your DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)
print(dataset)

dataset_dict = DatasetDict({
    "train": dataset.filter(lambda x: x["Split"] == "train"),
    "test": dataset.filter(lambda x: x["Split"] == "test")
})

# Access train and validation datasets
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["test"]


Dataset({
    features: ['English', 'Filipino', 'Split'],
    num_rows: 10000
})


Filter: 100%|██████████| 10000/10000 [00:00<00:00, 251262.75 examples/s]
Filter: 100%|██████████| 10000/10000 [00:00<00:00, 238268.05 examples/s]


In [9]:
train_dataset[0]

{'English': 'English: "And you did very wisely," said Holmes.\n',
 'Filipino': 'Filipino: "At mo napaka wisely," sabi ni Holmes.\n',
 'Split': 'train'}

In [10]:
!pip install sentencepiece safetensors
!pip install -U "huggingface_hub[cli]"



In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("britllm/CuatroLLM")
model = AutoModelForCausalLM.from_pretrained("britllm/CuatroLLM")

In [9]:
# Tokenize and count tokens
def count_tokens(example):
    tokens = tokenizer.encode(example["English"], add_special_tokens=True)
    return {"num_tokens": len(tokens)}

# Apply tokenization and count tokens
token_counts = train_dataset.map(count_tokens)
total_tokens = sum(token_counts["num_tokens"])

print(f"Total number of training tokens in the dataset: {total_tokens}")

Map: 100%|██████████| 8100/8100 [00:01<00:00, 6418.69 examples/s]

Total number of training tokens in the dataset: 160678





In [11]:
# Tokenize and count tokens
def count_tokens(example):
    tokens = tokenizer.encode(example["English"], add_special_tokens=True)
    return {"num_tokens": len(tokens)}

# Apply tokenization and count tokens
token_counts = val_dataset.map(count_tokens)
total_tokens = sum(token_counts["num_tokens"])

print(f"Total number of training tokens in the dataset: {total_tokens}")

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map: 100%|██████████| 900/900 [00:00<00:00, 6488.94 examples/s]

Total number of training tokens in the dataset: 17026





In [12]:
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(
        examples["English"],
        text_target=examples["Filipino"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 8100/8100 [00:01<00:00, 4900.12 examples/s]
Map: 100%|██████████| 900/900 [00:00<00:00, 5951.04 examples/s]


In [13]:
print(tokenized_train[0])

{'English': 'English: "And you did very wisely," said Holmes.\n', 'Filipino': 'Filipino: "At mo napaka wisely," sabi ni Holmes.\n', 'Split': 'train', 'input_ids': [1, 4223, 29901, 376, 2855, 366, 1258, 1407, 22573, 873, 1699, 1497, 4168, 4467, 29889, 13, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], 'attention_mask': [1, 1,

In [14]:
from transformers import Seq2SeqTrainingArguments

epochs = 4

learning_rate = 5e-5

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=epochs,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=4,
    fp16=True,
    fp16_full_eval=True,

    learning_rate=learning_rate,
    lr_scheduler_type='constant',  # "constant", "linear", "cosine"
    
    eval_strategy="steps",  # or "epoch"
    eval_steps=100,
    save_strategy="epoch",
    logging_steps=50,
    report_to='none',       
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
from transformers import Seq2SeqTrainer

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
trainer.train()



Step,Training Loss,Validation Loss
100,0.075,0.578445
200,0.067,0.543429
300,0.0572,0.544053
400,0.0502,0.577929
500,0.0428,0.571976




TrainOutput(global_step=504, training_loss=0.08939669757253593, metrics={'train_runtime': 1310.6949, 'train_samples_per_second': 24.72, 'train_steps_per_second': 0.385, 'total_flos': 6.324715952013312e+16, 'train_loss': 0.08939669757253593, 'epoch': 3.970414201183432})

In [17]:
!pip3 install sacrebleu sentencepiece -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
trainer.save_model('models/finetuned_eng_tgl_llama')