In [1]:
!pip install scikit-learn



In [2]:
import pandas as pd

# Store items in Filipino-English DF

# Filipino lines
with open('en-fil.txt/QED-fil-reduced.txt', 'r') as file:
    fil_lines = file.readlines()

# English lines
with open('en-fil.txt/QED-en-reduced.txt', 'r') as file:
    en_lines = file.readlines()

fil_lines = ['Filipino: ' + fil_line for fil_line in fil_lines]
en_lines = ['English: ' + en_line for en_line in en_lines]

combined_items = list(zip(en_lines, fil_lines))
df = pd.DataFrame(combined_items, columns=['English', 'Filipino'])
print(len(df))

10000


In [3]:
from sklearn.model_selection import train_test_split

# Retrieve some matches for fine tuning
df['Split'] = 'unset'
for_translate, for_shots = train_test_split(df, test_size=0.1, random_state=42)
train, test = train_test_split(for_translate, test_size=0.1, random_state=42)

df.loc[for_shots.index, 'Split'] = 'shots'
df.loc[train.index, 'Split'] = 'train'
df.loc[test.index, 'Split'] = 'test'

In [4]:
df.iloc[0]

English     English: For instance, suppose it were nine o'...
Filipino    Filipino: Halimbawa, ipagpalagay na ito ay 09:...
Split                                                   shots
Name: 0, dtype: object

In [16]:
df.to_csv('eng_to_tgl.csv')

: 

In [4]:
!pip install transformers



In [6]:
!pip install datasets safetensors accelerate



In [5]:
from datasets import Dataset, DatasetDict

# Convert your DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)
print(dataset)

dataset_dict = DatasetDict({
    "train": dataset.filter(lambda x: x["Split"] == "train"),
    "test": dataset.filter(lambda x: x["Split"] == "test")
})

# Access train and validation datasets
train_dataset = dataset_dict["train"]
val_dataset = dataset_dict["test"]


  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['English', 'Filipino', 'Split'],
    num_rows: 10000
})


Filter: 100%|██████████| 10000/10000 [00:00<00:00, 243567.42 examples/s]
Filter: 100%|██████████| 10000/10000 [00:00<00:00, 256110.99 examples/s]


In [6]:
train_dataset[0]

{'English': 'English: "And you did very wisely," said Holmes.\n',
 'Filipino': 'Filipino: "At mo napaka wisely," sabi ni Holmes.\n',
 'Split': 'train'}

In [8]:
!pip install sentencepiece safetensors
!pip install -U "huggingface_hub[cli]"



In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-1.3B")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-1.3B")

tokenizer.tgt_lang = 'Filipino' # The way this is set is questionable. Is this the proper lang code?

In [8]:
def tokenize_function(examples):
    return tokenizer(
        examples["English"],
        text_target=examples["Filipino"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 8100/8100 [00:02<00:00, 3659.60 examples/s]
Map: 100%|██████████| 900/900 [00:00<00:00, 5337.49 examples/s]


In [18]:
print(tokenized_train[0])

{'English': 'English: "And you did very wisely," said Holmes.\n', 'Filipino': 'Filipino: "At mo napaka wisely," sabi ni Holmes.\n', 'Split': 'train', 'input_ids': [256047, 30311, 248144, 69, 18569, 1259, 4077, 15880, 12605, 3350, 248079, 248108, 10833, 143372, 248075, 248059, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'a

In [10]:
from transformers import Seq2SeqTrainingArguments

epochs = 4

learning_rate = 5e-5

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=epochs,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=4,
    fp16=True,
    fp16_full_eval=True,

    learning_rate=learning_rate,
    lr_scheduler_type='constant',  # "constant", "linear", "cosine"
    
    eval_strategy="steps",  # or "epoch"
    eval_steps=100,
    save_strategy="epoch",
    logging_steps=50,
    report_to='none',       
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
from transformers import Seq2SeqTrainer

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [12]:
trainer.train(resume_from_checkpoint=True)

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss
400,0.2879,0.110041
500,0.2972,0.10748




TrainOutput(global_step=504, training_loss=0.07174211669535864, metrics={'train_runtime': 510.8521, 'train_samples_per_second': 63.423, 'train_steps_per_second': 0.987, 'total_flos': 5.476714488948326e+16, 'train_loss': 0.07174211669535864, 'epoch': 3.994082840236686})

In [14]:
!pip3 install sacrebleu sentencepiece -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
trainer.save_model('models/finetuned_eng_tgl_nllb')

: 