In [1]:
!pip install scikit-learn



In [2]:
import pandas as pd

# Store items in Filipino-English DF
#
# The two corpus files are read as line-aligned: line i of the .en file is
# paired with line i of the .fil file.

def read_corpus_lines(path):
    """Return the lines of the text file at `path`, without trailing newlines.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as file:
        # Iterate the file (not str.splitlines) so only real line endings split
        # records; stripping '\n' keeps the newline out of every sentence.
        return [line.rstrip('\n') for line in file]


# Filipino lines
fil_lines = read_corpus_lines('en-fil.txt/QED.en-fil.fil')

# English lines
en_lines = read_corpus_lines('en-fil.txt/QED.en-fil.en')

# zip() would silently drop the surplus lines of the longer file, which hides
# a misaligned corpus; fail loudly instead.
if len(en_lines) != len(fil_lines):
    raise ValueError(
        f"Parallel files differ in length: {len(en_lines)} English lines "
        f"vs {len(fil_lines)} Filipino lines"
    )

combined_items = list(zip(en_lines, fil_lines))
df = pd.DataFrame(combined_items, columns=['English', 'Filipino'])
print(len(df))

42864


In [3]:
from sklearn.model_selection import train_test_split

# Partition the corpus and record each row's partition in df['Split'].
# First 10% of the rows are held out as 'shots'; the remaining rows are then
# divided again, 10% going to 'test' and the rest to 'train'.
# Both splits use random_state=42, so the assignment is repeatable.
df['Split'] = 'unset'
for_translate, for_shots = train_test_split(df, test_size=0.1, random_state=42)
train, test = train_test_split(for_translate, test_size=0.1, random_state=42)

# Write the labels back onto the full frame through each partition's index.
for label, part in (('shots', for_shots), ('train', train), ('test', test)):
    df.loc[part.index, 'Split'] = label

In [4]:
!pip install transformers



In [5]:
from transformers import LlamaConfig, LlamaForCausalLM


# Read the architecture description from the local JSON file and build a
# LlamaForCausalLM from that description alone. from_pretrained is not called
# in this cell, so no checkpoint weights are read here.
config_path = "config.json"
config = LlamaConfig.from_json_file(json_file=config_path)
model = LlamaForCausalLM(config=config)

# Show the module tree (layer count, hidden sizes, projections).
print(model)

  from .autonotebook import tqdm as notebook_tqdm


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm)

In [6]:
!pip install datasets safetensors accelerate



In [7]:
from datasets import Dataset, DatasetDict

# Wrap the DataFrame as a Hugging Face Dataset and show its schema.
dataset = Dataset.from_pandas(df)
print(dataset)


def _rows_labelled(split_name):
    """Return the rows of `dataset` whose "Split" column equals `split_name`."""
    def _is_member(row):
        return row["Split"] == split_name
    return dataset.filter(_is_member)


# Only the "train" and "test" partitions are carried over; rows with any
# other "Split" value are not part of dataset_dict.
dataset_dict = DatasetDict({
    "train": _rows_labelled("train"),
    "test": _rows_labelled("test"),
})

# The "test" partition is the one referred to as validation data from here on.
train_dataset, val_dataset = dataset_dict["train"], dataset_dict["test"]


Dataset({
    features: ['English', 'Filipino', 'Split'],
    num_rows: 42864
})


Filter: 100%|██████████| 42864/42864 [00:00<00:00, 255489.51 examples/s]
Filter: 100%|██████████| 42864/42864 [00:00<00:00, 264180.56 examples/s]


In [None]:
!pip install sentencepiece safetensors
!pip install -U "huggingface_hub[cli]"


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/data/students/juan/anaconda3/envs/ai351/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/data/students/juan/anaconda3/envs/ai351/lib/python3.10/site-packages/huggingface_hub/commands/huggingface_cli.py", line

In [12]:
import torch
from safetensors import safe_open
from safetensors.torch import save_file

# Load every tensor stored in model.safetensors into a name -> tensor dict.
# device=0 is passed through to safetensors unchanged (an integer selects a
# CUDA device index there), so this cell expects a GPU to be present.
tensors = {}
with safe_open("model.safetensors", framework="pt", device=0) as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)

# Small save_file example with placeholder tensors. It has its own name so it
# no longer overwrites the weights just loaded into `tensors`.
demo_tensors = {
    "embedding": torch.zeros((2, 2)),
    "attention": torch.zeros((2, 3)),
}
# save_file writes the binary safetensors format, so the target must not be a
# ".json" path. The previous target, "model.safetensors.index.json", is the
# file name transformers uses for the JSON index of a sharded checkpoint;
# filling it with binary tensor data leaves an unreadable index next to the
# real weights.
save_file(demo_tensors, "demo.safetensors")



In [None]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub repository to copy, and the local directory that receives the copy.
repo_id = "britllm/CuatroLLM"
save_dir = "./models/CuatroLLM"
os.makedirs(save_dir, exist_ok=True)

# Load the tokenizer and the model from the repository and write each one
# into save_dir right after it is loaded.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
tokenizer.save_pretrained(save_dir)

model = AutoModelForCausalLM.from_pretrained(repo_id)
model.save_pretrained(save_dir)

# Assigned after save_pretrained, so this changes the in-memory tokenizer only.
tokenizer.pad_token = tokenizer.eos_token

In [23]:
# Use the end-of-sequence token as the padding token. The same assignment is
# made at the end of the cell that loads the tokenizer; it is repeated here so
# this cell can be re-run on its own.
# NOTE(review): presumably the checkpoint defines no pad token of its own,
# which padding="max_length" below would need — confirm.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples, max_length=2048, label_pad_id=-100):
    """Tokenize a batch of English/Filipino pairs into fixed-length features.

    Args:
        examples: Batch mapping with an "English" and a "Filipino" entry, each
            a list of strings of equal length (the shape `Dataset.map` passes
            when called with `batched=True`).
        max_length: Length every sequence is padded or truncated to.
        label_pad_id: Value written into `labels` at padded positions. -100 is
            the index the Hugging Face causal-LM loss ignores.

    Returns:
        The tokenizer output for the English text (`input_ids`,
        `attention_mask`) with an added `labels` entry holding the Filipino
        token ids, padded positions replaced by `label_pad_id`.
    """
    model_inputs = tokenizer(
        examples["English"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
    )
    # Tokenize the targets separately so their attention mask is available;
    # passing text and text_target together only returns the target ids.
    targets = tokenizer(
        text_target=examples["Filipino"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
    )
    # The pad token is the EOS token, so padding is identified through the
    # attention mask rather than by token id. Without this masking the loss
    # would be computed on every padded position up to max_length.
    model_inputs["labels"] = [
        [token if keep else label_pad_id for token, keep in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]
    # NOTE(review): for a decoder-only model, `labels` is normally the same
    # sequence as `input_ids` (prompt + answer, prompt masked out). Here the
    # labels are a separate Filipino sequence that is not position-aligned
    # with the English input_ids — confirm this is what the training loop expects.
    return model_inputs

# Run tokenize_function over both partitions in batches: train first, then
# validation.
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map: 100%|██████████| 34719/34719 [00:33<00:00, 1035.18 examples/s]
Map: 100%|██████████| 3858/3858 [00:03<00:00, 1094.51 examples/s]
