API KEY: d13614f1401a09177f6cf3c0e8dc73861f45503d

# ARUSIAN

In [None]:
import os
import torch
from transformers import logging, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, PreTrainedTokenizerFast
from tokenizers import ByteLevelBPETokenizer
import requests

MODEL_NAME = "distilgpt2"
CORPUS_PATH = "arusian_corpus.txt"
OUTPUT_DIR = "./gpt2-finetuned-kayei-ket"
TOKENIZER_DIR = "./kayei-ket-tokenizer"
# TOKENIZER_FILE is no longer needed with this approach
# TOKENIZER_FILE = os.path.join(TOKENIZER_DIR, "ieperisch_tokenizer.json")


logging.set_verbosity_error()

def fine_tune():
    # Carrega o tokenizador personalizado que já foi salvo correctamente
    tokenizer_vocab_file = os.path.join(TOKENIZER_DIR, "vocab.json")
    tokenizer_merges_file = os.path.join(TOKENIZER_DIR, "merges.txt")
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=os.path.join(TOKENIZER_DIR, "tokenizer.json"), # Load from the tokenizer.json saved by save_pretrained
        unk_token="<unk>",
        pad_token="<pad>",
        cls_token="<s>",
        sep_token="</s>",
        mask_token="<mask>"
    )
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    # Redimensiona os embeddings do modelo para o novo tamanho do vocabulário do tokenizador
    model.resize_token_embeddings(len(tokenizer))

    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=CORPUS_PATH,
        block_size=64
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=75,
        per_device_train_batch_size=2,
        save_steps=100,
        save_total_limit=2,
        prediction_loss_only=True,
        logging_steps=50,
        learning_rate=5e-5,
        warmup_steps=100,
        report_to="none",  # Add this line to disable reporting to services like wandb
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )
    trainer.train()
    # Save model and tokenizer together
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR) # Ensure the tokenizer is saved to the output directory
    print("Fine tuning finished and saved in ", OUTPUT_DIR)

def chat():
    # Load tokenizer and model from the fine-tuned output directory
    tokenizer = PreTrainedTokenizerFast.from_pretrained(OUTPUT_DIR)
    model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)
    print("\n--- Kayei-Ket (GPT-2) ---")
    while True:
        user_input = input("Tei: ")
        if user_input.lower() == 'exit':
            print("Kayei-Ket: Soso!")
            break

        prompt = f"Tei: {user_input}\Kayei-Ket:"
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]

        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=30,
            do_sample=True,
            temperature=0.2,     # Ajustado para menor aleatoriedade
            top_k=20,            # Ajustado para menor aleatoriedade
            top_p=0.8,           # Ajustado para menor aleatoriedade
            no_repeat_ngram_size=4, # Ajustado para evitar mais repetições
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
            num_beams=1,
        )
        response = tokenizer.decode(output[0], skip_special_tokens=True)
        response = response[len(prompt):].strip()

        print(f"Kayei-Ket: {response}\n")

def download_data():
  url = "https://raw.githubusercontent.com/andrewnationdev/arusian-database/refs/heads/main/arusian_db_one.data"
  response = requests.get(url)

  if response.status_code == 200:
      with open("arusian.txt", "wb") as f:
          f.write(response.content)
      print("Corpus downloaded and saved as arusian.txt")
  else:
      print(f"Failed to download the corpus. Status code: {response.status_code}")

  input_file = "arusian.txt"
  output_file = "arusian_corpus.txt"

  with open(input_file, "r", encoding="utf-8") as f:
      text = f.read()

  frases = [frase.strip() for frase in text.split('.') if frase.strip()]

  print(f"Number of sentences in corpus: {len(frases)}")

  with open(output_file, "w", encoding="utf-8") as f:
      for frase in frases:
          f.write(frase + ".\n")

if __name__ == "__main__":
    download_data()
    if not os.path.exists(TOKENIZER_DIR) or not any(fname.endswith(".json") for fname in os.listdir(TOKENIZER_DIR)):
        print("Training custom tokenizer...")

        tokenizer_bpe = ByteLevelBPETokenizer()
        tokenizer_bpe.train(
            files=[CORPUS_PATH],
            vocab_size=5000,
            min_frequency=2,
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",
                "<mask>",
            ]
        )

        if not os.path.exists(TOKENIZER_DIR):
            os.makedirs(TOKENIZER_DIR)

        # Create a PreTrainedTokenizerFast from the trained ByteLevelBPETokenizer
        # This is the bridge between the tokenizers library and transformers library
        tokenizer_hf = PreTrainedTokenizerFast(
            tokenizer_object=tokenizer_bpe,
            unk_token="<unk>",
            pad_token="<pad>",
            cls_token="<s>",
            sep_token="</s>",
            mask_token="<mask>",
            model_max_length=512 # Or an appropriate value
        )

        # Save the tokenizer using save_pretrained, which creates the necessary files for from_pretrained
        tokenizer_hf.save_pretrained(TOKENIZER_DIR)

        print(f"Custom tokenizer trained and saved to {TOKENIZER_DIR}.")

    # Load the tokenizer using PreTrainedTokenizerFast from the saved directory
    # Specify the tokenizer file explicitly as from_pretrained might have issues
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=os.path.join(TOKENIZER_DIR, "tokenizer.json"))


    if not os.path.exists(OUTPUT_DIR):
        print("Training model with GPT...")
        fine_tune()
    chat()

  prompt = f"Tei: {user_input}\Kayei-Ket:"


KeyboardInterrupt: 

In [21]:
import requests
from collections import defaultdict

def read(filename):
  file = open(filename, 'r', encoding='utf-8')
  content = file.read()
  file.close()
  return content

def concordanciador(alvo, texto):
  texto = texto.replace("\n", ' ')
  texto = texto.replace('\t', ' ')

  ocorrencias = list()
  encontrado = texto.find(alvo, 0)

  while encontrado > 0:
    pos_inicial = encontrado - (40 - len(alvo) // 2)
    ocorrencias.append(texto[pos_inicial : pos_inicial + 80])

    encontrado = texto.find(alvo, encontrado + 1)

  return ocorrencias

def download_data():
  url = "https://raw.githubusercontent.com/andrewnationdev/arusian-database/refs/heads/main/arusian_db_one.data"
  response = requests.get(url)

  if response.status_code == 200:
      with open("arusian.txt", "wb") as f:
          f.write(response.content)
      print("Corpus downloaded and saved as arusian.txt")
  else:
      print(f"Failed to download the corpus. Status code: {response.status_code}")

  input_file = "arusian.txt"
  output_file = "arusian_corpus.txt"

  with open(input_file, "r", encoding="utf-8") as f:
      text = f.read()

  frases = [frase.strip() for frase in text.split('.') if frase.strip()]

  print(f"Number of sentences in corpus: {len(frases)}")

  with open(output_file, "w", encoding="utf-8") as f:
      for frase in frases:
          f.write(frase + ".\n")

download_data()
text = read('arusian.txt')
#Ocorrencias de uma determinada palavra no corpus

#print(concordanciador('lu', text))

def limpar(lista):
  lixo = '.,:;?!"`()[]/|#$%^&*'
  quase_limpo = [x.strip(lixo).lower() for x in lista]
  return [x for x in quase_limpo if x.isalpha() or '-' in x]

palavras = text.split()

corpus_sujo = palavras
#Corpus limpo
#print(limpar(corpus_sujo))
corpus_preparado = limpar(corpus_sujo)
palavras = corpus_preparado

#número de palavras
print("NÚMERO DE PALAVRAS: " + str(len(corpus_preparado)))

vocabulario = set(palavras)
print("TAMANHO VOCABULÁRIO: " + str(len(vocabulario)))

#Cálculo da riqueza lexical
riqueza = len(vocabulario) / len(palavras)
print("RIQUEZA LEXICAL: " + str(riqueza))

#Conhecendo o número de ocorrências de cada palavra

def ocorrencias(lista_palavras):
  dicionario = defaultdict(int)
  for p in lista_palavras:
    dicionario[p] += 1

  return dicionario

dic = ocorrencias(palavras)
mf = sorted(dic.items(), key=lambda tupla:tupla[1], reverse=True)[:50]

for palavra, n in mf:
  print(palavra, '\t', n)

  #Hapax Legomena
  # Palavras que ocorrem uma única vez

  #hapax = [x for x in palavras if palavras.count(x) == 1]

  #print("HAPAX: " + str(len(hapax)))

  #print(hapax)

Corpus downloaded and saved as arusian.txt
Number of sentences in corpus: 3360
NÚMERO DE PALAVRAS: 24474
TAMANHO VOCABULÁRIO: 1432
RIQUEZA LEXICAL: 0.05851107297540247
sa 	 3279
lu 	 2428
u 	 1885
tei 	 774
ouwr 	 705
vej 	 610
vom 	 473
otras 	 376
pus 	 375
wei 	 357
lus 	 341
agr 	 319
so 	 308
su 	 306
sasa 	 302
puwei 	 234
kamr 	 203
beis 	 190
nepa 	 188
auwr 	 173
teis 	 161
payr 	 160
ket 	 152
dauwr 	 140
vel 	 138
mez 	 127
nut 	 126
segr 	 120
otra 	 114
pei 	 114
trmr 	 109
osa 	 104
kor 	 100
luwa 	 100
auwayeis 	 98
edr 	 97
kei 	 95
dauwas 	 93
dem 	 91
vreyek 	 87
delr 	 81
demr 	 81
tra 	 80
wel 	 77
pai 	 76
agreis 	 75
fai 	 74
vot 	 72
tr 	 71
rmr 	 71


# RESET MODELS
Run this script to reset all models.

In [None]:
import os
import shutil

def remove_model_data():
    """Removes all generated data, tokenizer, and model directories."""
    models_data = {
        "ieperisch": {
            "corpus_file": "ieperisch_corpus.txt",
            "raw_file": "ieperisch.txt",
            "tokenizer_dir": "./aimala-tokenizer",
            "output_dir": "./gpt2-finetuned-aimala"
        },
        "arusian": {
            "corpus_file": "arusian_corpus.txt",
            "raw_file": "arusian.txt",
            "tokenizer_dir": "./kayei-ket-tokenizer",
            "output_dir": "./gpt2-finetuned-kayei-ket"
      },
      "mred": {
            "corpus_file": "big_corpus.txt",
            "raw_file": "mred_corpus.txt",
            "tokenizer_dir": "./mred-tokenizer",
            "output_dir": "./gpt2-finetuned-mred"
      }
    }

    print("Removing generated model data...")

    for model_name, paths in models_data.items():
        print(f"\nRemoving data for {model_name}...")
        # Remove corpus files
        for file_key in ["corpus_file", "raw_file"]:
            if os.path.exists(paths[file_key]):
                os.remove(paths[file_key])
                print(f"Removed {paths[file_key]}")

        # Remove directories
        for dir_key in ["tokenizer_dir", "output_dir"]:
            if os.path.exists(paths[dir_key]):
                shutil.rmtree(paths[dir_key])
                print(f"Removed directory {paths[dir_key]}")

    print("\nAll specified model data removed.")

# To use the function, call it:
remove_model_data()

Removing generated model data...

Removing data for ieperisch...

Removing data for arusian...

Removing data for mred...

All specified model data removed.
