In [1]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install sentencepiece
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m56.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
!mkdir data

In [3]:
import torch
from transformers import (
    LlamaForCausalLM, LlamaConfig, LlamaTokenizer,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from datasets import load_dataset
import sentencepiece as spm
import os
import logging

logging.basicConfig(level=logging.INFO)

In [29]:
config = {
    "_name_or_path": "./names_1m",
    "architectures": [
        "LlamaForCausalLM"
    ],
    "bos_token_id": 2,
    "eos_token_id": 3,
    "hidden_act": "silu",
    "hidden_size": 128,
    "initializer_range": 0.02,
    "intermediate_size": 360,
    "max_position_embeddings": 32,
    "model_type": "llama",
    "num_attention_heads": 16,
    "num_hidden_layers": 8,
    "num_key_value_heads": 16,
    "pad_token_id": 1,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-06,
    "rope_scaling": None,
    "tie_word_embeddings": False,
    "torch_dtype": "float32",
    "use_cache": False,
    "vocab_size": 92,
    "input_path": "data/baby_names.txt",
    "output_path": "names_model/",
    "tokenizer_prefix": "tokenizer",
    "block_size": 32
}

## Creating the tokenizer

Starting with describing all the symbols present in Kannada language along with some compulsory tokens.

In [5]:
sentance_punctuators = ["<pad>", "<s>", "</s>", "<mask>", "."]
symbols = sentance_punctuators

letters_and_numbers = [
     "ಂ", "ಃ", "ಅ", "ಆ", "ಇ", "ಈ", "ಉ", "ಊ", "ಋ", "ಌ", "ಎ", "ಏ", "ಐ", "ಒ", "ಓ", "ಔ",
    "ಕ", "ಖ", "ಗ", "ಘ", "ಙ", "ಚ", "ಛ", "ಜ", "ಝ", "ಞ", "ಟ", "ಠ", "ಡ", "ಢ", "ಣ", "ತ",
    "ಥ", "ದ", "ಧ", "ನ", "ಪ", "ಫ", "ಬ", "ಭ", "ಮ", "ಯ", "ರ", "ಱ", "ಲ", "ಳ", "ವ", "ಶ",
    "ಷ", "ಸ", "ಹ", "಼", "ಽ", "ಾ", "ಿ", "ೀ", "ು", "ೂ", "ೃ", "ೄ", "ೆ", "ೇ", "ೈ", "ೊ",
    "ೋ", "ೌ", "್", "ೕ", "ೖ", "ೞ", "ೠ", "ೡ", "ೢ", "ೣ", "೦", "೧", "೨", "೩", "೪", "೫",
    "೬", "೭", "೮", "೯"
]
symbols.extend(letters_and_numbers)
print(len(symbols))
print(symbols)

89
['<pad>', '<s>', '</s>', '<mask>', '.', 'ಂ', 'ಃ', 'ಅ', 'ಆ', 'ಇ', 'ಈ', 'ಉ', 'ಊ', 'ಋ', 'ಌ', 'ಎ', 'ಏ', 'ಐ', 'ಒ', 'ಓ', 'ಔ', 'ಕ', 'ಖ', 'ಗ', 'ಘ', 'ಙ', 'ಚ', 'ಛ', 'ಜ', 'ಝ', 'ಞ', 'ಟ', 'ಠ', 'ಡ', 'ಢ', 'ಣ', 'ತ', 'ಥ', 'ದ', 'ಧ', 'ನ', 'ಪ', 'ಫ', 'ಬ', 'ಭ', 'ಮ', 'ಯ', 'ರ', 'ಱ', 'ಲ', 'ಳ', 'ವ', 'ಶ', 'ಷ', 'ಸ', 'ಹ', '಼', 'ಽ', 'ಾ', 'ಿ', 'ೀ', 'ು', 'ೂ', 'ೃ', 'ೄ', 'ೆ', 'ೇ', 'ೈ', 'ೊ', 'ೋ', 'ೌ', '್', 'ೕ', 'ೖ', 'ೞ', 'ೠ', 'ೡ', 'ೢ', 'ೣ', '೦', '೧', '೨', '೩', '೪', '೫', '೬', '೭', '೮', '೯']


### Now Training the SPM tokenizer to accept these tokens

In [6]:
spm.SentencePieceTrainer.train(
        input=config["input_path"],
        model_prefix=config["tokenizer_prefix"],
        vocab_size=config["vocab_size"],
        user_defined_symbols=symbols,
        model_type="BPE"
    )

In [7]:
os.mkdir("tokenizers")
os.rename("tokenizer.model", os.path.join("tokenizers", "tokenizer.model"))

In [9]:
tokenizer = LlamaTokenizer.from_pretrained("tokenizers")
tokenizer.pad_token = tokenizer.eos_token

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
sample_sentence = "ಹುಡುಗ, ಆದಿತಿ"
tokens = tokenizer(
                sample_sentence, truncation=True,
                padding='max_length', max_length=16)
print(f"Original Sentence: {sample_sentence}\nTokenized Sentence: {tokens}")

Original Sentence: ಹುಡುಗ, ಆದಿತಿ
Tokenized Sentence: {'input_ids': [1, 90, 56, 62, 34, 62, 24, 91, 90, 9, 39, 60, 37, 60, 2, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]}


### Setting the architecture for the model

In [20]:
lama_config = LlamaConfig.from_dict(config)
model = LlamaForCausalLM(lama_config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(92, 128, padding_idx=1)
    (layers): ModuleList(
      (0-7): 8 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=128, out_features=128, bias=False)
          (k_proj): Linear(in_features=128, out_features=128, bias=False)
          (v_proj): Linear(in_features=128, out_features=128, bias=False)
          (o_proj): Linear(in_features=128, out_features=128, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=128, out_features=360, bias=False)
          (up_proj): Linear(in_features=128, out_features=360, bias=False)
          (down_proj): Linear(in_features=360, out_features=128, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): L

### Extracting dataset from the text file and tokenizing

In [25]:
def tokenize_dataset(dataset):
        return dataset.map(
            lambda examples: tokenizer(
                examples['text'], truncation=True,
                padding='max_length', max_length=config["block_size"]
            ),
            batched=True
        )

In [26]:
dataset = load_dataset('text', data_files=config["input_path"])
shuffled_dataset = dataset['train'].shuffle(seed=42)
split_datasets = shuffled_dataset.train_test_split(test_size=0.2)
train = tokenize_dataset(split_datasets['train'])
test = tokenize_dataset(split_datasets['test'])

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

In [32]:
train

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 320
})

## Training process

In [31]:
!mkdir names_model

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=config["output_path"],
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=8,
    save_steps=10000,
    logging_steps=10,
    eval_steps=1000,
    logging_dir=f'{config["output_path"]}/logs',
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=test,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.001)]
)

trainer.train()
model.save_pretrained(config["output_path"])

## Evaluation

Trying to generate some text given a prompt

In [35]:
male_names_prompt = "ಪುರುಷ,"
female_names_prompt = "ಸ್ತ್ರೀ,"

#Change the below depending on requirement
prompt = male_names_prompt

model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(92, 128, padding_idx=1)
    (layers): ModuleList(
      (0-7): 8 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=128, out_features=128, bias=False)
          (k_proj): Linear(in_features=128, out_features=128, bias=False)
          (v_proj): Linear(in_features=128, out_features=128, bias=False)
          (o_proj): Linear(in_features=128, out_features=128, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=128, out_features=360, bias=False)
          (up_proj): Linear(in_features=128, out_features=360, bias=False)
          (down_proj): Linear(in_features=360, out_features=128, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): L

In [36]:
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
attention_mask = torch.ones_like(input_ids).to(model.device)

with torch.no_grad():
  output = model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_length=32,
      early_stopping=True,
      temperature=0.6,
      top_p=0.8,
      top_k=50,
      do_sample=True,
      output_scores=True,
      pad_token_id=tokenizer.eos_token_id,
      repetition_penalty=1.4,
      eos_token_id=tokenizer.eos_token_id
  )
  output_str = tokenizer.decode(output[0], skip_special_tokens=True).split(".")[0]
  print(output_str)



ಪುರುಷ, ರಾಮೇಶ್ವರ
