# Installations and imports

In [1]:
# !pip3 install transformers
# !pip3 install torch
# !pip3 install datasets
# !pip3 install sentencepiece
# !pip3 install gdown
# !pip3 install accelerate -U

In [2]:
import torch
from transformers import (
    LlamaForCausalLM, LlamaConfig, LlamaTokenizer,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from datasets import load_dataset
import sentencepiece as spm
import os
import logging
import json
import sys
import argparse

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def train_tokenizer(input_path, model_prefix):
    spm.SentencePieceTrainer.train(
        input=input_path,
        model_prefix=model_prefix,
        model_type="BPE"
    )

In [4]:
def move_tokenizer_to_folder(source, destination_folder):
    os.rename(source, os.path.join(destination_folder, "tokenizer.model"))

def create_config_file(folder_path, content):
    with open(os.path.join(folder_path, "config.json"), "w") as config_file:
        json.dump(content, config_file, indent=4)

In [6]:
config_content = {
    "_name_or_path": "./names_1m",
    "architectures": [
        "LlamaForCausalLM"
    ],
    "bos_token_id": 2,
    "eos_token_id": 3,
    "hidden_act": "silu",
    "hidden_size": 64,
    "initializer_range": 0.02,
    "intermediate_size": 180,
    "max_position_embeddings": 32,
    "model_type": "llama",
    "num_attention_heads": 16,
    "num_hidden_layers": 8,
    "num_key_value_heads": 16,
    "pad_token_id": 1,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-06,
    "rope_scaling": None,
    "tie_word_embeddings": False,
    "torch_dtype": "float32",
    "transformers_version": "4.28.1",
    "use_cache": False,
    "vocab_size": 97
}


out_folder_path = "bookAndGenre"
os.makedirs(out_folder_path, exist_ok=True)
create_config_file(out_folder_path, config_content)
train_tokenizer('bookData.csv', 'tokenizer')
move_tokenizer_to_folder("tokenizer.model", out_folder_path)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: bookData.csv
  input_format: 
  model_prefix: tokenizer
  model_type: BPE
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differen

In [7]:
tokenizer = LlamaTokenizer.from_pretrained(out_folder_path)
tokenizer.pad_token = tokenizer.eos_token

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


# Setting up the model with config
* Function created to return the model with required config

In [100]:
def create_config_model(path):
    config = LlamaConfig.from_pretrained(path)

    model = LlamaForCausalLM(config)

    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    model_size = sum(t.numel() for t in model.parameters())

    print(f"GPT Model size: {model_size/1000**2:.1f}M parameters")
    
    return model



# Setting up training the model
* Function created to setup training for the model

In [104]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

def train_model(model, tokenizer, train_dataset, test_dataset, out_folder_path):
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=out_folder_path,
        overwrite_output_dir=True,
        num_train_epochs=10,  # Reduced number of epochs
        per_device_train_batch_size=32,  # Increased batch size
        save_steps=1000,
        logging_steps=100,  # Reduced logging frequency
        eval_steps=500,
        logging_dir=f'{out_folder_path}/logs',
        evaluation_strategy="steps",
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.001)]
    )

    trainer.train()
    model.save_pretrained(out_folder_path)


In [105]:
def create_tokenized_dataset_splits(path, tokenizer, block_size):
    dataset = load_dataset('text', data_files=path)
    shuffled_dataset = dataset['train'].shuffle(seed=5).select(range(1000))
    split_datasets = shuffled_dataset.train_test_split(test_size=0.05)

    def tokenize_dataset(dataset):
        return dataset.map(
            lambda examples: tokenizer(
                examples['text'], truncation=True,
                padding='max_length', max_length=block_size
            ),
            batched=True
        )

    return tokenize_dataset(split_datasets['train']), tokenize_dataset(split_datasets['test'])

In [106]:
model = create_config_model(out_folder_path)
train_dataset, test_dataset = create_tokenized_dataset_splits('bookData.csv', tokenizer, block_size=64)
train_model(model, tokenizer, train_dataset, test_dataset, out_folder_path)

GPT Model size: 0.4M parameters


  0%|          | 0/11900 [08:23<?, ?it/s]
 33%|███▎      | 99/300 [00:09<00:18, 11.15it/s]
 34%|███▎      | 101/300 [00:09<00:17, 11.06it/s]   

{'loss': 1.0559, 'grad_norm': 1.0988425016403198, 'learning_rate': 3.3333333333333335e-05, 'epoch': 3.33}


 66%|██████▋   | 199/300 [00:18<00:09, 11.14it/s]
 67%|██████▋   | 201/300 [00:18<00:08, 11.07it/s]   

{'loss': 0.9983, 'grad_norm': 1.2198072671890259, 'learning_rate': 1.6666666666666667e-05, 'epoch': 6.67}


100%|█████████▉| 299/300 [00:27<00:00, 11.20it/s]
100%|██████████| 300/300 [00:27<00:00, 11.20it/s]   
100%|██████████| 300/300 [00:27<00:00, 10.89it/s]   

{'loss': 0.9705, 'grad_norm': 1.6474456787109375, 'learning_rate': 0.0, 'epoch': 10.0}
{'train_runtime': 27.5542, 'train_samples_per_second': 344.775, 'train_steps_per_second': 10.888, 'train_loss': 1.0082311503092447, 'epoch': 10.0}





In [110]:
def generateStory(model, tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    attention_mask = torch.ones_like(input_ids).to(model.device)

    with torch.no_grad():
        output = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                early_stopping=True,
                temperature=0.6,
                top_p=0.8,
                top_k=50,
                do_sample=True,
                output_scores=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.4,
                eos_token_id=tokenizer.eos_token_id
            )
        output_str = tokenizer.decode(output[0], skip_special_tokens=True).split(".")[0]
        print(output_str)

# Output 

In [112]:
model.eval()
generateStory(model, tokenizer, "fantasy")

fantasy to he you you the T a to and as that on the of you it of the the the for of to the to the was T in you was on be a in and as he that was ofen and he he in asur the to it the and on that he in of that ofing and the of he you "" the the the T and he be to with "" to the with youhely you and as the the heit to for of the the for it and of a ofis I and ofhe in I his the "" his you the and he that the his he and and ofhe heing re the he to theot of of it toot in forotly and the was he
