# $\text{Book Chat Bot}$

Fine Tuned Large Language Model like GPT-2 and Llama-3.1 on any book

Author: Ashish Kumar Uchadiya

Contact: akuresonite@gmail.com

In [1]:
%%capture
# !pip install transformers
# !pip install accelerate -U
# !pip install transformers[torch]
# !pip install torch
# !pip install -U PyPDF2
# !pip install python-docx

In [2]:
%load_ext watermark
# -----------------------------------------------------------------------------------------------------------
import os
import requests
import re
from PyPDF2 import PdfReader
import docx
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from colorama import Fore, Style
# -----------------------------------------------------------------------------------------------------------

def clr(text):
    return f"{Fore.YELLOW}{text}{Style.RESET_ALL}"
def get_cuda_cores():
    device = torch.cuda.current_device()
    compute_capability = torch.cuda.get_device_capability(device)
    cores_per_sm = {2: 32, 3: 192, 5: 128, 6: 64, 7: 64, 8: 64}  # cores per streaming multiprocessor
    sm_count = torch.cuda.get_device_properties(device).multi_processor_count
    cores = sm_count * cores_per_sm[compute_capability[0]]
    return cores
def set_seed(seed: int = 42) -> None:
    import random, numpy
    numpy.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as: {seed}")

device = "cuda:1" if torch.cuda.is_available() else 'cpu'

print("device:", clr(device))
print("Cpu cores found:", clr(os.cpu_count()))
try:
    print("CUDA device:", clr(torch.cuda.get_device_name(device=device)))
    print(f"CUDA cores found: {clr(get_cuda_cores())}")
except:
    pass

from watermark import watermark
print(watermark())
print(watermark(packages="torch,transformers"))
print(torch.__version__)
set_seed(42)

device: [33mcuda:1[0m
Cpu cores found: [33m64[0m
CUDA device: [33mNVIDIA RTX A5000[0m
CUDA cores found: [33m4096[0m
Last updated: 2024-08-16T22:20:57.456397+05:30

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.25.0

Compiler    : GCC 11.2.0
OS          : Linux
Release     : 6.1.0-12-amd64
Machine     : x86_64
Processor   : 
CPU cores   : 64
Architecture: 64bit

torch       : 2.3.1
transformers: 4.43.2

2.3.1
Random seed set as: 42


////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [3]:
def read_pdf(file_path):
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

def read_word(file_path):
    doc = docx.Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text

def read_documents_from_directory(directory):
    combined_text = ""
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            combined_text += read_pdf(file_path)
        elif filename.endswith(".docx"):
            combined_text += read_word(file_path)
        elif filename.endswith(".txt"):
            combined_text += read_txt(file_path)
    return combined_text

In [4]:
# %%capture
# GPT2Tokenizer.from_pretrained('openai-community/gpt2')
# GPT2LMHeadModel.from_pretrained('openai-community/gpt2')

# GPT2Tokenizer.from_pretrained('openai-community/gpt2-medium')
# GPT2LMHeadModel.from_pretrained('openai-community/gpt2-medium')

# GPT2Tokenizer.from_pretrained('openai-community/gpt2-large')
# GPT2LMHeadModel.from_pretrained('openai-community/gpt2-large')

# GPT2Tokenizer.from_pretrained('openai-community/gpt2-xl')
# GPT2LMHeadModel.from_pretrained('openai-community/gpt2-xl')

In [5]:
!du "/home/23m1521/.cache/huggingface/hub/" -sh $(ls -A) | sort -h

20K	logs
24K	_2_book_chat_bot.ipynb
28K	helper_functions.py
32K	_1_GPT2_FT.ipynb
1.5M	data
3.3G	finetuned
9.6G	/home/23m1521/.cache/huggingface/hub/


In [15]:
def train_chatbot(directory, model_name, train_fraction=0.8):
    
    model_output_path = os.path.join('finetuned', model_name)
    os.makedirs(model_output_path, exist_ok=True)
   
    # combined_text = read_documents_from_directory(directory)
    combined_text = read_txt(directory)
    combined_text = re.sub(r'\n+', '\n', combined_text).strip()  # Remove excess newline characters

 
    split_index = int(train_fraction * len(combined_text))
    train_text = combined_text[:split_index]
    val_text = combined_text[split_index:]
    
    print(f"combined_text:{len(combined_text)}, train:{len(train_text)}, val:{len(val_text)}")

    os.makedirs('data', exist_ok=True)
    with open("data/train.txt", "w") as f:
        f.write(train_text)
    with open("data/val.txt", "w") as f:
        f.write(val_text)


    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    
    # Load model directly
    # from transformers import AutoTokenizer, AutoModelForCausalLM

    # tokenizer = AutoTokenizer.from_pretrained(model_name)
    # model = AutoModelForCausalLM.from_pretrained(model_name)


    train_dataset = TextDataset(tokenizer=tokenizer, file_path="data/train.txt", block_size=128)
    val_dataset = TextDataset(tokenizer=tokenizer, file_path="data/val.txt", block_size=128)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

 
    training_args = TrainingArguments(
        output_dir=model_output_path,
        overwrite_output_dir=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    trainer.save_model(model_output_path)
    tokenizer.save_pretrained(model_output_path)

In [16]:
def generate_response(model, tokenizer, prompt, max_length=100):
    
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    attention_mask = torch.ones_like(input_ids) # Create the attention mask and pad token id
    pad_token_id = tokenizer.eos_token_id

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

def print_response(response):
    for i, word in enumerate(response.split(" ")):
        if (i > 0) and (i%10 == 0):
            print()
        print(word.strip(), end=' ')
    print()

## Get Book From https://www.gutenberg.org/

In [17]:
def download_txt_file(url, save_path):
    if os.path.exists(save_path):
        print(f"File '{save_path}' already exists. Skipping download.")
        return
    
    try:
        response = requests.get(url)
        response.raise_for_status()

        with open(save_path, 'w', encoding='utf-8') as file:
            file.write(response.text)
        
        print(f"File downloaded and saved as '{save_path}'")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the file: {e}")

In [18]:
dataset_dir = '/home/23m1521/datasets/text_data'

url = "https://www.gutenberg.org/ebooks/345.txt.utf-8"
save_path = os.path.join(dataset_dir, "dracula.txt")

download_txt_file(url, save_path)

File '/home/23m1521/datasets/text_data/dracula.txt' already exists. Skipping download.


## Training

models = ['openai-community/gpt2','openai-community/gpt2-medium', 'openai-community/gpt2-large','openai-community/gpt2-xl']

In [19]:
!rm data logs finetuned -rf

In [20]:
MODEL_NAME = 'openai-community/gpt2'
model_output_path = os.path.join('finetuned', MODEL_NAME)
# filepath = '/home/23m1521/datasets/text_data/Movies_plot_txt/The_Avengers.txt'
filepath = save_path

In [12]:
%%time

train_chatbot(filepath, MODEL_NAME)

!du -sh $(ls -A) | sort -h

combined_text:862701, train:690160, val:172541




Step,Training Loss
500,3.2587
1000,2.7721
1500,2.411
2000,2.064
2500,1.7359
3000,1.4601
3500,1.2216
4000,1.0179
4500,0.8522
5000,0.7188




20K	logs
24K	_2_book_chat_bot.ipynb
28K	helper_functions.py
32K	_1_GPT2_FT.ipynb
1.5M	data
3.3G	finetuned
CPU times: user 31min 24s, sys: 46.9 s, total: 32min 11s
Wall time: 27min 36s


## Loading the Fine-Tuned model and tokenizer

In [22]:
model = GPT2LMHeadModel.from_pretrained(model_output_path).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)

## Testing

In [31]:
prompts = [
    'Who was Dracula'
    ]

for prompt in prompts:
    response = generate_response(model, tokenizer, prompt)
    print_response(response)
    print("-"*37)

Who was Dracula? Count Dracula is an ancient vampire and Transylvanian nobleman who 
feeds on the blood of the living to maintain his 
immortality. He possesses supernatural abilities such as shapeshifting, mind control, 
and enhanced strength. Dracula resides in a decaying castle in 
the Carpathian Mountains and plans to expand his vampiric influence 
by relocating to England. The novel's protagonist, Jonathan Harker, first 
encounters Dracula when he travels to Transylvania to assist him 
with a real estate transaction. As Dracula begins to prey 
on victims in England, a group led by Professor Van 
Helsing bands together to hunt him down. Dracula represents the 
terror of the unknown and the clash between modern civilization 
and ancient evil. 
-------------------------------------


## Traning Llama 3.1

In [None]:
MODEL_NAME = 'meta-llama/Meta-Llama-3.1-8B-Instruct'
model_output_path = os.path.join('finetuned', MODEL_NAME)
filepath = save_path

In [None]:
%%time

train_chatbot(filepath, MODEL_NAME)

!du -sh $(ls -A) | sort -h

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")

In [None]:
prompts = [
    'Who was Dracula'
    ]

for prompt in prompts:
    response = generate_response(model, tokenizer, prompt)
    print_response(response)
    print("-"*37)