##### Installing dependencies

In [None]:
%%capture
# %%capture keeps the pip output out of the notebook.
# Installs Unsloth, Xformers (Flash Attention) and all other packages, good for efficient and fast finetuning of large models (optional)
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install exactly these packages without pulling in their own dependency trees.
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
!pip install langchain
!pip install langchain-community
!pip install pypdf
!pip install fitz
!pip install pymupdf
!pip install unstructured python-magic
!pip install faiss-gpu
!pip install transformers torch huggingface_hub
!pip install python-dotenv==1.0.0 streamlit==1.22.0 tiktoken==0.4.0
!pip install protobuf~=3.20
!pip install sentence-transformers
!pip install rich

##### Necessary imports

In [None]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from datasets import load_dataset

In [None]:
import logging
import numpy as np
import faiss
import os
import pandas as pd
import magic
import os
import nltk
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import DirectoryLoader
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain import PromptTemplate
from google.colab import userdata
from transformers import TextStreamer
from rich.console import Console
from rich.markdown import Markdown
from rich.table import Table
from rich.panel import Panel
from rich.text import Text


##### Mount Drive

In [None]:
# Mount Google Drive at /content/drive; later cells read the PDF from, and write the
# LoRA adapters and FAISS index to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

##### Loading the base model, quantized to 4-bit

In [None]:
# Loader settings. `max_seq_length` is reused further down when the SFTTrainer is built.
max_seq_length =  4096 # You can set this up to {8192 - (max number of output tokens you want)}
dtype = None  # passed through unchanged; presumably lets Unsloth choose the dtype - TODO confirm
load_in_4bit = True  # load the 4-bit quantized weights

# Load the pre-quantized Llama-3 8B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

##### Initializing Model for Low Rank Adaptation

In [None]:
# Wrap the base model with LoRA adapters. This rebinds `model`, so the cell is not
# idempotent: re-running it passes the already-wrapped model back in.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    # Attention projections plus the MLP projections receive adapters.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = True,
    random_state = 3407,  # same seed value as the TrainingArguments later in the notebook
    use_rslora = True,
    loftq_config = None,
)

In [None]:
#@title #####System Prompt - To use later for RAG with the finetuned LLM
# Prompt template with three positional `{}` slots, filled in this order:
#   1. the user query, 2. the supporting data, 3. the response
# (the response slot is left empty at inference time so the model completes it).
alpaca_prompt = """Below is a query asked by a user , paired with additonal data that provides further context. Write a response that appropriately answers the query.
You must answer only from the given data and not make up anything. Be as detailed as possible. If the answer is not present in the data, just say "I don't know".

### Query:
{}

### Data:
{}

### Response:
{}"""

# End-of-sequence token string; formatting_prompts_func appends it to every training text.
EOS_TOKEN = tokenizer.eos_token

In [None]:
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into full training prompts.

    Args:
        examples: batched mapping with parallel sequences under the keys
            "instruction", "input" and "output".

    Returns:
        A dict with a single key "text" holding one string per row: the
        `alpaca_prompt` template filled with (instruction, input, output)
        and terminated with `EOS_TOKEN`.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            # EOS_TOKEN is appended to the end of every formatted example.
            alpaca_prompt.format(query, context, answer) + EOS_TOKEN
            for query, context, answer in rows
        ],
    }

In [None]:
#@title #####Loading the dataset (can be any of your choice but I chose a standard one)
# Load the training split and, in the same expression, add the "text" column
# produced by formatting_prompts_func (applied batch-wise).
dataset = load_dataset(
    "yahma/alpaca-cleaned",
    split = "train",
).map(
    formatting_prompts_func,
    batched = True,
)

In [None]:
# Show the dataset as the cell output; after the map above it should also list the "text" column.
dataset

##### Initializing training parameters using LoRA

In [None]:
# Pick the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise.
bf16_ok = is_bfloat16_supported()

# Optimisation hyper-parameters for the short LoRA run.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    max_steps = 100,  # 1 epoch is about 6000 steps for alpaca_cleaned dataset (50k examples)
    # num_train_epochs = 6,
    warmup_steps = 5,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    weight_decay = 0.01,
    optim = "adamw_8bit",
    bf16 = bf16_ok,
    fp16 = not bf16_ok,
    logging_steps = 1,
)

# Supervised fine-tuning on the "text" column built by formatting_prompts_func.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = training_args,
    train_dataset = dataset,
    dataset_text_field = "text",
    dataset_num_proc = 2,
    max_seq_length = max_seq_length,
    packing = False,
)

In [None]:
#@title #####Memory statistics
# Baseline GPU figures taken before training; `start_gpu_memory` and `max_memory`
# are read again by the post-training statistics cell.
bytes_per_gb = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

##### LoRA enabled Fine-Tuning



In [None]:
# Run the fine-tuning; `trainer_stats.metrics` is read by the statistics cell that follows.
trainer_stats = trainer.train()

In [None]:
#@title #####Final memory and time statistics after training
# Compare peak reserved GPU memory against the pre-training baseline
# (`start_gpu_memory`) and the card's capacity (`max_memory`).
train_seconds = trainer_stats.metrics['train_runtime']

used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = (
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in report_lines:
    print(report_line)

In [None]:
#@title ##### Test Inference

from transformers import TextStreamer

# Switch the freshly trained model to inference mode for a quick smoke test.
FastLanguageModel.for_inference(model)

# Same template as in training (`alpaca_prompt`), with the response slot left blank
# so the model fills it in.
test_prompt = alpaca_prompt.format(
    "Say the opposite of the data porvided.",  # query
    "Hello",                                   # data
    "",                                        # output - leave this blank
)
inputs = tokenizer([test_prompt], return_tensors = "pt").to("cuda")

# Stream tokens to the cell output as they are produced; the return value is not needed.
# max_new_tokens can be raised as long as prompt + output stays inside the model's context window.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

##### Save the Model (for efficiency this just saves the LoRA adapters)

In [None]:
# Local saving: the model and its tokenizer go to the same Drive folder,
# which is the folder the reload cell reads from.
adapter_dir = "/content/drive/MyDrive/aries_llama3_8b/aries_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

##### Load the Model from whatever path you saved it to

In [None]:
# Reload settings. Keep max_seq_length plus the number of tokens you want generated
# inside the model's context window.
max_seq_length = 4096
dtype = None
load_in_4bit = True  # again 4-bit, to preserve memory and enhance speed

# Load the fine-tuned model from the path it was saved to
# (here a Drive folder, but any local path works).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/drive/MyDrive/aries_llama3_8b/aries_model",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Initialize it for inference.
FastLanguageModel.for_inference(model)

In [None]:
#@title #####System Prompt
# Same template text as the one used for fine-tuning earlier in the notebook; it is
# redefined here so the RAG half can run after only the "Load the Model" cell.
# Slots, in order: user query, retrieved data, response (left empty at inference).
alpaca_prompt = """Below is a query asked by a user , paired with additonal data that provides further context. Write a response that appropriately answers the query.
You must answer only from the given data and not make up anything. Be as detailed as possible. If the answer is not present in the data, just say "I don't know".

### Query:
{}

### Data:
{}

### Response:
{}"""

# End-of-sequence token string of the reloaded tokenizer.
EOS_TOKEN = tokenizer.eos_token

In [None]:
def formatting_prompts_func(examples):
    """Turn a batch of (instruction, input, output) rows into prompt strings.

    Args:
        examples: batched mapping exposing parallel sequences under
            "instruction", "input" and "output".

    Returns:
        {"text": [...]} with one entry per row, each being `alpaca_prompt`
        filled with that row's three fields followed by `EOS_TOKEN`.
    """
    queries = examples["instruction"]
    contexts = examples["input"]
    answers = examples["output"]
    rendered = [
        alpaca_prompt.format(*fields) + EOS_TOKEN
        for fields in zip(queries, contexts, answers)
    ]
    return {"text": rendered}

##### Document Loading and preprocessing

In [None]:
# Load the source PDF into a list of LangChain documents.
documents = []  # placeholder; replaced by the loader result below
file_path = '/content/drive/MyDrive/mml-book.pdf' # The path to the document you need QA facilities for
loader = PyMuPDFLoader(file_path)
documents = loader.load()

# Alternative: use the code below if you have a whole directory of PDFs instead of a single one.
# It walks the directory tree and extends `documents` with every *.pdf it finds.

# pdf_directory = "/content/drive/MyDrive/"
# documents = []
# for root, dirs, files in os.walk(pdf_directory):
#     for file in files:
#         if file.endswith(".pdf"):
#             file_path = os.path.join(root, file)
#             loader = PyMuPDFLoader(file_path)
#             documents.extend(loader.load())

In [None]:
# Sanity check: report how many documents the loader returned
# (presumably one per PDF page - confirm against the loader's behaviour).
print(f"Number of documents loaded: {len(documents)}")

In [None]:
# Chunks are capped at 500 characters; every other splitter option keeps its library default.
# NOTE(review): the default chunk overlap is not set here - confirm it is sensible for 500-character chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500)

# Split the documents into chunks for ease of search and incorporation into RAG later
split_documents = text_splitter.split_documents(documents)
print(f"Number of chunks: {len(split_documents)}")

In [None]:
#@title ##### Embedding function to convert the text into vectors

# No model name is given, so the library's default embedding model is used.
# You can optionally specify a model name or even use a different embedding function; it works the same way.
# The same `embeddings` object must be used both to build and to load the FAISS index below.
embeddings = HuggingFaceEmbeddings()

##### Creation of vector database to store the embeddings and enable similarity search

In [None]:
# Drive folder where the FAISS index is written by save_local and read back by load_local.
save_directory_faiss = "/content/drive/MyDrive/FAISS_aries"

In [None]:
#@title ##### Create and save the database of your embedded document chunks

# Embed every chunk and build the FAISS index from the results.
vector_db_faiss = FAISS.from_documents(documents = split_documents, embedding = embeddings)
vector_db_faiss.save_local(save_directory_faiss) # Save the vector database to the path specified above

In [None]:
#@title ##### Or load it if you already have saved one and do not wish to create again
# Rebinds `vector_db_faiss` from the saved folder, using the same `embeddings` object as above.
# NOTE(review): allow_dangerous_deserialization=True explicitly opts in to deserializing the
# saved index (presumably pickle-based - confirm); only load index folders you created yourself.
vector_db_faiss = FAISS.load_local(save_directory_faiss, embeddings, allow_dangerous_deserialization=True)

##### Searching the vector database from a user query (for demonstration and insight)

In [None]:
# Read a single query interactively; the demonstration cells that follow search the index with it.
user_input = input("Please enter your query: ")

In [None]:
# similarity_search_with_score returns a list of (chunk, score) pairs, i.e. both the
# chunks and their corresponding scores.
# k is the number of similar chunks to fetch and pass to the model; do not set it too
# high or the prompt will exceed your context window limit.

docs = vector_db_faiss.similarity_search_with_score(user_input, k = 6)

In [None]:
#docs

In [None]:
# Separate the (document, score) pairs into two parallel lists. The documents still
# carry their metadata, which is pulled out in a later cell.
similar_documents = [hit for hit, _ in docs]
scores = [hit_score for _, hit_score in docs]

In [None]:
# similar_documents

In [None]:
# Pull the text and the metadata out of every retrieved document, keeping them in
# two parallel lists, then print each list with 1-based numbering.
page_contents = []
metadata = []
for hit in similar_documents:
    page_contents.append(hit.page_content)
    metadata.append(hit.metadata)

print("Page Contents:")
for position, content in enumerate(page_contents, start=1):
    print(f"Document {position} Content:\n{content}\n")

print("Metadata:")
for position, meta in enumerate(metadata, start=1):
    print(f"Document {position} Metadata:\n{meta}\n")

In [None]:
# page_contents

##### Inference and response by the model

In [None]:
def extract_response(text):
    """Return the part of `text` that follows the first "### Response:" marker.

    Args:
        text: decoded model output (prompt plus generated continuation).

    Returns:
        Everything after the first occurrence of the marker, with leading and
        trailing whitespace removed, or the literal string
        "Response marker not found" when the marker does not occur in `text`.
    """
    # partition() splits on the first occurrence; an empty separator means "no match".
    _, marker, tail = text.partition("### Response:")
    if not marker:
        return "Response marker not found"
    return tail.strip()

def save_conversation_log(conversation_log, filename="conversation_log.txt"):
    """Write the conversation log to a text file, overwriting any existing file.

    Each entry is written on its own line and followed by a blank line and an
    80-dash separator line.

    Args:
        conversation_log: iterable of strings, one per log entry.
        filename: path of the file to write.
    """
    separator = "\n" + "-" * 80 + "\n"
    # Pin the encoding: retrieved PDF text and model output can contain non-ASCII
    # characters, which would raise UnicodeEncodeError wherever the platform
    # default encoding is not UTF-8.
    with open(filename, "w", encoding="utf-8") as file:
        for entry in conversation_log:
            file.write(entry + "\n")
            file.write(separator)  # Add a separator between entries

In [None]:
# Interactive retrieval-augmented chat loop.
# For every query: fetch the most similar chunks from the FAISS store, display them,
# fill `alpaca_prompt` with the query and the retrieved text, generate an answer,
# and record the whole exchange in `conversation_log`.
conversation_log = []
console = Console()

console.print("[bold green]Welcome! Type your query below. Type 'exit' to stop the conversation.[/bold green]")

try:
    while True:
        # Trim surrounding whitespace so that e.g. " exit " still ends the session.
        user_input = input("Please enter your query: ").strip()

        # Check if the user wants to exit the loop
        if user_input.lower() == "exit":
            break

        # A blank line gives nothing to search for; ask again.
        if not user_input:
            continue

        # Add user query to the conversation log
        conversation_log.append(f"User: {user_input}")

        # Perform similarity search with score; each hit is a (document, score) pair
        docs = vector_db_faiss.similarity_search_with_score(user_input, k=6)  # Adjust k as needed

        # Split the hits into parallel lists of documents, scores, texts and metadata
        similar_documents = [doc for doc, _ in docs]
        scores = [score for _, score in docs]
        page_contents = [doc.page_content for doc in similar_documents]
        metadata = [doc.metadata for doc in similar_documents]

        # Display the page contents, metadata, and scores using rich Table
        table = Table(title="Retrieved Documents")
        table.add_column("Page Content", justify="left", style="cyan", no_wrap=False)
        table.add_column("Metadata", justify="left", style="magenta")
        table.add_column("Score", justify="left", style="green")

        for content, meta, score in zip(page_contents, metadata, scores):
            # Wrap cells in Text so bracketed text from the PDF (e.g. "[a, b]") is
            # shown literally instead of being interpreted as rich console markup.
            table.add_row(Text(content), Text(str(meta)), Text(str(score)))

        console.print(table)

        # Join the chunks into plain text. Formatting the list object itself would put
        # its Python repr (brackets, quotes, escaped "\n") into the prompt.
        context = "\n\n".join(page_contents)

        # Prepare inputs for the model
        inputs = tokenizer(
            [
                alpaca_prompt.format(
                    user_input,  # query
                    context,  # data
                    "",  # output - leave this blank
                )
            ],
            return_tensors="pt"
        ).to("cuda")

        # Generate response
        # NOTE(review): max_new_tokens=4096 on top of the prompt can exceed the
        # max_seq_length (4096) the model was loaded with - confirm this is intended.
        outputs = model.generate(**inputs, max_new_tokens=4096, pad_token_id=tokenizer.eos_token_id, use_cache=True)
        # The decoded text contains the prompt too; keep only what follows "### Response:"
        response = extract_response(tokenizer.decode(outputs[0], skip_special_tokens=True))

        # Add model response to the conversation log
        conversation_log.append(f"Model: {response}")

        # Display the user's query and the model's final response using rich Panel and Text
        query_text = Text(f"Query:\n\n{user_input}", justify="left")
        response_text = Text(f"Response:\n\n{response}", justify="left")

        console.print(Panel(query_text, title="User Query", subtitle_align="left"))
        console.print(Panel(response_text, title="Model Response", subtitle_align="left"))

        # Log retrieved documents, metadata, and scores neatly
        conversation_log.append("Retrieved Documents:\n")
        for i, (content, meta, score) in enumerate(zip(page_contents, metadata, scores)):
            conversation_log.append(f"Document {i+1} Metadata:\n{meta}")
            conversation_log.append(f"Document {i+1} Content:\n{content}")
            conversation_log.append(f"Document {i+1} Score:\n{score}")
            conversation_log.append("\n" + "-"*80 + "\n")
finally:
    # Save the conversation log even when the session is interrupted or a step
    # raises, so the exchanges collected so far are not lost.
    save_conversation_log(conversation_log)
    console.print("[bold green]Conversation saved to conversation_log.txt[/bold green]")
