# Getting Knowledge-Base

In [None]:
import fitz # it's pymupdf library
from tqdm.auto import tqdm


pdf_path = "/home/ai/TAC2-lbz/MES5448_MES7048_user_manual_8_4_0_7_en.pdf"

# by this function our text would be cleaner and better for our LLM
def text_formatter(text:str) -> str:
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

# now for opening the pdf and reading it we want this function:
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4, # 1 token is about 4 words. this will be need for passing to LLM
                                "text": text })
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
# pages_and_texts[100]

In [None]:
# create dataFrames from our pages and texts
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

In [None]:
df.describe().round(2)

# Data PreProcess

token count is important because we can't use embedding models with infinite tokens, and also LLMs. so fo choosing best embedding model and best LLM we have to know token count.

In [None]:
from spacy.lang.en import English

nlp = English()

nlp.add_pipe("sentencizer")

for item in tqdm(pages_and_texts):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])

In [None]:
import random 
random.sample(pages_and_texts, k=1)

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

Lets chunk our large sentences into smaller one

splitting sentences in group of 10 sentences 

it's called text splitting and libraries like **LangChain** can do this for us

Goal of doing this is to be more easier to filter our sentences and also much easier for our embedding model.

In [None]:
# Defining the group size
num_sentences_chunk_size = 10

def split_list(input_list:list[str], slice_size:int=num_sentences_chunk_size) -> list[list[str]]:
    return [input_list[i: i + slice_size] for i in range(0, len(input_list), slice_size)]

In [None]:
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(item["sentences"])
    item["number_of_chunks"] = len(item["sentence_chunks"])

In [None]:
# lets see what we are doing :D

random.sample(pages_and_texts, k=1)

In [None]:
df= pd.DataFrame(pages_and_texts)
df.describe().round(2)

now we want to have each chunk as a dictionary item  not in as a list of sentence chunks

In [None]:
import re # re is a python library and stands for regex. regex also stands for regular expression XD

pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        # join sentences together into a paragraph
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # '.A' -> ', A'
        
        chunk_dict["sentence_chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in (joined_sentence_chunk.split(" "))])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1  token has 4 chars
        
        pages_and_chunks.append(chunk_dict)
        
len(pages_and_chunks) # to see how many pages and chunks we have 

In [None]:
random.sample(pages_and_chunks, 1)

In [None]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

lets filter the dataFrame for rows under the 30 tokens. because they are not much useful and they didn't help us

In [None]:
min_token_length = 30
pages_and_chunks_over_min_token_length = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_length[:2]

In [None]:
df = pd.DataFrame(pages_and_chunks_over_min_token_length)
df.describe().round(2)

# Embedding our text chunks

to see what is embeddings and why we are using: https://vickiboykis.com/what_are_embeddings/

all-mpnet-base-v2 model : https://huggingface.co/sentence-transformers/all-mpnet-base-v2

In [None]:
from sentence_transformers import SentenceTransformer

# this would be our embedding model:

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cpu")

In [None]:
embedding_model.to("cuda") # uding gpu for faster embedding 

for item in tqdm(pages_and_chunks_over_min_token_length):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

In [None]:
random.sample(pages_and_chunks_over_min_token_length, k=1) # see what we get

In [None]:
import numpy as np
np.shape(pages_and_chunks_over_min_token_length[100]["embedding"])

In [None]:
# saving embeddings to a file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_length)
embedding_df_path= "/home/ai/TAC2-lbz/text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embedding_df_path, index=False)

In [None]:
# import reading csv
text_chunks_and_embeddings_df_load = pd.read_csv(embedding_df_path)
text_chunks_and_embeddings_df_load.head()

# RAG   Retrieval-Augmented Generation

In [None]:
import random
import numpy as np
import pandas as pd
import torch

device = "cuda" if torch.cuda.is_available else "cpu" # if gpu is available we choose it and if not we chose cpu as our device

# importing text and embeddings
text_chunks_and_embeddings_df = pd.read_csv("/home/ai/TAC2-lbz/text_chunks_and_embeddings_df.csv")

# now converting embedding column to a np.array
text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# converting embedding into a torch.tensor
embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df["embedding"].tolist(), axis=0), dtype=torch.float32).to(device=device)

# converting text and embeddings to the list of dictionaries
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")

text_chunks_and_embeddings_df # to see what i just created

In [None]:
embeddings.shape # just seeing what i have created :)

In [None]:
# create model

from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device) # embedding our query with same model as we embedded our knowledge-base


In [None]:
# Defining the query
query = "What is the firmware version synchronized with Version 4.0 of the MES5448 and MES7048 operation manual?"
print(f"query: {query}")

# embed the query
query_embedding = embedding_model.encode(query, convert_to_tensor=True).to("cuda")

# similarity scores
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]

# Getting top-k results
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

In [None]:
pages_and_chunks[382]

# LLM 

In [None]:
!nvidia-smi # to check how much gpu memory is available for choosing the model

LLM model that i chose : Gemma-7b-it https://huggingface.co/google/gemma-7b-it

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 

# quantization config 
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                        bnb_4bit_compute_dtype=torch.float16)


if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")


model_id = "google/gemma-7b-it" 

#tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id)


[INFO] Using attention implementation: sdpa


Loading checkpoint shards: 100%|██████████| 4/4 [00:56<00:00, 14.11s/it]


In [7]:
!nvidia-smi

Sat Aug 17 08:28:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:01:00.0 Off |                  N/A |
|  0%   32C    P8              22W / 370W |  24236MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=3072, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear(in_features=24576, out_features=3072, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((3072,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((3072,), eps=1

In [9]:
input_text = "give me a description about MES5448"

# prompt

template = [
    {
    "role": "user",
    "content": input_text}
]

prompt= tokenizer.apply_chat_template(conversation=template, tokenize=False, add_generation_prompt=True)
print(f"prompt: \n{prompt}")

prompt: 
<bos><start_of_turn>user
give me a description about MES5448<end_of_turn>
<start_of_turn>model



In [10]:
input_ids = tokenizer(input_text, return_tensors="pt").to("cpu") # failed when using gpu

output = llm_model.generate(**input_ids, max_new_tokens=256)



tensor([[     2,  20346,    682,    476,   5966,   1105,  69841, 235308, 235310,
         235310, 235321, 235280,    578,  69841, 235308, 235310, 235310, 235321,
         235305, 235265,    109,    688,  41018, 235308, 235310, 235310, 235321,
         235280,    688,    109,    651,  69841, 235308, 235310, 235310, 235321,
         235280,    603,    476,   3178, 235290,   7511, 235269,   3178, 235290,
          38943, 235269,   3821, 235290,  29659, 205940,    675,  17295, 182939,
           1582,    685, 108657, 235269,  53751, 235269,    578,  12345,  41742,
         235265,   1165,   5119,    476, 235248, 235274, 235318, 235290,   2428,
          45000, 235288,  22638, 235269,    476,  26168,  16333,   6884,   1812,
         235269,    578,    476,   5396,   3001,    576, 182939, 235265,    714,
          69841, 235308, 235310, 235310, 235321, 235280,    603,   1578, 235290,
         154199,    604,    476,   8080,    576,   3178, 235290,   7511,  32982,
           8557, 235269,   3

In [None]:
text_output = tokenizer.decode(output[0])

text_output