<a href="https://colab.research.google.com/github/aswinaus/LLM_Inference/blob/main/mistral_quant_awq_load_local_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/huggingface/transformers torch accelerate langchain langchain_huggingface datasets autoawq --quiet

In [None]:
!pip install langchain_openai langchain_community chromadb tiktoken --quiet

The code below forces Python to report "UTF-8" as the preferred encoding, regardless of the system's actual locale settings. UTF-8 is a widely used encoding that can represent a vast range of characters from different languages. Enforcing it helps ensure that the code works consistently across platforms and avoids encoding-related errors. It is a common practice for improving compatibility and preventing issues with text handling in Python programs.

In [None]:
import locale

# Report UTF-8 as the preferred encoding regardless of the runtime's locale.
# The replacement accepts the same optional ``do_setlocale`` argument as the real
# function, so callers that use locale.getpreferredencoding(False) keep working.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
from google.colab import userdata
# Read the Hugging Face access token from Colab's user secrets so it is not hard-coded in the notebook.
HUGGING_FACE_TOKEN = userdata.get('HUGGING_FACE_TOKEN')

In [None]:
# Log in to the Hugging Face Hub from the shell; $HUGGING_FACE_TOKEN is filled in from the
# Python variable set in the previous cell (IPython expands $name in `!` commands).
!huggingface-cli login --token $HUGGING_FACE_TOKEN

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
# Base directory on the mounted drive; later cells build the model and database paths from it.
data_dir = '/content/drive/MyDrive'

In [None]:
# Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers
import torch
from langchain_huggingface import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from threading import Thread

The nvidia-smi command is a utility provided by NVIDIA to query and display information about your NVIDIA GPU(s) (Graphics Processing Unit). This includes things like:

GPU model and name
Driver version
GPU utilization
Memory usage
Temperature
Power consumption
Processes running on the GPU

In [None]:
# Print the status of the GPU(s) attached to this runtime.
!nvidia-smi

In [None]:
import textwrap

def wrap_text(text, width=90):
    """Wrap ``text`` to at most ``width`` columns while keeping its line breaks.

    The input is split on newline characters, every resulting line is wrapped
    on its own with ``textwrap.fill``, and the wrapped pieces are joined back
    together with newlines.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
from typing import Tuple, Optional, Union, Dict, Any
from transformers import PreTrainedModel, AutoModel, AutoTokenizer, AutoConfig
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from google.colab import drive

In [None]:
# Mount Google Drive (an earlier cell of this notebook performs the same mount) and set the base data directory.
drive.mount('/content/drive')
data_dir = '/content/drive/MyDrive' # Input a data dir path from your mounted Google Drive

In [None]:
# Directory on Drive that holds the local model files. data_dir is already an
# absolute path, so no extra leading "/" is added (that would yield "//content/...").
quant_path = f"{data_dir}/LLMs/Mistral/Mistral-Small-24B-Instruct-2501"

In [None]:
from transformers import MistralForCausalLM, AutoTokenizer
# Keep the model directory under a second name; the embedding/retrieval cells refer to local_model_path.
local_model_path = quant_path
# Load the tokenizer and the causal-LM weights from the local directory.
local_tokenizer = AutoTokenizer.from_pretrained(quant_path)
# low_cpu_mem_usage=True is the transformers option intended to reduce peak host RAM during loading.
local_model = MistralForCausalLM.from_pretrained(quant_path,low_cpu_mem_usage=True)

`local_model.to(device)` moves all of the model's parameters and buffers to `device`, which is set to `'cuda'` when a GPU is available and to `'cpu'` otherwise. Deep learning models often have a large number of parameters and require significant computational power. GPUs are designed for parallel processing and can significantly speed up the training and inference of deep learning models. By moving the model to the GPU, you leverage its computational capabilities for faster execution.

In [None]:
import torch
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Move the model to the chosen device; as the last expression of the cell the returned model is also displayed.
local_model.to(device)

In [None]:
from datasets import load_dataset
import pandas as pd
# Load the tax-statistics dataset; download_mode="force_redownload" requests a fresh download instead of a cached copy.
dataset = load_dataset("aswinaus/tax_statistics_dataset_by_income_range", download_mode="force_redownload")
# Materialise the 'train' split as a pandas DataFrame and preview its first 10 rows.
df=pd.DataFrame(dataset['train'])
df.head(10)

RAG pipeline implementation

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from langchain_core.runnables import (
    RunnableParallel,
    RunnablePassthrough
)
from langchain.schema.output_parser import StrOutputParser
from typing import Dict, Any, List
from langchain.docstore.document import Document

In [None]:
from getpass import getpass
import os
from google.colab import userdata
# Copy the OpenAI key from Colab's user secrets into the OPENAI_API_KEY environment variable.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

In [None]:
# Show the textual summary of the loaded dataset object.
print(dataset)

In [None]:
# RecursiveCharacterTextSplitter splits documents into overlapping chunks; chunk_size and
# chunk_overlap set the chunk length and the overlap between neighbouring chunks
# (presumably measured in tiktoken tokens because of from_tiktoken_encoder; confirm in the LangChain docs).
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50,
)

In [None]:
#define a pad token for tokenizer and save it.
#local_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#local_model.resize_token_embeddings(len(local_tokenizer))
#local_tokenizer.save_pretrained(quant_path)
#local_model = AutoModel.from_pretrained(quant_path,low_cpu_mem_usage=True, trust_remote_code=True)
#local_model.to(device)

In [None]:
# Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers # This line imports the transformers module.
import torch
from langchain_huggingface import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from threading import Thread

The code below creates a text-generation pipeline that uses the locally loaded pre-trained model and tokenizer to generate text, with parameters controlling the randomness, repetition, length, and format of the generated output.

In [None]:
text_generation_pipeline = transformers.pipeline(
    model=local_model,  # Model object loaded earlier in the notebook
    tokenizer=local_tokenizer,  # Tokenizer loaded from the same local directory as the model
    task="text-generation",  # Specify the task as text generation
    temperature=0.3,  # Temperature parameter for controlling randomness in sampling
    repetition_penalty=1.1,  # Repetition penalty parameter to avoid repeating tokens
    return_full_text=True,  # Return the prompt together with the generated continuation
    max_new_tokens=1000,  # Maximum number of tokens to generate
    do_sample=True  # Flag to use sampling during text generation
)

# Prompt skeleton; the {context} and {question} placeholders are filled in by the PromptTemplate below.
prompt_template = """
### [INST]
Instruction: I will ask you a QUESTION and give you a CONTEXT and you will respond with an answer easily understandable.

### CONTEXT:
{context}

### QUESTION:
{question}

[/INST]
 """

# RAG pipeline

# Create HuggingFacePipeline object wrapping the text generation pipeline
Huggingface_mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Create prompt object from the prompt template with input variables as context and question
prompt = PromptTemplate(
    input_variables=["context", "question"],  # Specify input variables for the prompt
    template=prompt_template,  # Specify the template for the prompt
)

# Compose the chain: fill the prompt, run the local LLM, then parse the output to a plain string
llm_chain = prompt | Huggingface_mistral_llm | StrOutputParser()

In [None]:

# Fixed prompt without any retrieved context, used to query the model directly.
llm_prompt_base = """
### [INST]
Instruction: You are a tax assistant


### QUESTION:
What is Tax Form 990?

[/INST]
 """

In [None]:
# Run the text-generation pipeline directly on the context-free prompt.
response=text_generation_pipeline(llm_prompt_base)
# To show only the answer, keep the text after the closing "[/INST]" marker, e.g.:
#r=response[0].get('generated_text').split("[/INST]")[1]
#print(r.strip())

In [None]:
# Display the raw pipeline output.
response

In [None]:
import pandas as pd

# Assume 'df' is your Pandas DataFrame
train_ratio = 0.8  # Proportion of data for training
validation_ratio = 0.1  # Proportion of data for validation
test_ratio = 0.1  # Proportion of data for testing

# The test split is "whatever is left over", so the three ratios have to add up to 1.
assert abs(train_ratio + validation_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# Shuffle the rows with a fixed seed.
# NOTE: this rebinds `df`, so re-running the cell reshuffles the already shuffled frame.
df = df.sample(frac=1, random_state=42)

# Calculate split indices (row positions in the shuffled frame)
train_index = int(train_ratio * len(df))
validation_index = int((train_ratio + validation_ratio) * len(df))

# Split the DataFrame by position
train_df = df.iloc[:train_index]
validation_df = df.iloc[train_index:validation_index]
test_df = df.iloc[validation_index:]

# Every row must land in exactly one split.
assert len(train_df) + len(validation_df) + len(test_df) == len(df)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(validation_df)}")
print(f"Testing set size: {len(test_df)}")

In [None]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions

# os and shutil are used below; import them here so the cell also runs on a fresh kernel.
import os
import shutil

# Directory on Drive reserved for the vector database.
database_path = f"{data_dir}/RAG/VectorDB/chroma_db_RAG_quantnew"  # Update with your actual database path

# Delete the existing database, then recreate an empty directory
if os.path.exists(database_path):
    shutil.rmtree(database_path)
os.makedirs(database_path, exist_ok=True)

# Initialize Chroma client
# NOTE(review): chromadb.Client() is given no path, so this collection presumably lives in
# memory only and does not write to database_path; confirm before relying on persistence.
client = chromadb.Client()

# Define embedding function (choose one or configure your own)
# Default Sentence Transformers embedding function
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction()

# Create a collection (database table)
collection_name = "my_table_collection"
collection = client.get_or_create_collection(name=collection_name, embedding_function=sentence_transformer_ef)

# Chunking and storing data
chunk_size = 1 # Process one row at a time
for i in range(0, len(df), chunk_size):
    chunk = df.iloc[i:i + chunk_size]
    # One document per row, so `documents` and `ids` always have the same length
    # (also when chunk_size is raised above 1).
    documents = [chunk.iloc[[j]].to_string() for j in range(len(chunk))]
    ids = [f"row_{i+j}" for j in range(len(chunk))] # Create unique ids for each row
    collection.add(
        documents=documents,
        ids=ids
    )

#Example query
results = collection.query(
    query_texts=["value 2"],
    n_results=1
)
print(results)


In [None]:
from typing import List
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
import shutil
class MyEmbedding:
    """Expose a SentenceTransformer through ``embed_documents`` / ``embed_query``.

    Instances are handed to the Chroma vector store elsewhere in this notebook
    as its embedding function.
    """

    def __init__(self, model):
        """Build the SentenceTransformer from ``model``, which is passed through unchanged."""
        self.model = SentenceTransformer(model, trust_remote_code=True)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Encode every text on its own and return one plain-list vector per text."""
        vectors: List[List[float]] = []
        for passage in texts:
            vectors.append(self.model.encode(passage).tolist())
        return vectors

    def embed_query(self, query: str) -> List[float]:
        """Encode a single query string and return its vector as a plain list."""
        return self.model.encode(query).tolist()

# Location of the persisted Chroma database on the mounted drive.
database_path = f"{data_dir}/RAG/VectorDB/chroma_db_RAG_quantnew"  # Update with your actual database path

# Start from an empty directory: drop any previous database, then recreate the folder.
if os.path.exists(database_path):
    shutil.rmtree(database_path)
os.makedirs(database_path, exist_ok=True)

def generate_embeddings(data, max_rows=10):
    """Index rows of ``data`` in the persisted Chroma collection ``coll_cosine``.

    Each row is rendered as one "column: value" line per column, split into
    chunks, embedded with ``MyEmbedding(local_model_path)`` and written to the
    Chroma database under ``database_path``.

    Args:
        data: A pandas DataFrame, a mapping that has a ``"train"`` entry
            (such as the dataset loaded earlier in this notebook), or anything
            else ``pd.DataFrame`` accepts.
        max_rows: Number of leading rows to index; ``None`` indexes every row.

    Returns:
        list: The chunk documents that were written to the collection; an
        empty list when there was nothing to index.
    """
    if isinstance(data, pd.DataFrame):
        frame = data
    elif hasattr(data, "keys") and "train" in data:
        frame = pd.DataFrame(data["train"])
    else:
        frame = pd.DataFrame(data)
    if max_rows is not None:
        frame = frame.head(max_rows)

    # Iterating a DataFrame directly yields its column names, so walk the rows explicitly.
    row_texts = [
        "\n".join(f"{column}: {value}" for column, value in row.items())
        for _, row in frame.iterrows()
    ]
    if not row_texts:
        return []

    splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=10)
    # create_documents takes a list of texts; a bare string would be consumed character by character.
    chunks = splitter.create_documents(row_texts)

    # Build the embedding model and the vector store once for all chunks.
    vector_store = Chroma.from_documents(
        chunks,
        persist_directory=database_path,
        collection_name='coll_cosine',
        collection_metadata={"hnsw:space": "cosine"},
        embedding=MyEmbedding(model=local_model_path),
    )
    # Flush to disk when the wrapper still offers an explicit persist().
    if hasattr(vector_store, "persist"):
        vector_store.persist()
    return chunks


In [None]:
# Release cached GPU memory that PyTorch is holding but not currently using.
torch.cuda.empty_cache()
# Populate the Chroma collection that the retrieval cells below query.
generate_embeddings(dataset)

In [None]:
# Input your question in string type and the relative path of your own local model.
def retrieve(user_query, model_path, k=10):
    """Return the text of the best-matching chunk for ``user_query``.

    Opens the persisted ``coll_cosine`` collection under ``database_path``,
    embeds the query with ``MyEmbedding(model_path)`` and runs a similarity
    search. The full scored result list is printed for inspection.

    Args:
        user_query: The question, as a string.
        model_path: Path of the local model used to embed the query.
        k: Number of scored results to fetch (and print).

    Returns:
        str: ``page_content`` of the first result, or an empty string when the
        search returns nothing (for example, when the collection is empty).
    """
    embedding_model = MyEmbedding(model_path)

    vector_store = Chroma(embedding_function=embedding_model,
                          collection_name='coll_cosine',
                          collection_metadata={"hnsw:space": "cosine"},
                          persist_directory=database_path)

    results = vector_store.similarity_search_with_score(user_query, k)
    print(results)
    if not results:
        # Nothing indexed or nothing matched: avoid an IndexError on results[0].
        return ""
    return results[0][0].page_content

In [None]:
# Example lookup against the indexed collection, embedding the query with the local model.
retrieve("For the State of AL can you get me the No of returns for Size of adjusted gross income $50,000 under $75,000",local_model_path)