<a href="https://colab.research.google.com/github/aswinaus/RAG/blob/main/RAG_DeepSeekR1_GGUF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/huggingface/transformers torch accelerate bitsandbytes langchain

The code below forces Python to always report "UTF-8" as the preferred encoding, regardless of the runtime's actual locale settings. UTF-8 is a widely used encoding that can represent characters from virtually every language. Enforcing it helps the notebook behave consistently across platforms and avoids encoding-related errors when handling text.

In [None]:
import locale
# Replace the lookup function so that every caller asking for the preferred
# encoding receives "UTF-8", whatever the runtime's locale is configured to.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# pyngrok is a Python wrapper for ngrok, a tool that exposes a local web server to the public internet.
# Useful for sharing work, testing webhooks, or building demos that must be reachable from outside.
!pip install pyngrok --quiet
# FastAPI is a modern, high-performance web framework for building APIs with Python 3.6+,
# based on standard Python type hints.
# NOTE(review): nest-asyncio is presumably installed so a server loop can run inside the notebook's
# already-running event loop — confirm against the serving code.
!pip install fastapi nest-asyncio --quiet
# uvicorn is an ASGI (Asynchronous Server Gateway Interface) web server: it runs asynchronous Python
# web applications and handles many requests concurrently. It is commonly paired with FastAPI.
!pip install uvicorn --quiet
# LangChain community integrations and the LangChain/Hugging Face integration package.
!pip install langchain-community --quiet
!pip install -U langchain-huggingface --quiet

In [None]:
from google.colab import userdata
# Read the Hugging Face access token through google.colab.userdata
# (stored under the name 'HUGGING_FACE_TOKEN') instead of hard-coding it.
HUGGING_FACE_TOKEN = userdata.get('HUGGING_FACE_TOKEN')

In [None]:
# Log the Hugging Face CLI in; $HUGGING_FACE_TOKEN is filled in from the Python variable of that name.
!huggingface-cli login --token $HUGGING_FACE_TOKEN


In [None]:
# Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer , BitsAndBytesConfig
import transformers
import torch
from langchain_huggingface import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import PromptTemplate
from threading import Thread

# Target device for tensors: the GPU when CUDA is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

The `nvidia-smi` command is a utility provided by NVIDIA to query and display information about your NVIDIA GPU(s) (graphics processing units). This includes:

- GPU model and name
- Driver version
- GPU utilization
- Memory usage
- Temperature
- Power consumption
- Processes running on the GPU

In [None]:
!nvidia-smi

In [None]:
import textwrap

def wrap_text(text, width=90): #preserve_newlines
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

In [None]:
from typing import Tuple, Optional, Union, Dict, Any
from transformers import PreTrainedModel, AutoModel, AutoTokenizer, AutoConfig
from transformers.tokenization_utils_base import PreTrainedTokenizerBase

The function below provides a robust way to load a pre-trained model: it first tries to load the model with its original (quantized) configuration and gracefully falls back to a non-quantized load if the quantization type is not supported. This helps ensure compatibility and flexibility when working with different models and environments.

In [None]:
def load_model_with_quantization_fallback(
    model_name: str = "/content/DeepSeek-R1-GGUF",
    trust_remote_code: bool = True,
    device_map: Optional[Union[str, Dict[str, Any]]] = None,
    **kwargs
) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
    """Load a causal language model and its tokenizer, retrying without quantization.

    The model is first loaded with the configuration found at ``model_name``.
    If that raises a ``ValueError`` whose message contains
    ``"Unknown quantization type"``, the ``quantization_config`` attribute is
    removed from the configuration and the load is retried.

    Args:
        model_name: Local path or hub id of the model to load.
        trust_remote_code: Forwarded to the config, model and tokenizer loaders.
        device_map: Device placement forwarded to ``from_pretrained``. When
            ``None`` it resolves to ``"cuda"`` if CUDA is available, otherwise
            ``"cpu"``.
        **kwargs: Extra keyword arguments forwarded to the model's
            ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: Re-raised when loading fails for any reason other than an
            unsupported quantization type, or when the configuration itself
            could not be loaded.
        Exception: Whatever the fallback load raises is printed and re-raised.
    """
    # Local imports keep this cell runnable without relying on earlier cells.
    import torch
    from transformers import AutoModelForCausalLM

    # Previously this was read from a module-level global defined in a later
    # cell; resolving it here removes that hidden ordering dependency.
    if device_map is None:
        device_map = "cuda" if torch.cuda.is_available() else "cpu"

    # Bound before the try so the except branch can tell whether the config
    # was loaded at all (otherwise it would hit an unbound local).
    config = None
    try:
        config = AutoConfig.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code
        )
        # NOTE(review): the first attempt uses "DeepSeek-R1" while the fallback
        # passes "DeepSeek-R1-GGUF"; kept as-is — confirm which value is intended.
        config.model_type = "DeepSeek-R1"
        # A causal-LM head is required: downstream code reads `.logits` and
        # generates text, which the headless AutoModel cannot provide.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            config=config,
            model_type="DeepSeek-R1",
            trust_remote_code=trust_remote_code,
            device_map=device_map,
            **kwargs
        )
        # Pass trust_remote_code here too, consistent with the fallback path.
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=trust_remote_code
        )
        print("Model loaded successfully with original configuration")
        return model, tokenizer
    except ValueError as e:
        if config is None or "Unknown quantization type" not in str(e):
            print(f"Unexpected error during model loading: {str(e)}")
            raise

        print(
            "Quantization type not supported directly. "
            "Attempting to load without quantization..."
        )

        # Drop the unsupported quantization settings before retrying.
        if hasattr(config, "quantization_config"):
            delattr(config, "quantization_config")

        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                config=config,
                model_type="DeepSeek-R1-GGUF",
                trust_remote_code=trust_remote_code,
                device_map=device_map,
                **kwargs
            )
            tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                trust_remote_code=trust_remote_code
            )
            print("Model loaded successfully without quantization")
            return model, tokenizer

        except Exception as inner_e:
            print(f"Failed to load model without quantization: {str(inner_e)}")
            raise

In [None]:
import torch
import os
# Pick the device to load the model on: the GPU when CUDA is available, otherwise the CPU.
device_map = "cuda" if torch.cuda.is_available() else "cpu"

!pip install huggingface_hub hf_transfer --quiet
# Optional: opt in to the hf_transfer backend for faster downloads.
# NOTE(review): this presumably has to be set before huggingface_hub is first imported in the
# session to take effect — confirm.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import snapshot_download
snapshot_download(
  repo_id = "unsloth/DeepSeek-R1-GGUF",
  local_dir = "DeepSeek-R1-GGUF",
  allow_patterns = ["*UD-IQ1_S*"], # Select quant type UD-IQ1_S for 1.58bit
)
# NOTE(review): local_dir is relative, while the loader's default model_name is
# "/content/DeepSeek-R1-GGUF"; this presumably relies on the working directory being /content — verify.
import torch
# Load the model and tokenizer using the loader's default arguments.
model, tokenizer = load_model_with_quantization_fallback()

In [None]:
!pip install datasets langchain langchain_community langchain_openai chromadb --quiet

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from langchain_core.runnables import (
    RunnableParallel,
    RunnablePassthrough
)
from langchain.schema.output_parser import StrOutputParser

In [None]:
from getpass import getpass
import os
from google.colab import userdata
# Export the OpenAI key (read through google.colab.userdata) as an environment variable.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

In [None]:
from datasets import load_dataset
import pandas as pd
# download_mode="force_redownload" fetches the dataset again rather than reusing a cached copy.
dataset = load_dataset("aswinaus/tax_statistics_dataset_by_income_range", download_mode="force_redownload")
# Build a pandas DataFrame from the 'train' split.
df=pd.DataFrame(dataset['train'])

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Base directory on the mounted Drive; the Chroma persist directory is built from this path.
data_dir = '/content/drive/MyDrive'

In [None]:
from typing import Dict, Any, List
from langchain.docstore.document import Document

In [None]:
# Convert DatasetDict to LangChain Documents
def create_langchain_documents(dataset: Dict[str, Any]) -> List[Document]:
    """Build one LangChain ``Document`` per row of the dataset's 'train' split.

    For every row, ``page_content`` is the row rendered as "column: value"
    lines joined by newlines, and ``metadata`` is a copy of the full row
    (every column is kept).
    """
    return [
        Document(
            page_content="\n".join(
                [f"{column}: {value}" for column, value in record.items()]
            ),
            metadata=record.copy(),
        )
        for record in dataset['train']  # 'train' is the split name used here
    ]

In [None]:
# Turn every dataset row into a LangChain Document.
all_documents = create_langchain_documents(dataset)
# Splitter built via from_tiktoken_encoder, so chunk_size and chunk_overlap are measured in tokens.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50,
)

In [None]:
# Split each document on its own and collect all resulting chunks in one flat list.
pages = []
for document in all_documents:
  pages.extend(text_splitter.split_documents([document]))

In [None]:
# create vector store with Chroma
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata # import filter_complex_metadata

# Embed the chunks with OpenAI embeddings and store them in a Chroma collection persisted under data_dir.
# NOTE(review): filter_complex_metadata is imported above but not applied to `pages` — confirm that all
# metadata values are types Chroma accepts.
vectordb = Chroma.from_documents(documents=pages, embedding=OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]),persist_directory=f"{data_dir}/RAG/VectorDB/chroma_db_RAG_Income_Tax")
# NOTE(review): explicit persist() may be deprecated or unnecessary with newer Chroma versions — verify.
vectordb.persist()
# Expose the store as a retriever with its default search settings.
retriever = vectordb.as_retriever()

In [None]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

In [None]:
def encode_prompt(inputs: Dict[str, Any]) -> torch.Tensor:
    """Tokenize ``inputs["prompt"]`` and return the resulting token-ID tensor.

    Uses the module-level ``tokenizer`` and ``device``. The prompt text and the
    token IDs are printed for debugging. The return value is the ``input_ids``
    tensor itself, not a dict.
    """
    input_text = inputs["prompt"]
    # return_tensors="pt" yields PyTorch tensors; move the IDs to the selected device.
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    print(f"Input text: {input_text}")
    print(f"Input IDs: {input_ids}")
    # The `inputs` dict is left unmodified; only the tensor is handed back.
    return input_ids # Return the tensor directly

In [None]:
!pip install langchain_huggingface --quiet

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer , BitsAndBytesConfig
import transformers

In [None]:
from langchain_core.runnables import RunnableSequence
# RAG prompt: the retrieved context and the user's question are substituted into the placeholders.
template = """You are an AI language model Accounting assistant.Answer the following question based on this context:
{context}
Question: {question}
"""
# Chat prompt template built from the string above; the RAG chain formats prompts through it.
prompt_creator = ChatPromptTemplate.from_template(template) # moved template to prompt_creator


In [None]:
#Creating a RAG Pipeline
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableMap
from langchain_core.output_parsers import StrOutputParser

# RAG
# NOTE: `final_rag_chain` is assigned again by a later cell, which replaces the chain defined here.
# `template` below repeats the earlier prompt string; the chain itself does not use it.
template = """You are an AI language model Accounting assistant.Answer the following question based on this context:
{context}
Question: {question}
"""

prompt = prompt_creator
llm=model
final_rag_chain=RunnableSequence(
    # Step 1: add a "context" key holding the formatted documents retrieved for the question.
    RunnablePassthrough.assign(
        context=lambda x: format_docs(retriever.get_relevant_documents(x["question"])),
    )
    # Step 2: NOTE(review): this mapping receives the dict produced by step 1, so the retriever is
    # presumably invoked with a dict rather than the question string — confirm.
    |{"context": retriever | format_docs, "question": RunnablePassthrough()}
    # Step 3: encode_prompt reads inputs["prompt"], but step 2 only produces "context" and "question".
    | RunnableMap({"prompt": encode_prompt})
    # Step 4: call the raw model on the output of step 3, then parse the result as a string.
    | model
    | StrOutputParser()
)

# Earlier alternative kept for reference: build the prompt here and use ChatOpenAI as the LLM.
#prompt = ChatPromptTemplate.from_template(template)
#llm = model#ChatOpenAI(temperature=0, openai_api_key=os.environ["OPENAI_API_KEY"])



In [None]:
#Creating a RAG Pipeline
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableMap
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import HumanMessage
#from langchain.chains import RunnableSequence # Add this import

# RAG
# `template` repeats the earlier prompt string; the chain formats prompts through `prompt_creator`.
template = """You are an AI language model Accounting assistant.Answer the following question based on this context:
{context}
Question: {question}
"""

prompt = prompt_creator
llm = model#ChatOpenAI(temperature=0, openai_api_key=os.environ["OPENAI_API_KEY"])


def _generate_answer(inputs):
    """Generate a completion for ``inputs["input_ids"]`` and return it as text.

    A single forward pass only yields per-position logits (floats), which
    cannot be passed to ``tokenizer.decode``. ``generate`` produces token IDs,
    and only the newly generated IDs are decoded so the prompt is not echoed
    back in the answer.
    """
    input_ids = inputs["input_ids"]
    output_ids = llm.generate(
        input_ids=input_ids,
        max_new_tokens=1000,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    # The generated sequence starts with the prompt tokens; keep only what follows them.
    answer_ids = output_ids[0][input_ids.shape[-1]:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True)


final_rag_chain = RunnableSequence(
    # Retrieve documents for the question and add them as a formatted "context" string.
    RunnablePassthrough.assign(
        context=lambda x: format_docs(retriever.invoke(x["question"])),
    )
    # Fill the prompt template; without this step the "prompt" key would be missing downstream.
    | RunnablePassthrough.assign(
        prompt=lambda x: prompt_creator.format(context=x["context"], question=x["question"])
    )

    # Debug output; each of these steps adds a key whose value is None (the result of print).
    | RunnablePassthrough.assign(debug_context=lambda x: print(f"Context before prompt: {x['context']}"))
    | RunnablePassthrough.assign(debug_question=lambda x: print(f"Question before prompt: {x['question']}"))

    | RunnablePassthrough.assign(debug_prompt=lambda x: print(f"Prompt after prompt: {x['prompt']}"))
    # Tokenize the prompt text; the output dict holds the token-ID tensor under "prompt".
    | RunnableMap({"prompt": lambda x: encode_prompt({"prompt": x['prompt']})})
    # Expose the tensor under the name the generation step reads.
    | RunnablePassthrough.assign(input_ids=lambda x: x["prompt"])
    # Generate the answer and decode it to text.
    | _generate_answer
    | StrOutputParser()
)



In [None]:
question="What is the number of joint returns for the state of AL for income range $100,000 under $200,000?"


In [None]:
# Run the RAG chain; its first step reads x["question"], so the input must be a dict with that key.
final_rag_chain.invoke({"question":question})
# Passing the bare string is kept for reference only.
#final_rag_chain.invoke(question)