<a href="https://colab.research.google.com/github/aswinaus/LLM_Inference/blob/main/RAG_with_Evaluation_and_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import os

In [None]:
!pip install langchain langchain_community langchain_openai chromadb pymupdf nest_asyncio --quiet

from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from langchain_core.runnables import (
    RunnableParallel,
    RunnablePassthrough
)
from langchain.schema.output_parser import StrOutputParser

In [17]:
import nest_asyncio
nest_asyncio.apply()

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
import pymupdf

In [20]:
# Download Data
data_dir = '/content/drive/MyDrive' # Input a data dir path from your mounted Google Drive

In [21]:
doc = pymupdf.open(f"{data_dir}/RAG/data/TP/Intel_Financial_Statements_Year_Ended_2017.pdf")

In [22]:
#Printing the content to validate
for page in doc:
    text = page.get_text()
    #print(text)

In [23]:
import chromadb
from langchain.embeddings import OpenAIEmbeddings

In [24]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
pages=[]
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50
)
#You will need to run this one at a time for now.
#loader = PyMuPDFLoader(f"{data_dir}/RAG/data/10k/lyft_10k_2023.pdf")
loader = PyMuPDFLoader(f"{data_dir}/RAG/data/TP/Intel_Financial_Statements_Year_Ended_2017.pdf")
# load_and_split uses RecursiveCharacterTextSplitter by default
pages_to_persist = loader.load_and_split(text_splitter)
pages.extend(pages_to_persist)

In [25]:
# split the pages into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
splits = text_splitter.split_documents(pages)

In [26]:
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [27]:
# create vector store with Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata # import filter_complex_metadata

vectordb = Chroma.from_documents(documents=pages, embedding=OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]),persist_directory=f"{data_dir}/RAG/VectorDB/chroma_db_RAG")
vectordb.persist()
retriever = vectordb.as_retriever()

  vectordb.persist()


In [28]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [29]:
#Creating a RAG Pipeline
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# RAG
template = """You are an AI language model Accounting assistant.Answer the following question based on this context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
llm = ChatOpenAI(temperature=0, openai_api_key=os.environ["OPENAI_API_KEY"])
final_rag_chain = (
    #{"context": retriever | format_docs, "question": RunnablePassthrough()}

    RunnablePassthrough.assign(
        context=lambda x: format_docs(vectordb.similarity_search(x["question"], k=10)),
    )

    #| RunnablePassthrough.assign(debug_context=lambda x: print(f"Context before prompt: {x['context']}"))
    | prompt
    | llm
    | StrOutputParser()
)

In [30]:
question="Can you let me know the Identified intangible assets subject to amortization and show the difference between 2016 and 2017?"

In [31]:
final_rag_chain.invoke({"question":question})

'Identified intangible assets subject to amortization for 2016 were $8,686 million, and for 2017 were $10,577 million. The difference between 2016 and 2017 is $1,891 million.'

In [33]:
questions = [
    "Can you get the total amount of Goodwill and Identified Intangible Assets?",
    "How much did Intangibles such as Goodwill and other identified intangible assets did Intel gain by acquiring Altera in millions?",
    "Can you list all the Intel Goodwill activities for year 2017 along with figures in millions?",
    "Can let me know how much was spent on Data Center Group along for 2016 and 2017 and show the difference between 2016 and 2017?",
    "Can you let me know the Identified intangible assets subject to amortization and show the difference between 2016 and 2017?",
    ]
ground_truth = [
    "The total amount of Goodwill is $10,278 million, and the total amount of Identified Intangible Assets is $7,566 million.",
    "Intel gained $13,014 million in intangibles such as Goodwill and other identified intangible assets by acquiring Altera.",
    "Sure, here are the Intel Goodwill activities for the year 2017 along with figures in millions:- Client Computing Group: $4,356;- Data Center Group: $5,421;- Internet of Things Group: $1,126;- Programmable Solutions Group: $2,490;- All other: $10,996;Total: $24,389 million",
    "In 2016, the amount spent on the Data Center Group was $7,520 million, and in 2017, it was $8,395 million. The difference between the two years is $875 million, with an increase in spending on the Data Center Group from 2016 to 2017.",
    "The Identified intangible assets subject to amortization for 2016 were $8,686 million, and for 2017, they were $10,577 million. The difference between 2016 and 2017 is $1,891 million.",
    ]

In [None]:
!pip install datasets --quiet
from datasets import Dataset

In [35]:
answers  = []
contexts = []

# traversing each question and passing into the chain to get answer from the system
for question in questions:
    answers.append(final_rag_chain.invoke({"question":question}))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(question)])

# Preparing the dataset
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truth
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

dataset.to_pandas()

  contexts.append([docs.page_content for docs in retriever.get_relevant_documents(question)])


Unnamed: 0,question,answer,contexts,ground_truth
0,Can you get the total amount of Goodwill and I...,"The total amount of Goodwill is $10,278 millio...",[recognized as follows:\n(In Millions)\nShort-...,"The total amount of Goodwill is $10,278 millio..."
1,How much did Intangibles such as Goodwill and ...,"Intel gained a total of $13,014 million in int...",[The fair values of the assets acquired and li...,"Intel gained $13,014 million in intangibles su..."
2,Can you list all the Intel Goodwill activities...,"- Data Center Group: $2,404 million\n- Interne...","[—\n$\n4,356\nData Center Group\n2,404\n2,831\...","Sure, here are the Intel Goodwill activities f..."
3,Can let me know how much was spent on Data Cen...,"In 2016, the amount spent on the Data Center G...",[Disaggregated net revenue for each period was...,"In 2016, the amount spent on the Data Center G..."
4,Can you let me know the Identified intangible ...,Identified intangible assets subject to amorti...,"[(3,634) $\n12,745\nDecember 31, 2016\n(In Mil...",The Identified intangible assets subject to am...


In [36]:
!pip install ragas --quiet
import ragas

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/177.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.1/177.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#!git clone https://github.com/aswinaus/rag_dataset_ragas.git
#%cd rag_dataset_ragas

In [None]:
#from datasets import load_dataset
#dataset = load_dataset('json', data_files='RAGDataset.json')
#dataset = dataset['train']
#print(dataset)

In [37]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

result = evaluate(
    dataset=dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

df = result.to_pandas()
df

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_precision,context_recall,faithfulness,answer_relevancy
0,Can you get the total amount of Goodwill and I...,[recognized as follows:\n(In Millions)\nShort-...,"The total amount of Goodwill is $10,278 millio...","The total amount of Goodwill is $10,278 millio...",1.0,0.5,0.5,0.972272
1,How much did Intangibles such as Goodwill and ...,[The fair values of the assets acquired and li...,"Intel gained a total of $13,014 million in int...","Intel gained $13,014 million in intangibles su...",1.0,0.0,0.0,0.945473
2,Can you list all the Intel Goodwill activities...,"[—\n$\n4,356\nData Center Group\n2,404\n2,831\...","- Data Center Group: $2,404 million\n- Interne...","Sure, here are the Intel Goodwill activities f...",0.0,0.166667,0.0,0.875948
3,Can let me know how much was spent on Data Cen...,[Disaggregated net revenue for each period was...,"In 2016, the amount spent on the Data Center G...","In 2016, the amount spent on the Data Center G...",1.0,0.0,0.0,0.946152
4,Can you let me know the Identified intangible ...,"[(3,634) $\n12,745\nDecember 31, 2016\n(In Mil...",Identified intangible assets subject to amorti...,The Identified intangible assets subject to am...,1.0,0.333333,0.333333,0.960631


In [None]:
!pip install git+https://github.com/huggingface/transformers torch accelerate bitsandbytes langchain --quiet

In [None]:
!pip install langchain-huggingface
!pip install --upgrade transformers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import TextIteratorStreamer
from langchain_huggingface import HuggingFacePipeline
from threading import Thread
import time
import transformers
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

first_token_time = 0
token_times = []

# Define the model ID for the desired model
model_id = f"{data_dir}/LLMs/Mistral/Mistral-Small-24B-Instruct-2501"

# Define the quantization configuration for the model
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the model using the model ID and quantization configuration
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map='auto')

# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(model_id)


**Saving the model and tokenizer allows to load them from my Google Drive, saving time and bandwidth.**

In [12]:
#model.save_pretrained(f"{data_dir}/LLMs/Mistral")
#tokenizer.save_pretrained(f"{data_dir}/LLMs/Mistral")

('/content/drive/MyDrive/LLMs/Mistral/tokenizer_config.json',
 '/content/drive/MyDrive/LLMs/Mistral/special_tokens_map.json',
 '/content/drive/MyDrive/LLMs/Mistral/tokenizer.json')

In [63]:
from langchain.prompts import PromptTemplate
streaming_inference(question)

Device set to use cuda:0


Context before prompt: first=VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7cb56b507f10>, search_kwargs={}) middle=[] last=RunnableLambda(format_docs)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 Based on the provided context, here is the breakdown of Identified intangible assets subject to amortization for 2016 and 2017:

### December 31, 2016:
- **Acquisition-related developed technology:**
  - Gross Assets: $7,405 million
  - Accumulated Amortization: $(1,836) million
  - Net: $5,569 million

- **Acquisition-related customer relationships:**
  - Gross Assets: $1,449 million
  - Accumulated Amortization: $(260) million
  - Net: $1,189 million

- **Acquisition-related brands:**
  - Gross Assets: $87 million
  - Accumulated Amortization: $(21) million
  - Net: $66 million

- **Licensed technology and patents:**
  - Gross Assets: $3,285 million
  - Accumulated Amortization: $(1,423) million
  - Net: $1,862 million

- **Total identified intangible assets subject to amortization:**
  - Gross Assets: $12,226 million
  - Accumulated Amortization: $(3,540) million
  - Net: $8,686 million

### December 30, 2017:
- **Acquisition-related developed technology:**
  - Gross Assets: $8,912

{'TTFT': 1.369516372680664,
 'ITL': 0.49735466132648615,
 'End-to-end Latency': 50.577397108078,
 'Throughput': 15.006703456449241}

In [62]:
def streaming_inference(question):

    context=retriever | format_docs
    #{"context": retriever | format_docs, "question": RunnablePassthrough()}
    print(f"Context before prompt: {context}")
    # Initialize a TextIteratorStreamer object for streaming text generation
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    # Create a text generation pipeline using the Hugging Face transformers library
    text_generation_pipeline = transformers.pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=0.3,
        repetition_penalty=1.1,
        max_new_tokens=1000,
        do_sample= True,
        streamer=streamer  # Use the streamer for streaming text generation
    )

    prompt_template = """
    ### [INST]
    Instruction: You are an AI language model Accounting assistant.Answer the following question based on this context.

    ### CONTEXT:
    {context}

    ### QUESTION:
    {question}

    [/INST]
    """

    mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


    prompt = ChatPromptTemplate.from_template(prompt_template)
    llm = mistral_llm
    final_rag_chain = (
        #{"context": retriever | format_docs, "question": RunnablePassthrough()}

        RunnablePassthrough.assign(
            context=lambda x: format_docs(vectordb.similarity_search(x["question"], k=10)),
        )
        | prompt
        | llm
        | StrOutputParser()
    )

    input_text = {
        "context": context,
        "question": question
    }

    # Initialize variables for time measurements
    start_time = time.time()

    # Start a new thread to invoke the language model chain with the input text
    thread = Thread(target=final_rag_chain.invoke, args=[input_text])
    thread.start()

    # Initialize a variable to store the model output
    model_output = ""

    # Iterate over the streamer to get the generated text in chunks
    for i, new_text in enumerate(streamer):
        model_output += new_text
        print(new_text, end='')

        # Measure time for the first token
        if i == 0:
            first_token_time = time.time()
        # Measure time for each token
        token_times.append(time.time())

    # Calculate end-to-end latency
    end_time = time.time()
    end_to_end_latency = end_time - start_time

    # Calculate time to first token
    ttft = first_token_time - start_time

    # Calculate inter-token latency
    itl = sum(x - y for x, y in zip(token_times[1:], token_times[:-1])) / (len(token_times) - 1)

    # Calculate throughput
    throughput = len(tokenizer.encode(model_output)) / end_to_end_latency

    print("\n")  # Add a line break
    print("Printing Inference Metrics")
    print("\nTime To First Token (TTFT):", ttft)
    print("Inter-token latency (ITL):", itl)
    print("End-to-end Latency:", end_to_end_latency)
    print("Throughput:", throughput)
    print("\n")  # Add a line break
    # Return the metrics
    return {
        "TTFT": ttft,
        "ITL": itl,
        "End-to-end Latency": end_to_end_latency,
        "Throughput": throughput
    }