# Requirements

In [None]:
!pip install langchain==0.2.5 langchain-community==0.2.5 langchain-huggingface==0.0.3 chromadb==0.5.3 transformers==4.41.2 pypdf==4.2.0 torch

In [None]:
!pip install accelerate==0.21.0 bitsandbytes==0.41.3

In [4]:
import os
import getpass
import matplotlib.pyplot as plt
from textwrap import dedent

from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader
from langchain_huggingface import HuggingFaceEndpoint, HuggingFacePipeline
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)

In [5]:
from google.colab import userdata
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HF_TOKEN')

# Prompt

In [64]:
# use llm knowledge
template1 = dedent("""
      You are an AI asisstant.
      Use the following pieces of context to answer the question at the end.
      If the context is not relevant,
      please answer the question by using your own knowledge about the topic.
      avoid giving additional answers that are not necessary.

      {context}

      Question: {question}

      Answer:
""")

# say i don't know
template2 = dedent("""
      You are an AI asisstant.
      Use the following pieces of context to answer the question at the end.
      If the context is not relevant, dont try to use your own knowledge and simply say i don't know.
      If the question needs to be answered part by part, try dividing it into different sections and then explain each section.

      {context}

      Question: {question}

      Answer:
""")

prompt_template = template2

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
    )

chain_type_kwargs = {"prompt": PROMPT}  # chain_type_kwargs will be pass on to Retreival QA chain

# Loader - Embeddings - VectoreStore - Retriever

In [40]:
loader = PyPDFLoader("SOFTWARE.PDF")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)   # chunk_overlap : size of overlaps between 2 chunks
texts = text_splitter.split_documents(documents)

In [9]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# create a Chroma vectorstore
db = Chroma.from_documents(texts, embeddings)

# search_type = similarity / mmr
retriever = db.as_retriever(search_type="similarity")

# Quantization

In [15]:
################################################################################
# bitsandbytes parameters
################################################################################

# # Activate 4-bit precision base model loading
# use_4bit = True

# # Compute dtype for 4-bit base models
# bnb_4bit_compute_dtype = "float16"

# # Quantization type (fp4 or nf4)
# bnb_4bit_quant_type = "nf4"

# # Activate nested quantization for 4-bit base models (double quantization)
# use_nested_quant = False

################################################################################

# Load the entire model on the GPU 0
# device_map = {"": 0}

In [11]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, "float16")
use_4bit = True

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,                        # Activate 4-bit precision base model loading
    bnb_4bit_quant_type="nf4",                    # Quantization type (fp4 or nf4)
    bnb_4bit_compute_dtype=compute_dtype,         # Compute dtype for 4-bit base models
    bnb_4bit_use_double_quant=False,              # Activate nested quantization for 4-bit base models (double quantization)
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)


# Model

In [12]:
model_name = "NousResearch/Meta-Llama-3-8B"

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}                # Load the entire model on the GPU 0
    )


# Model configs
model.config.use_cache = False
model.config.pretraining_tp = 1
model.config.max_new_tokens = 512    # fixed default max_new_tokens

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

# Tokenizer

In [13]:
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
    )

# Tokenizer configs
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Pipeline

In [74]:
# Create pipeline using transformers (huggingface)
pipe = pipeline(
    task="text-generation",
    model=model,          # model = model that we created using AutoModelForCausalLM.from_pretrained
    tokenizer=tokenizer,   # tokenizer = tokenizer that we created using AutoTokenizer.from_pretrained
    return_full_text=False,
    max_new_tokens=256,
    repetition_penalty=1.2
    )


In [75]:
# pass the pipeline created in transformers to langchain
llm = HuggingFacePipeline(
    pipeline = pipe,
    model_kwargs={"temperature":0.5}
    )

# Retrieval QA

In [76]:
qa_llama = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True
    )

In [None]:
# EXAMPLE: what is agile?
# EXAMPLE: what is the difference between extreme programming and scrum?
query = input("your input: ")
result = qa_llama.invoke(query)

# Process result

In [72]:
print("Question:")
print(result['query'])

Question:
what is the difference between extreme programming and scrum?


In [73]:
print("Answer:")
print(result['result'])

Answer:
Scrum is an agile framework for project management that emphasizes teamwork, accountability, and iterative progress toward a well-deﬁ  ned goal. Extreme programming (XP) is a software development methodology that emphasizes teamwork, accountability, and iterative progress toward a well-deﬁ  ned goal. Both Scrum and XP emphasize teamwork, accountability, and iterative progress toward a well-deﬁ  ned goal. However, Scrum is a project management framework, while XP is a software development methodology. Scrum is used to manage projects, while XP is used to develop software. Scrum is a project management framework, while XP is a software development methodology. Scrum is used to manage projects, while XP is used to develop software. Scrum is a project management framework, while XP is a software development methodology. Scrum is used to manage projects, while XP is used to develop software. Scrum is a project management framework, while XP is a software development methodology. Scr

In [49]:
sources = result['source_documents']
print("\nSources:\n")
for source in sources:
  print(source.page_content)
  print("###################################")
  print(f"# SOURCE FILE: {os.path.basename(source.metadata['source'])}")
  print(f"# PAGE: {source.metadata['page']}")
  print("###################################")
  print("\n\n\n")
  print("_"*100)


Sources:

72 PART ONE  THE SOFTWARE PROCESS
 If you have further interest, see [Hig01], [Hig02a], and [DeM02] for an enter-
taining summary of other important technical and political issues. 
        5.4 E XTREME  PROGRAMMING         
  In order to illustrate an agile process in a bit more detail, we’ll provide you with 
an overview of  Extreme Programming   (XP), the most widely used approach to 
agile software development. Although early work on the ideas and methods as-sociated with XP   occurred during the late 1980s, the seminal work on the subject 
has been written by Kent Beck [Bec04a]. A variant of XP, called  Industrial XP   
(IXP), reﬁ  nes XP and targets the agile process speciﬁ  cally for use within large 
organizations [Ker05]. 
  5.4.1 The XP Process 
 Extreme Programming uses an object-oriented approach (Appendix 2) as its pre-
ferred development paradigm and encompasses a set of rules and practices that occur within the context of four framework activities: planning, d