In [None]:
!pip -q install langchain tiktoken chromadb pypdf transformers InstructorEmbedding
!pip -q install accelerate bitsandbytes

## import libraries

In [None]:
import pandas as pd
import os
import chromadb

from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline

## read files and format the data

In [None]:
# import our dataset
# index_col=[0]: the first CSV column becomes the DataFrame index instead of
# a regular data column.
df = pd.read_csv("final_result.csv", index_col=[0])

In [None]:
# One text document per row, holding the question and its answer.
list_elements = [
    f"question: {row['Question']}\nanswer: {row['Answer']}"
    for _, row in df.iterrows()
]

# One id per row, derived from the row's index label (shifted by one).
ids = [f"id{label+1}" for label in df.index]


In [None]:
!pip install sentence_transformers



## create class for chroma db functions

In [None]:
import chromadb
from chromadb.utils import embedding_functions

In [None]:
class ClientDb:
    """Small wrapper around a persistent ChromaDB client and one collection.

    Typical use: build the object, call ``create_or_get_collection`` once,
    then ``store_vectores`` to add documents and ``search`` to query them.
    """

    def __init__(self, path, ef):
        """Open (or create) the on-disk database.

        Args:
            path: Directory passed to ``chromadb.PersistentClient``.
            ef: Embedding function handed to the collection when it is
                created or fetched.
        """
        self.client = chromadb.PersistentClient(path=path)
        # No collection is selected until create_or_get_collection() runs.
        self.collection = None
        self.ef = ef


    def _require_collection(self):
        """Return the selected collection or fail with a clear message.

        Raises:
            RuntimeError: If no collection has been selected yet.
        """
        if self.collection is None:
            raise RuntimeError(
                "No collection selected: call create_or_get_collection(name) first."
            )
        return self.collection


    def create_or_get_collection(self, name):
        """Select the collection called ``name``, creating it if needed.

        The collection is requested with this object's embedding function and
        the ``hnsw:space`` metadata set to ``cosine``.
        """
        self.collection = self.client.get_or_create_collection(
            name=name,
            embedding_function=self.ef,
            metadata={"hnsw:space": "cosine"}
        )


    def store_vectores(self, list_elements, ids):
        """Add documents to the selected collection.

        Args:
            list_elements: Documents (strings) to store.
            ids: Identifiers for the documents, in the same order.

        Raises:
            RuntimeError: If no collection has been selected yet.
        """
        self._require_collection().add(
            documents=list_elements,
            ids=ids
        )


    def search(self, query, n_results):
        """Query the selected collection and return the matching documents.

        Args:
            query: Query text passed as ``query_texts``.
            n_results: Number of results requested.

        Returns:
            The first entry of the ``documents`` field of the query result,
            i.e. the documents matched for the (first) query text.

        Raises:
            RuntimeError: If no collection has been selected yet.
        """
        results = self._require_collection().query(
            query_texts=query,
            n_results=n_results
        )

        return results['documents'][0]

    def get_collection(self):
        """Return the selected collection, or ``None`` if none is selected."""
        return self.collection

## storing formatted text as vectors inside ChromaDB

In [None]:
# Settings for the on-disk ChromaDB store and the collection to fill.
path = "chromadb"
collection_name = 'interview_qa'

# Uses the instructor *base* model; device="cuda" requests the GPU (not the CPU).
ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-base", device="cuda")
# Alternative kept for reference (larger "xl" model through langchain):
# instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

# Open the database and select (or create) the collection.
client = ClientDb(path, ef)
client.create_or_get_collection(collection_name)
print(f'collection created : {client.get_collection()}')

# Add every question/answer document under its id.
client.store_vectores(list_elements, ids)

print("elements stored")

Downloading (…)62736/.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Downloading (…)15e6562736/README.md:   0%|          | 0.00/66.2k [00:00<?, ?B/s]

Downloading (…)e6562736/config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)62736/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.43k [00:00<?, ?B/s]

Downloading (…)6562736/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512
collection created : name='interview_qa' id=UUID('910161ac-1136-4929-bd01-acef812a2f8a') metadata={'hnsw:space': 'cosine'}
elements stored


## using langchain for retrieval

In [None]:
# Settings: same on-disk store and collection the documents were written to.
path = "chromadb"
collection_name = 'interview_qa'

# Same model name ("hkunlp/instructor-base") as the embedding function used
# when the documents were stored.
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
persistent_client = chromadb.PersistentClient(path=path)

# Wrap the existing collection as a langchain vector store. The collection
# name comes from the variable above rather than a duplicated literal.
vectordb = Chroma(
    client=persistent_client,
    collection_name=collection_name,
    embedding_function=instructor_embeddings,
    collection_metadata={"hnsw:space": "cosine"}
)

query = 'what is overfitting?'

# Retriever used later by the QA chain: top 3 documents per query.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

print(retriever)

# Maximal-marginal-relevance search: fetch 4 candidates, keep 3 of them.
result = vectordb.max_marginal_relevance_search(query, k=3, fetch_k=4)

print(result)

load INSTRUCTOR_Transformer
max_seq_length  512
tags=['Chroma', 'HuggingFaceInstructEmbeddings'] vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7e1c6534fbe0> search_kwargs={'k': 3}
[Document(page_content='question: can you avoid overfitting your model? if yes, then how?\nanswer: Yes, it is possible to overfit data models. The following techniques can be used for that purpose.'), Document(page_content='question: list down the conditions for overfitting and underfitting.\nanswer: Overfitting: The model performs well only for the sample training data. If any new data is given as input to the model, it fails to provide any result. These conditions occur due to low bias and high variance in the model. Decision trees are more prone to overfitting.Underfitting: Here, the model is so simple that it is not able to identify the correct relationship in the data, and hence it does not perform well even on the test data. This can happen due to high bias and low variance. Linear regre

### loading LLM

In [None]:
# Load tokenizer and seq2seq model from the same checkpoint.
checkpoint = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# 8-bit weights, with device placement left to `device_map='auto'`.
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map='auto',
    load_in_8bit=True,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Text2text generation pipeline around the loaded model.
# Decoding is greedy: `do_sample=False` is stated explicitly, and the sampling
# knobs (`temperature`, `top_p`) are dropped because they only apply when
# sampling is enabled; `temperature=0` is not a valid sampling temperature.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    do_sample=False,
    repetition_penalty=1.15
)

# Expose the pipeline to langchain as an LLM.
local_llm = HuggingFacePipeline(pipeline=pipe)

# retrieve using LLM and chat using qa_chain

In [None]:
# Question-answering chain: retrieved documents are "stuffed" into a single
# prompt for the local LLM, and the source documents are returned with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
# Show the assembled chain: prompt template, LLM wrapper and retriever settings.
print(qa_chain)

combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x7e1c62017ee0>)), document_variable_name='context') return_source_documents=True retriever=VectorStoreRetriever(tags=['Chroma', 'HuggingFaceInstructEmbeddings'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7e1c6534fbe0>, search_kwargs={'k': 3})


In [None]:
# Ask one question through the chain and keep the full response.
query = "What is data science?"
llm_response = qa_chain(query)



In [None]:
# The generated answer is stored under the "result" key of the response.
llm_response["result"]

'An interdisciplinary field that constitutes various scientific processes, algorithms, tools, and machine learning techniques working to help find common patterns and gather sensible insights from the given raw input data using statistical and mathematical analysis is called Data Science.'

In [None]:
# Query the stored collection directly through ClientDb.
path = "chromadb"
collection_name = 'interview_qa'
query= 'what is overfitting?'

# Instructor base model; no device is passed, so the embedding function's
# default device is used.
ef = embedding_functions.InstructorEmbeddingFunction(model_name="hkunlp/instructor-base")

# Re-open the database and select the existing collection.
client = ClientDb(path, ef)
client.create_or_get_collection(collection_name)
print(f'collection created : {client.get_collection()}')

# Top 3 stored documents for the query.
results = client.search(query, n_results=3)

print(results)

load INSTRUCTOR_Transformer
max_seq_length  512
collection created : name='interview_qa' id=UUID('910161ac-1136-4929-bd01-acef812a2f8a') metadata={'hnsw:space': 'cosine'}
['question: can you avoid overfitting your model? if yes, then how?\nanswer: Yes, it is possible to overfit data models. The following techniques can be used for that purpose.', 'question: list down the conditions for overfitting and underfitting.\nanswer: Overfitting: The model performs well only for the sample training data. If any new data is given as input to the model, it fails to provide any result. These conditions occur due to low bias and high variance in the model. Decision trees are more prone to overfitting.Underfitting: Here, the model is so simple that it is not able to identify the correct relationship in the data, and hence it does not perform well even on the test data. This can happen due to high bias and low variance. Linear regression is more prone to Underfitting.', 'question: what is linear regre

In [None]:
# Print each retrieved document on its own lines for readability.
for result in results:
  print(result)

question: can you avoid overfitting your model? if yes, then how?
answer: Yes, it is possible to overfit data models. The following techniques can be used for that purpose.
question: list down the conditions for overfitting and underfitting.
answer: Overfitting: The model performs well only for the sample training data. If any new data is given as input to the model, it fails to provide any result. These conditions occur due to low bias and high variance in the model. Decision trees are more prone to overfitting.Underfitting: Here, the model is so simple that it is not able to identify the correct relationship in the data, and hence it does not perform well even on the test data. This can happen due to high bias and low variance. Linear regression is more prone to Underfitting.
question: what is linear regression? what are some of the major drawbacks of the linear model?
answer: Linear regression is a technique in which the score of a variable Y is predicted using the score of a predic