In [None]:
!pip install -r requirements.txt

In [None]:
!pip install InstructorEmbedding

In [1]:
import os
import textwrap

import langchain
import chromadb
import transformers
import torch

from transformers import AutoTokenizer
from langchain import HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [2]:
import csv
from langchain.docstore.document import Document 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [3]:
# Column selection for the CSV rows: one list names the columns whose text is
# embedded, the other names the columns kept as per-document metadata.
columns_to_embed = ["Questions"]
columns_to_metadata = [
    "UID",
    "Questions",
    "WelcomeIdentifier",
    "Type",
    "Answer",
]


In [4]:
# Build one Document per CSV row: the text to embed becomes page_content and
# the selected columns are attached as metadata.
docs = []
# utf-8-sig strips a leading BOM (if any) so the first header name stays clean.
with open('data/general_question.csv', newline="", encoding='utf-8-sig') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for row in csv_reader:
        # Columns that are not in the CSV header are skipped instead of raising
        # KeyError. DictReader fills fields missing from a short row with None;
        # those are normalised to "" so every metadata value is a string.
        to_metadata = {
            col: (row[col] or "") for col in columns_to_metadata if col in row
        }
        # Same None handling for the embedded text, so .strip() cannot fail on
        # a truncated row.
        to_embed = "\n".join(
            (row[col] or "").strip() for col in columns_to_embed if col in row
        )
        docs.append(Document(page_content=to_embed, metadata=to_metadata))

In [5]:
# Display the list of Documents built from the CSV rows (page_content plus metadata).
docs

[Document(page_content='Who are you?', metadata={'UID': 'p93e', 'Questions': 'Who are you?', 'WelcomeIdentifier': 'N', 'Type': 'text', 'Answer': ''}),
 Document(page_content='Who built you?', metadata={'UID': 'o81p', 'Questions': 'Who built you?', 'WelcomeIdentifier': 'N', 'Type': 'text', 'Answer': ''}),
 Document(page_content='Hi, nice to meet you.', metadata={'UID': 'c527', 'Questions': 'Hi, nice to meet you.', 'WelcomeIdentifier': 'N', 'Type': 'text', 'Answer': 'Nice meeting you too. How may I help you today?'}),
 Document(page_content='Are you a human or a chatbot ?', metadata={'UID': 'zya6', 'Questions': 'Are you a human or a chatbot ?', 'WelcomeIdentifier': 'N', 'Type': 'Random', 'Answer': ''}),
 Document(page_content='who created you', metadata={'UID': 'i72o', 'Questions': 'who created you', 'WelcomeIdentifier': 'N', 'Type': 'text', 'Answer': ''}),
 Document(page_content='How is your company doing?', metadata={'UID': 'nu2k', 'Questions': 'How is your company doing?', 'WelcomeIde

In [6]:
# Split the documents with a character-based splitter: newline separator,
# chunk_size of 500 as measured by len, and no overlap between chunks.
splitter = CharacterTextSplitter(
    length_function=len,
    chunk_overlap=0,
    chunk_size=500,
    separator="\n",
)
documents = splitter.split_documents(docs)

In [7]:
# from langchain.embeddings import HuggingFaceBgeEmbeddings
# model_name = "BAAI/bge-large-en-v1.5" # https://huggingface.co/BAAI/llm-embedder #https://arxiv.org/pdf/2401.00368.pdf
# model_kwargs = {'device': 'cuda'}
# encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
# embedding_function = HuggingFaceBgeEmbeddings(
#     model_name=model_name,
#     model_kwargs=model_kwargs,
#     encode_kwargs=encode_kwargs,
#     query_instruction="Represent this sentence for searching relevant passages: "
# )
# embedding_function.query_instruction = "Represent this sentence for searching relevant passages: "

In [8]:
# Load the INSTRUCTOR embedding model. Use the GPU when PyTorch can see one and
# fall back to CPU otherwise, so the cell still runs on machines without CUDA.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl",
    model_kwargs={"device": embedding_device},
)


load INSTRUCTOR_Transformer
max_seq_length  512


In [9]:
# Embed each split document with embeddings_model and index the result in Chroma.
# embeddings_model = OpenAIEmbeddings()
db = Chroma.from_documents(documents=documents, embedding=embeddings_model)

Using embedded DuckDB without persistence: data will be transient


In [10]:
# Sanity check: length of one stored embedding vector. Only the embeddings are
# requested (documents and metadatas are not used here), and the first vector
# is inspected so the check also works when the store holds a single entry.
len(db.get(include=["embeddings"])["embeddings"][0])

768

In [13]:
# Run a scored similarity search and print the first hit that comes back.
# NOTE: this rebinds `docs` (until now the Documents loaded from the CSV) to the
# list of (Document, score) pairs returned by the search.
query = "How can i meet you?"
docs = db.similarity_search_with_score(query)
first_hit, _first_score = docs[0]
print(first_hit.page_content)
print(first_hit.metadata)

Hi, nice to meet you.
{'UID': 'c527', 'Questions': 'Hi, nice to meet you.', 'WelcomeIdentifier': 'N', 'Type': 'text', 'Answer': 'Nice meeting you too. How may I help you today?'}


In [12]:
# Display `docs`; on a top-to-bottom run this is the list of (Document, score)
# pairs assigned by the similarity search cell, not the CSV Documents.
# NOTE(review): the recorded output of this cell comes from an earlier run
# (execution count 12 precedes 13) and its first hit differs from the one
# printed by the search cell -- re-run the notebook to refresh it.
docs

[(Document(page_content='Hi', metadata={'UID': 'r5bp', 'Questions': 'Hi', 'WelcomeIdentifier': 'Y', 'Type': 'Random', 'Answer': 'Hello! How may I help you?'}),
  0.04309403523802757),
 (Document(page_content='Hello', metadata={'UID': 'k556', 'Questions': 'Hello', 'WelcomeIdentifier': 'Y', 'Type': 'Random', 'Answer': 'Hello! How may I help you?'}),
  0.2642967104911804),
 (Document(page_content='Hey', metadata={'UID': 'wo98', 'Questions': 'Hey', 'WelcomeIdentifier': 'Y', 'Type': 'Random', 'Answer': 'Hello! How may I help you?'}),
  0.42709410190582275),
 (Document(page_content='Hi, nice to meet you.', metadata={'UID': 'c527', 'Questions': 'Hi, nice to meet you.', 'WelcomeIdentifier': 'N', 'Type': 'text', 'Answer': 'Nice meeting you too. How may I help you today?'}),
  0.5961214900016785)]