In [4]:
import os
import sys
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import csv_loader

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore

In [2]:
import pandas as pd

file_path = ('../data/customers-100.csv') # insert the path of the csv file
data = pd.read_csv(file_path)

#preview the csv file
data.head()

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
1,2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
2,3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
3,4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
4,5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/


In [8]:
loader = CSVLoader(file_path=file_path)
docs = loader.load_and_split()

In [5]:
# embeddings = OpenAIEmbeddings()

EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)


index = faiss.IndexFlatL2(len(HuggingFaceEmbeddings().embed_query(" ")))
vector_store = FAISS(
    embedding_function=HuggingFaceEmbeddings(),
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={}
)

  embedding_model = HuggingFaceEmbeddings(
  index = faiss.IndexFlatL2(len(HuggingFaceEmbeddings().embed_query(" ")))
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  embedding_function=HuggingFaceEmbeddings(),


In [6]:
vector_store.add_documents(documents=docs)

['ded815e5-6e09-4f3a-9936-0cc4f94dc9da',
 '07d2bc28-a809-4966-a59a-054037f82e47',
 '8864cd2a-57b0-447f-b8ab-3b136821e683',
 '477beea0-aa3e-462e-8738-795a976c8175',
 '3c8731d3-8a6b-4814-ae78-676ee0355c60',
 '854eafab-8aee-4910-9f62-e6e99cc3cfb1',
 '657295ff-43f2-487f-bf36-2221a0cd8e5c',
 'f1209f58-8b73-4428-9cc3-37b10342f5a6',
 'f734bb19-7457-4475-80e7-b747d7c216c7',
 'c9089f32-24a9-4b12-a251-dc2430f5b78f',
 '40914588-711a-4119-b64d-006d94ced1e2',
 '28b4ad6a-0426-4d11-b180-fd1222fe8b67',
 'a4cb994a-ca25-471d-a19a-51f987363b7c',
 'cabe8827-d3dd-437a-a908-75e73d674d09',
 '24742b95-449c-4581-b68d-4b1ded0d007f',
 'b75c2ed1-fdd4-4b70-aaa6-140e71dd12fa',
 'b2838bc6-8502-4b92-8350-48d5a8a6197c',
 '141e26b5-51c2-4aa5-a308-42086b4f21e5',
 'ea6a6537-5c1e-4812-b4a9-e8b6bffa128e',
 'fb141cce-01b4-4da8-8431-fb4fb84e8b25',
 'b6b6611d-d276-47a7-a34e-111e811556fd',
 'ceaa0acf-f372-42c3-8514-5c7c26b6e4b6',
 '419550f1-b159-4253-b896-666f4772b326',
 '085b18ee-a5c5-444a-a192-cd09c11156c4',
 'ba3e07a1-f0fc-

In [9]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

llm = GoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=GEMINI_API_KEY) 

retriever = vector_store.as_retriever()

# Set up system prompt
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
    
])

# Create the question-answer chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [10]:
answer= rag_chain.invoke({"input": "which company does sheryl Baxter work for?"})
answer['answer']

'Sheryl Baxter works for Rasmussen Group.  Her customer ID is DD37Cf93aecA6Dc.  Her subscription date was August 24, 2020.\n'