In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/helper-functions-zip/helper_functions.py
/kaggle/input/customers-100-csv/customers-100.csv
/kaggle/input/evaluate-rag-zip/evalute_rag.py


In [8]:
!pip install transformers faiss-cpu langchain pandas python-dotenv
!pip install -qU langchain-google-genai



In [11]:
!pip install -qU langchain-google-genai
!pip install --upgrade google-cloud google-auth langchain langchain-google-genai
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Downloading langchain_community-0.3.14-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading pydantic_settings-2.7.1-py3-none-any.whl (29 kB)
Installing collected packages: httpx-sse, pydantic-settings, langchain-community
Successfully installed httpx-sse-0.4.0 langchain-community-0.3.14 pydantic-settings-2.7.1


In [12]:
# General Utilities
import os
from pathlib import Path
import pandas as pd

# Hugging Face Embeddings
from langchain.embeddings import HuggingFaceEmbeddings

# FAISS Vector Store
import faiss
from langchain.vectorstores import FAISS
from langchain.schema import Document

# Document Loader (ensure this is compatible with Kaggle or write your own CSV loader)
from langchain_community.document_loaders.csv_loader import CSVLoader

# LLM
from langchain_google_genai import ChatGoogleGenerativeAI


In [None]:

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
GEMINI_API_KEY = user_secrets.get_secret("GEMINI_API_KEY")
print("GEMINI_API_KEY:", GEMINI_API_KEY)

In [14]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import HumanMessage

# Initialize the Gemini model
llm = ChatGoogleGenerativeAI(
    api_key=GEMINI_API_KEY,
    model="gemini-1.5-flash",  # Use the required Gemini model version
    temperature=0,  
    max_tokens=None,  
    timeout=60,  # Timeout for responses (optional)
    max_retries=2  # Retry limit for failed requests
)


In [15]:
file_path = "/kaggle/input/customers-100-csv/customers-100.csv"  # Update this path

# Load and preview the data
data = pd.read_csv(file_path)
print(data.head())

# Use a document loader or manually create documents
loader = CSVLoader(file_path=file_path)
docs = loader.load_and_split()
print(f"Loaded {len(docs)} documents.")


   Index      Customer Id First Name Last Name  \
0      1  DD37Cf93aecA6Dc     Sheryl    Baxter   
1      2  1Ef7b82A4CAAD10    Preston    Lozano   
2      3  6F94879bDAfE5a6        Roy     Berry   
3      4  5Cef8BFA16c5e3c      Linda     Olsen   
4      5  053d585Ab6b3159     Joanna    Bender   

                           Company               City  \
0                  Rasmussen Group       East Leonard   
1                      Vega-Gentry  East Jimmychester   
2                    Murillo-Perry      Isabelborough   
3  Dominguez, Mcmillan and Donovan         Bensonview   
4         Martin, Lang and Andrade     West Priscilla   

                      Country                 Phone 1                Phone 2  \
0                       Chile            229.077.5154       397.884.0519x718   
1                    Djibouti              5153435776       686-620-1820x944   
2         Antigua and Barbuda         +1-539-402-0259    (496)978-3969x58947   
3          Dominican Republic  001-8

In [22]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.in_memory import InMemoryDocstore
import faiss

# Load Hugging Face Embeddings (ensure authentication if needed)
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create FAISS Index
embedding_dim = len(hf_embeddings.embed_query(" "))  # Get embedding dimension
index = faiss.IndexFlatL2(embedding_dim)

# Initialize an in-memory docstore
docstore = InMemoryDocstore()

# Create the FAISS vector store with the required arguments
vector_store = FAISS(
    embedding_function=hf_embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id={}  # Add the missing index_to_docstore_id argument
)


Documents added to FAISS vector store.


In [23]:
vector_store.add_documents(documents=docs)
print("Documents added to FAISS vector store.")

Documents added to FAISS vector store.


In [24]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

retriever = vector_store.as_retriever()

# Set up system prompt
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
    
])

# Create the question-answer chain
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [29]:
query = "which company does sheryl Baxter work for?"

# Retrieve the relevant documents
retrieved_docs = retriever.get_relevant_documents(query)

# Print the retrieved data (the context documents)
for doc in retrieved_docs:
    print(doc.page_content)

Index: 1
Customer Id: DD37Cf93aecA6Dc
First Name: Sheryl
Last Name: Baxter
Company: Rasmussen Group
City: East Leonard
Country: Chile
Phone 1: 229.077.5154
Phone 2: 397.884.0519x718
Email: zunigavanessa@smith.info
Subscription Date: 2020-08-24
Website: http://www.stephenson.com/
Index: 1
Customer Id: DD37Cf93aecA6Dc
First Name: Sheryl
Last Name: Baxter
Company: Rasmussen Group
City: East Leonard
Country: Chile
Phone 1: 229.077.5154
Phone 2: 397.884.0519x718
Email: zunigavanessa@smith.info
Subscription Date: 2020-08-24
Website: http://www.stephenson.com/
Index: 9
Customer Id: C2dE4dEEc489ae0
First Name: Sheryl
Last Name: Meyers
Company: Browning-Simon
City: Robersonstad
Country: Cyprus
Phone 1: 854-138-4911x5772
Phone 2: +1-448-910-2276x729
Email: mariokhan@ryan-pope.org
Subscription Date: 2020-01-13
Website: https://www.bullock.net/
Index: 9
Customer Id: C2dE4dEEc489ae0
First Name: Sheryl
Last Name: Meyers
Company: Browning-Simon
City: Robersonstad
Country: Cyprus
Phone 1: 854-138-4911

In [31]:
answer= rag_chain.invoke({"input": "what is the phone number of sheryl Baxter?"})
answer['answer']

"Sheryl Baxter's phone numbers are 229.077.5154 and 397.884.0519x718."