In [71]:
import pandas as pd
from dotenv import load_dotenv
import os
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec


In [72]:
# ========== 1. Load environment variables ==========
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# ========== 2. Init Pinecone ==========
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-chatbot"   # same index as PDF or new one
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,   # must match embeddings dimension
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(index_name)


In [83]:
# ========== 3. Load FAQ CSV ==========
# Extract data from CSV files
def load_csv_files(data):
    dfs = []
    for file in os.listdir(data):
        if file.endswith(".csv"):
            path = os.path.join(data, file)
            df = pd.read_csv(path)
            dfs.append(df)
    if dfs:
        return pd.concat(dfs, ignore_index=True)
    else:
        return pd.DataFrame()  # return empty DataFrame if no CSV found
    
# Get project root (parent of "src")
#base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
#data_path = os.path.join(base_dir, "data")
df = load_csv_files("../data/")
df.drop(columns=['category'], inplace=True)
df.head(19)

Unnamed: 0,qst_num,question,answer
0,1,If I open an account with CODNETWORK will I ha...,You will only need to agree to our terms and c...
1,2,What are your hours of operations?,"Warehouses: 08 AM to 06 PM, Local time. Open 6..."
2,3,Who can join us?,If you want to level up your business and get ...
3,4,Can I import goods with my supplier or should ...,Our suppliers offer a wide variety of goods th...
4,5,Services and fees?,CODNetwork offers its services hassle-free to ...
5,6,What is the process to import goods with your ...,By accessing the dashboard you can manage all ...
6,7,Leads management,Our unique platform is designed to help you ma...
7,8,Can I confirm my orders with another Call Center?,"Yes, it’s up to you. And we give you access to..."
8,9,Where are you located?,We are located in all MENA region countries:\n...
9,10,How do I calculate the shipping costs?,"Upon registration of an account, you can acces..."


In [81]:
# ========== 4. Convert FAQs to LangChain Documents ==========
def convert_faqs_to_documents(df: pd.DataFrame):
    docs = []
    for _, row in df.iterrows():
        # Use question as content, store question separately in metadata
        content = f"Question: {row['question']}"
        metadata = {
            "source": "CODNetwork FAQs",
            "qst_num": str(row["qst_num"]),
            "question": row["question"],
            "answer": row["answer"]
        }
        docs.append(Document(page_content=content, metadata=metadata))
    return docs

faq_docs = convert_faqs_to_documents(df)
faq_docs

[Document(metadata={'source': 'CODNetwork FAQs', 'qst_num': '1', 'question': 'If I open an account with CODNETWORK will I have to sign a contract or any legal papers?', 'answer': 'You will only need to agree to our terms and conditions to prove you are interested in our services. With CODNETWORK you are free to close your account at any given time, no penalties or procedure to follow, but we would really appreciate it if you can send a notice of 15 days.'}, page_content='Question: If I open an account with CODNETWORK will I have to sign a contract or any legal papers?'),
 Document(metadata={'source': 'CODNetwork FAQs', 'qst_num': '2', 'question': 'What are your hours of operations?', 'answer': 'Warehouses: 08 AM to 06 PM, Local time. Open 6 days per week\nDelivery agent: 8AM to 9PM Local time.\nOur Office: Open Monday to Saturday 10 AM to 07 PM GMT+1. Sunday off.'}, page_content='Question: What are your hours of operations?'),
 Document(metadata={'source': 'CODNetwork FAQs', 'qst_num':

In [75]:

# ========== 5. Embeddings ==========
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=model_name)

In [76]:
# ========== 6. Store into Pinecone ==========
docsearch_faqs = PineconeVectorStore.from_documents(
    documents=faq_docs,
    embedding=embedding,
    index_name=index_name
)

print(f"✅ Ingested {len(faq_docs)} FAQs into Pinecone.")


✅ Ingested 19 FAQs into Pinecone.


In [86]:
import pandas as pd
from dotenv import load_dotenv
import os
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

# ========== 1. Load environment variables ==========
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# ========== 2. Init Pinecone ==========
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-chatbot"   # same index as PDF or new one
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,   # must match embeddings dimension
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(index_name)


# ========== 3. Load FAQ CSV ==========
# Extract data from CSV files
def load_csv_files(data):
    dfs = []
    for file in os.listdir(data):
        if file.endswith(".csv"):
            path = os.path.join(data, file)
            df = pd.read_csv(path)
            dfs.append(df)
    if dfs:
        return pd.concat(dfs, ignore_index=True)
    else:
        return pd.DataFrame()  # return empty DataFrame if no CSV found
    
# Get project root (parent of "src")
#base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
#data_path = os.path.join(base_dir, "data")
df = load_csv_files("../data/")
df.drop(columns=['category'], inplace=True)
df.head()

# ========== 4. Convert FAQs to LangChain Documents ==========
def convert_faqs_to_documents(df: pd.DataFrame):
    docs = []
    for _, row in df.iterrows():
        # Use question as content, store question separately in metadata
        content = f"Question: {row['question']}"
        metadata = {
            "source": "CODNetwork FAQs",
            "qst_num": str(row["qst_num"]),
            "question": row["question"],
            "answer": row["answer"]
        }
        docs.append(Document(page_content=content, metadata=metadata))
    return docs

faq_docs = convert_faqs_to_documents(df)
faq_docs


# ========== 5. Embeddings ==========
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=model_name)

# ========== 6. Store into Pinecone ==========
docsearch_faqs = PineconeVectorStore.from_documents(
    documents=faq_docs,
    embedding=embedding,
    index_name=index_name,
    ids=[doc.metadata["qst_num"] for doc in faq_docs]  # use qst_num as ID
)

print(f"✅ Ingested {len(faq_docs)} FAQs into Pinecone.")


✅ Ingested 19 FAQs into Pinecone.


xdjfaxidfjxcajsckascsjjs
sjnsdjndadajdnajdna
ajdnajdnajdnajdn
adnajdnajdnajdn
ajdnajdnajdnajnd

In [77]:
# ========== 1. Load environment variables ==========
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [78]:
# ========== 2. Init Pinecone ==========
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-chatbot"   # same index as PDF or new one
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,   # must match embeddings dimension
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(index_name)

In [79]:
# ========== 5. Embeddings ==========
# 3. Embedding model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# 4. Process each FAQ
to_upsert = []
for _, row in df.iterrows():
    qid = row["qst_num"]
    question = row["question"]
    answer = row["answer"]

# create embedding for the question
vector = model.encode(question).tolist()



metadata = {
    "question": question,
    "answer": answer,
    "source": "faq",
    "timestamp": time.time()
}

# unique id for Pinecone
vector_id = f"faq_{qid}"

to_upsert.append((vector_id, vector, metadata))




In [80]:
# 5. Upload to Pinecone
if to_upsert:
    index.upsert(to_upsert)
    print(f"✅ Successfully ingested {len(to_upsert)} FAQs into Pinecone.")
else:
    print("⚠️ No FAQs found to ingest.")


✅ Successfully ingested 1 FAQs into Pinecone.
