In [None]:
import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone.grpc import PineconeGRPC as Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.document_loaders import PyPDFLoader, DirectoryLoader  # Load documents from a directory
from langchain.text_splitter import RecursiveCharacterTextSplitter  # Split documents into smaller chunks

# (A stray bare `print` expression was removed here: it called nothing and
# would only have echoed the builtin's repr as this cell's output.)


In [2]:
import os

# Step up one level so that relative paths such as "Data/" resolve.
# The move is guarded: an unconditional os.chdir("../") climbs one more
# directory every time the cell is re-run, silently breaking later cells.
# Once "Data" is visible from the current directory the cell is a no-op.
if not os.path.isdir("Data"):
    os.chdir("../")

In [3]:
%pwd

'c:\\Users\\abhikamuni\\Documents\\OcuAI\\medical-chatbot-AI'

In [5]:
# Load the PDF files of a directory as documents
def load_pdf_file(data):
    """Load every file matching ``*.pdf`` in the directory ``data``.

    Args:
        data: Directory path handed to ``DirectoryLoader`` as its first
            argument.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(
        data,
        glob="*.pdf",            # restrict the scan to *.pdf matches
        loader_cls=PyPDFLoader,  # loader class applied to each match
    )
    return pdf_loader.load()


In [16]:
# Read every PDF under Data/ (relative to the current working directory)
extracted_data = load_pdf_file("Data/")


In [17]:
# Split the loaded documents into smaller, overlapping chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=200):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 500, the value this notebook previously hard-coded.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 200, the value this notebook previously hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [18]:
# Split the loaded documents with text_split() and report how many chunks
# came out; `texts_chunks` is what the later upsert cell sends to Pinecone.
texts_chunks = text_split(extracted_data)
print(f"Total number of chunks: {len(texts_chunks)}")

Total number of chunks: 8646


In [20]:
# Build the HuggingFace embeddings model used for indexing and querying
def download_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create a ``HuggingFaceEmbeddings`` instance for ``model_name``.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook previously hard-coded; the
            recorded test output in this notebook shows it yields vectors
            of length 384.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

In [10]:
# Instantiate the embeddings model once; the same `embeddings` object is
# reused below for the test query and for both PineconeVectorStore cells.
embeddings = download_hugging_face_embeddings()

  embeddngs = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Smoke-test the embeddings model: embed a short string and print the vector
# length. The length matters because the Pinecone index below is created
# with a fixed `dimension`, which has to match this number.
query_result = embeddings.embed_query("Hello World")
print("length of the query result:", len(query_result))

length of the query result: 384


  return forward_call(*args, **kwargs)


In [22]:
from dotenv import load_dotenv
# Load environment variables via python-dotenv so the os.environ.get(...)
# lookups in the next cell can find the API keys. The call's return value is
# displayed as this cell's output.
load_dotenv()

True

In [None]:
# Read the credentials from the environment (populated by load_dotenv() above
# or by the shell that started the kernel).
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")

# Fail fast with a readable message when a credential is absent. Previously
# a missing PINECONE_API_KEY surfaced as an opaque TypeError from writing
# None into os.environ, and a missing GITHUB_TOKEN went unnoticed until the
# LLM call. The value is not written back to os.environ: it was read from
# there, so that assignment could only be a no-op or a crash.
missing_vars = [
    var_name
    for var_name, var_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("GITHUB_TOKEN", GITHUB_TOKEN),
    )
    if not var_value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_vars)
        + ". Define them in your .env file or shell before running this notebook."
    )

# GitHub AI model inference endpoint and the model served through it
GITHUB_AI_ENDPOINT = "https://models.github.ai/inference"
GITHUB_AI_MODEL_NAME = "Phi-3-small-8k-instruct"

In [22]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os
# Initialize the Pinecone client and make sure the target index exists
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "medicalbot"

# create_index() raises when the index already exists, which made this cell
# fail on every run after the first. Only create the index when it is missing.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,    # must equal the embedding vector length printed above
        metric="cosine",  # similarity metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Display the index description as the cell output, whether the index was
# just created or already existed.
pc.describe_index(index_name)

{
    "name": "medicalbot",
    "metric": "cosine",
    "host": "medicalbot-yk7y5p0.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [15]:
# Embed each chunk and upsert the embeddings to the Pinecone index
import hashlib
import json

from langchain_pinecone import PineconeVectorStore


def _chunk_id(doc):
    """Return a deterministic id for a chunk, derived from its metadata and text.

    The same chunk always maps to the same id, so upserting it again
    overwrites the existing vector instead of adding a duplicate.
    """
    fingerprint = json.dumps(doc.metadata, sort_keys=True, default=str) + "\n" + doc.page_content
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()


# Explicit ids make this cell safe to re-run, or to resume after an
# interrupted upload: without them each run gets fresh ids, so the index
# accumulates duplicate vectors.
decsearch = PineconeVectorStore.from_documents(
    documents=texts_chunks,
    index_name=index_name,
    embedding=embeddings,
    ids=[_chunk_id(doc) for doc in texts_chunks],
)

KeyboardInterrupt: 

In [None]:
# Attach to the index that already exists (and is assumed to be populated)
# instead of uploading again; `index_name` is "medicalbot" as set above.
decsearch = PineconeVectorStore(
    embedding=embeddings,
    index_name=index_name,
)

In [None]:

# Build a similarity-search retriever over the vector store, asking for k=5 results
retriever = decsearch.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=5),
)

# Smoke-test the retriever with a sample question and print what comes back
retrieved_docs = retriever.invoke("what is Acne?")
print(retrieved_docs)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# --- LLM setup (GitHub AI model) ---
# ChatOpenAI is pointed at the GitHub AI inference endpoint and authenticated
# with the GitHub token; the model name comes from GITHUB_AI_MODEL_NAME.
llm = ChatOpenAI(
    model=GITHUB_AI_MODEL_NAME,
    openai_api_base=GITHUB_AI_ENDPOINT,
    openai_api_key=GITHUB_TOKEN,
    max_tokens=500,
    temperature=0.4,
)


In [None]:


# Define the system prompt for the chatbot.
# The adjacent string literals are concatenated verbatim, so every sentence
# except the last carries its own trailing space; previously one was missing
# and the model saw "...make up an answer.Use three sentences...".
# NOTE: the (misspelled) name `sytem_prompt` is kept because the prompt
# template cell refers to it.
sytem_prompt = (
    "You are a helpful assistant that answers questions-answers tasks. "
    "Use the following pieces of retrieved information to answer the question. "
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

In [None]:
# Assemble the chat prompt: the system instructions (which contain the
# {context} placeholder) followed by the user's question as {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", sytem_prompt),
    ("human", "{input}"),
])

In [None]:
# Wire the pipeline: the document chain feeds the prompt to the LLM, and the
# retrieval chain puts the retriever in front of it.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)


In [None]:
# --- Run the chatbot ---
# Send one question through the RAG chain and print only the generated answer
response = rag_chain.invoke({"input": "What is Acne?"})
print(response["answer"])