# 🔍 PubMed Search and Embedding Workflow

This notebook demonstrates a pipeline to search PubMed, download relevant documents, process PDFs, generate embeddings using OpenAI, and store them in Pinecone.

In [1]:
# Standard Library
import os
import gc
import subprocess

# Third-party Libraries
import pandas as pd
from tqdm import tqdm
from Bio import Entrez
import pubchempy as pcp
from openai import OpenAI

from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

from langchain_pinecone import PineconeVectorStore
from rich import print

# Local Modules
from demo_script import (
    search_n_download,
    run_bash_script,
    pubmed_search_get_id,
    process_pdf,
    create_embeddings,
    upsert_embeddings
)


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load API keys from environment variables.
# Export OPENAI_API_KEY and PINECONE_API_KEY before starting the kernel; a
# missing variable raises KeyError here rather than failing later on the
# first API call. Secrets must never be committed inside the notebook.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
MODEL = "text-embedding-ada-002"
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])


In [78]:
# Read the descriptor table; its "SMILES" column is what the rest of the
# notebook iterates over.
smiles_csv_path = "./DIA_trainingset_RDKit_descriptors.csv"
smile_list = pd.read_csv(smiles_csv_path)

In [79]:
# Preview rows 1-4 of the SMILES column (row 0 is not included in this slice).
smile_list["SMILES"][1:5].tolist()

['C[C@H](N(O)C(=O)N)c1cc2ccccc2s1',
 'C[N+](C)(C)CC(=O)[O-]',
 'CC(C)n1c(\\C=C\\[C@H](O)C[C@H](O)CC(=O)O)c(c2ccc(F)cc2)c3ccccc13',
 'C\\C(=C(\\C#N)/C(=O)Nc1ccc(cc1)C(F)(F)F)\\O']

In [None]:
# --- Run configuration -------------------------------------------------------
email = 'your_email@mail.com'
script_path = './down_pdf.sh'
index_name = "hs-codes3"  # Change this to your desired index name
folder = "./research_papers"  # directory that is scanned for downloaded PDFs

# Create the Pinecone index only when it does not exist yet, so the cell can be
# re-run safely. 1536 is the vector size of text-embedding-ada-002 (MODEL);
# presumably create_embeddings uses that model -- confirm in demo_script.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

# One pass per SMILES string: search/download, then embed every PDF in `folder`.
for smiles in tqdm(list(smile_list["SMILES"][1:5])):

    # Last argument: percentage of the search results to download
    # (per the original author's note -- see demo_script for details).
    search_n_download(smiles, email, script_path, 10)

    pdf_files = [f for f in os.listdir(folder) if f.endswith('.pdf')]

    for file in tqdm(pdf_files):
        # Join with a real path separator. Plain string concatenation produced
        # "./research_papers<name>.pdf", a path that does not exist.
        file_path = os.path.join(folder, file)

        if os.path.getsize(file_path) == 0:
            # Nothing to embed in a zero-byte download; it is still removed below.
            print(f"File is empty: {file_path}")
        else:
            # The file name without extension is passed along as the paper tag.
            paper_name_tag = os.path.splitext(os.path.basename(file_path))[0]
            texts = process_pdf(file_path)
            embeddings = create_embeddings(texts)
            upsert_embeddings(index, embeddings, texts, name_integ=paper_name_tag)

        # Delete every PDF once handled so the next SMILES iteration does not
        # pick it up again, then release memory.
        os.remove(file_path)
        gc.collect()



## You now have a Pinecone index that you can search with your own queries, and on which you can build a chatbot that cites references from your collected papers.


In [None]:

# Read the OpenAI API key from the environment instead of hard-coding it
# (keys are created at platform.openai.com). A missing OPENAI_API_KEY
# variable raises KeyError immediately.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

model_name = 'text-embedding-ada-002'

# Embedding client handed to the LangChain vector store for query embedding.
embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

In [None]:


# LangChain wrapper around the existing Pinecone index.
# text_key names the metadata field that holds each chunk's raw text.
# The Pinecone key is read from the environment; never commit it in the notebook.
pinecone_vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embed,
    text_key="text",
    pinecone_api_key=os.environ["PINECONE_API_KEY"],
)

In [71]:


# Ask the vector store for the 5 most similar stored documents (k=5).
search_query = "Any interesting finding on betaine ?,cite the sources and pinecone database paper too"
documents = pinecone_vectorstore.similarity_search(query=search_query, k=5)

# Dump each hit's attributes, followed by an empty line as a separator.
for hit in documents:
    print(hit.__dict__)
    print()

In [None]:
## Now you can use this vector database to answer queries with citations to the sources.

In [72]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat-completion model that writes the final answer; temperature 0.0 is
# passed so sampling randomness is minimised.
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name='gpt-4.5-preview', temperature=0.0)

# Retrieval-augmented QA chain: the Pinecone-backed retriever supplies the
# documents and the chat model answers from them using the "stuff" chain type.
retriever = pinecone_vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

In [74]:
# The run of four spaces before "Cite" is part of the prompt text as sent.
betaine_question = (
    "Any interesting finding on betaine ?"
    "    Cite the sources and are congruent with data from pinecone database"
)
answer = qa.invoke(betaine_question)

In [75]:
# `print` is already rich's print (imported in the notebook's import cell),
# so no second import is needed here. 'result' holds the chain's answer text.
print(answer['result'])

In [76]:
# Same question template as the betaine query, this time about cd275.
# The run of four spaces before "Cite" is part of the prompt text as sent.
cd275_question = (
    "Any interesting finding on cd275 ?"
    "    Cite the sources and are congruent with data from pinecone database"
)
answer = qa.invoke(cd275_question)

print(answer['result'])