In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
# os.environ['CURL_CA_BUNDLE'] = ''
# os.environ['CA_CERTS'] = '/Users/zhou/Dev_Work/ask_pdf/http_ca.crt'
from langchain.document_loaders import PyMuPDFLoader
from langchain import ElasticVectorSearch
from langchain.embeddings import OpenAIEmbeddings
from langchain.callbacks import get_openai_callback
import yaml

# Load the OpenAI key and the Elasticsearch URL from the local YAML config file.
with open("./config.yml") as config_file:
    cfg = yaml.safe_load(config_file)

OPENAI_API_KEY, ELASTICSEARCH_URL = (
    cfg["openai"]["OPENAI_API_KEY"],
    cfg["es"]["elasticsearch_url"],
)
# Embedding client handed to the vector store further down.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


In [2]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain


In [3]:
# Chat model reused by the chains built in this notebook.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0.7,
)
# TODO: open question left by the author - can return_source_documents=True be passed here?
chain = load_qa_chain(llm, chain_type="stuff")


In [4]:
# Vector store handle for the "company" index on the configured Elasticsearch instance.
docsearch = ElasticVectorSearch(
    embedding=embeddings,
    index_name="company",
    elasticsearch_url=ELASTICSEARCH_URL,
)

In [27]:
# Question (Chinese for "What antibodies are there?"); reused below as the QA chain's question.
query = "有哪些抗体"
# Retrieve the stored chunks most similar to the question.
docs = docsearch.similarity_search(query=query, include_metadata=True)
# Bare expression so the notebook displays the retrieved documents.
docs


[Document(page_content='The primary antibodies used included: rabbit anti-HDAC1 (Abclonal, China), mouse anti-Ac-lysine (Santa Cruz, USA), rabbit anti-acetyl-H3-K27 (Abclonal, China), rabbit anti-acetyl-H3-K9 (Abclonal, China), mouse anti-histone H3 (Beyotime), mouse anti-β-actin (Beyotime), rabbit anti-ISG15 (Beyotime), rabbit anti-Sp1 (Abclonal, China), rabbit anti-Flag tag (DYKDDDDK; CST), mouse anti-HA tag (CST), mouse anti-PEDV N monoclonal antibody (stored in our laboratory) ( 56 ).\n\nThe following primary antibodies were used: anti-ZO-1 (1:1000, #5406, Cell Signaling Technology, Danvers, MA, USA), anti-claudin-5 (1:1000, A10207, ABclonal, Wuhan, China), anti-NLRP3 (1:1000, A14223, ABclonal), anti-caspase-1 (1:1000, A0964, ABclonal), anti-IL-1β (1:1000, A16288, ABclonal), anti-Gasdermin D (GSDMD, #96458, 1:1000, Cell Signaling Technology), anti-SLC5A1 (1:1000, #5042, Cell Signaling Technology) and anti-SLC5A2 (1:1000, #14210, Cell Signaling Technology).', metadata={'source': '/U

In [28]:
# Run the QA chain under the OpenAI callback so token usage and cost are recorded on `cb`.
with get_openai_callback() as cb:
    res = chain.run(question=query, input_documents=docs)

# Print the answer first, then the usage figures collected during the call.
print(res)
print(f"Total Tokens: {cb.total_tokens}")
print(f"Prompt Tokens: {cb.prompt_tokens}")
print(f"Completion Tokens: {cb.completion_tokens}")
print(f"Total Cost (USD): ${cb.total_cost}")

1. rabbit anti-HDAC1
2. mouse anti-Ac-lysine
3. rabbit anti-acetyl-H3-K27
4. rabbit anti-acetyl-H3-K9
5. mouse anti-histone H3
6. mouse anti-β-actin
7. rabbit anti-ISG15
8. rabbit anti-Sp1
9. rabbit anti-Flag tag
10. mouse anti-HA tag
11. anti-ZO-1
12. anti-claudin-5
13. anti-NLRP3
14. anti-caspase-1
15. anti-IL-1β
16. anti-Gasdermin D (GSDMD)
17. anti-SLC5A1
18. anti-SLC5A2
19. NAT8
20. NAT8B
21. SLC33A1
22. GFP
23. Flag
24. anti-LIF
25. Anti-CD36
26. Anti-GAPDH
27. Anti-FTO
28. Anti-P65-S536
29. Anti-IKB-α
30. NLRP3
31. Trx
32. TXNIP
33. NOX4
34. c-MET Rabbit mAb
35. SLC7A11/xCT Rabbit mAb
36. GPX4 Monoclonal antibody
37. Anti-beta Tubulin antibody
38. anti-H3K4me1
39. anti-H3K4me2
40. anti-H3K4me3
41. rabbit monoclonal antibody against human ACTB
42. Antibodies against GOLGA2
43. Antibodies against CANX
44. Antibodies against CD63
45. Antibodies against CD81
Total Tokens: 1689
Prompt Tokens: 1313
Completion Tokens: 376
Total Cost (USD): $0.003378


<!-- Summarization (总结) -->

In [13]:
# Generic prompt used to pull chunks for the summarization step below.
query = "The Most Representative Paragraph."
# Unfiltered similarity search over the index behind `docsearch`.
docs1 = docsearch.similarity_search(query, include_metadata=True)
# Display only the first hit.
docs1[0]

[Document(page_content='Molecular Cell, Volume 65\nSupplemental Information\nComparative Analysis\nof Single-Cell RNA Sequencing Methods\nChristoph Ziegenhain, Beate Vieth, Swati Parekh, Björn Reinius, Amy Guillaumet-\nAdkins, Martha Smets, Heinrich Leonhardt, Holger Heyn, Ines Hellmann, and Wolfgang\nEnard', metadata={'source': '/Users/zhou/Dev_Work/ask_pdf/mmc2.pdf', 'file_path': '/Users/zhou/Dev_Work/ask_pdf/mmc2.pdf', 'page_number': 19, 'total_pages': 33, 'format': 'PDF 1.7', 'title': 'Comparative Analysis of Single-Cell RNA Sequencing Methods', 'author': 'Christoph Ziegenhain', 'subject': 'Molecular Cell, 65 (2017) 631-647. doi:10.1016/j.molcel.2017.01.023', 'keywords': '', 'creator': 'Elsevier', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creationDate': "D:20170211005808+05'30'", 'modDate': "D:20170211005907+05'30'", 'trapped': ''}),
 Document(page_content='Article\nComparative Analysis of Single-Cell RNA\nSequencing Methods\nGraphical Abstract\nHighlights\nd The study repr

In [14]:
from langchain.chains.summarize import load_summarize_chain

# Use a dedicated name for the summarization chain. Rebinding `chain` here would
# overwrite the question-answering chain created earlier, which a later cell
# still calls with `input_documents=` and `question=`.
summarize_chain = load_summarize_chain(llm, chain_type="stuff")
with get_openai_callback() as cb:
    # Summarize the retrieved chunks and show the result via the notebook's rich display.
    display(summarize_chain.run(docs1))
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Total Cost (USD): ${cb.total_cost}")

'The article presents a comparative analysis of six prominent single-cell RNA sequencing methods, using data from mouse ESCs to evaluate their sensitivity and precision, and power simulations to compare their cost efficiency. The study offers an informed choice among existing protocols and provides a framework for future comparisons and benchmarking.'

Total Tokens: 770
Prompt Tokens: 713
Completion Tokens: 57
Total Cost (USD): $0.00154


<!-- Q&A restricted to a specified document (指定路径问答) -->

In [37]:
from typing import Any, Dict, Iterable, List, Optional, Tuple
from langchain.docstore.document import Document


def _default_script_query(query_vector: List[float], filter: Optional[dict]) -> Dict:
    if filter is None:
        filter = {"match_all": {}}
    else:
        (key, value), = filter.items()
        filter = {"match": {f"metadata.{key}.keyword": f"{value}"}}
    return {
        "script_score": {
            "query": filter,
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                "params": {"query_vector": query_vector},
            },
        }
    }


def similarity_search_with_score(
    self,
    query: str,
    k: int = 4,
    filter: Optional[dict] = None,
    **kwargs: Any
) -> List[Tuple[Document, float]]:
    """Return the documents most similar to ``query`` together with their scores.

    Args:
        query: Text to look up documents similar to.
        k: Number of documents to return. Defaults to 4.
        filter: Optional metadata filter forwarded to ``_default_script_query``.
            ``None`` means no filtering.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        List of ``(Document, score)`` tuples, one per hit, where ``score`` is
        the hit's ``_score`` from the Elasticsearch response.
    """
    embedding = self.embedding.embed_query(query)
    # `filter` is a named parameter, so it can never appear in `kwargs`;
    # pass it through directly instead of looking it up there.
    script_query = _default_script_query(embedding, filter)
    response = self.client.search(
        index=self.index_name, query=script_query, size=k)
    return [
        (
            Document(page_content=hit["_source"]["text"],
                     metadata=hit["_source"]["metadata"]),
            hit["_score"],
        )
        for hit in response["hits"]["hits"]
    ]


# Attach the function defined above to ElasticVectorSearch so that instances
# (such as `docsearch`) expose it as a method.
ElasticVectorSearch.similarity_search_with_score = similarity_search_with_score


In [43]:
# Same generic prompt as the summarization step.
query = "The Most Representative Paragraph."
# Call the patched method, passing a metadata filter on the `title` field.
docs2 = docsearch.similarity_search_with_score(query, filter={"title": "Comparative Analysis of Single-Cell RNA Sequencing Methods"})
# Display the first (Document, score) pair.
docs2[0]

(Document(page_content='Molecular Cell, Volume 65\nSupplemental Information\nComparative Analysis\nof Single-Cell RNA Sequencing Methods\nChristoph Ziegenhain, Beate Vieth, Swati Parekh, Björn Reinius, Amy Guillaumet-\nAdkins, Martha Smets, Heinrich Leonhardt, Holger Heyn, Ines Hellmann, and Wolfgang\nEnard', metadata={'source': '/Users/zhou/Dev_Work/ask_pdf/mmc2.pdf', 'file_path': '/Users/zhou/Dev_Work/ask_pdf/mmc2.pdf', 'page_number': 19, 'total_pages': 33, 'format': 'PDF 1.7', 'title': 'Comparative Analysis of Single-Cell RNA Sequencing Methods', 'author': 'Christoph Ziegenhain', 'subject': 'Molecular Cell, 65 (2017) 631-647. doi:10.1016/j.molcel.2017.01.023', 'keywords': '', 'creator': 'Elsevier', 'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creationDate': "D:20170211005808+05'30'", 'modDate': "D:20170211005907+05'30'", 'trapped': ''}),
 1.7538588)

In [42]:
# NOTE(review): `chain` is expected to be the question-answering chain here
# (a `question` is passed) - confirm it has not been rebound before this cell runs.
with get_openai_callback() as cb:
    # Drop the scores: only the Document half of each (Document, score) pair is passed on.
    res = chain.run(question=query, input_documents=[doc for doc, _ in docs2])

# Print the answer first, then the usage figures collected during the call.
print(res)
print(f"Total Tokens: {cb.total_tokens}")
print(f"Prompt Tokens: {cb.prompt_tokens}")
print(f"Completion Tokens: {cb.completion_tokens}")
print(f"Total Cost (USD): ${cb.total_cost}")

The most representative paragraph is the "In Brief" section, which summarizes the key points of the study. It states that the authors evaluated six prominent scRNA-seq methods using data from mouse ESCs, and used power simulations to compare cost efficiencies. The study provides an informed choice among existing protocols and a framework for future comparisons.
Total Tokens: 815
Prompt Tokens: 749
Completion Tokens: 66
Total Cost (USD): $0.00163
