In [1]:
import torch
import os
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceInferenceAPI, HuggingFaceLLM
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.core import Settings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.llama_cpp.llama_utils import messages_to_prompt,completion_to_prompt
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core import PromptTemplate

In [2]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
documents = SimpleDirectoryReader("./pdfs/").load_data()

In [4]:
query_str = "I'm providing you with a research paper your job is to summarizes the information within it."

query_wrapper_prompt = PromptTemplate(
    "Your job is to summarize different sections of the document given to you."
    "Write a response that appropriately completes the request given to you.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)

In [5]:
llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    #You can also use others LLMs of bigger size by using Quantization through bitsandbytes
    model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf',
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    #model_path="C:\Users\manje\AppData\Local\llama_index\models\mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    #How creative the llm can be while generating responses
    temperature=0.2, 
    max_new_tokens=256,
    context_window=4096,
    generate_kwargs={},
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from C:\Users\manje\AppData\Local\llama_index\models\mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_mo

llama_model_loader: - kv  13:                      tokenizer.ggml.tokens arr[str,32000]   = ["<unk>", "<s>", "</s>", "<0x00>", "<...
llama_model_loader: - kv  14:                      tokenizer.ggml.scores arr[f32,32000]   = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv  15:                  tokenizer.ggml.token_type arr[i32,32000]   = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv  16:                tokenizer.ggml.bos_token_id u32              = 1
llama_model_loader: - kv  17:                tokenizer.ggml.eos_token_id u32              = 2
llama_model_loader: - kv  18:            tokenizer.ggml.unknown_token_id u32              = 0
llama_model_loader: - kv  19:               general.quantization_version u32              = 2
llama_model_loader: - type  f32:   65 tensors
llama_model_loader: - type q4_K:  193 tensors
llama_model_loader: - type q6_K:   33 tensors
llm_load_vocab: special tokens definition check successful ( 259/32000 ).
llm_load_prin

In [6]:
embed_model = LangchainEmbedding(
  HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
)

In [7]:
Settings.llm = llm
Settings.node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=5,
    window_metadata_key="window",
    original_text_metadata_key="original_text").get_nodes_from_documents(documents)
Settings.text_splitter = SentenceSplitter(chunk_size=128,chunk_overlap=20)
Settings.embed_model = embed_model

In [8]:
index = VectorStoreIndex.from_documents(documents)

In [9]:
query_engine = index.as_query_engine(similarity_top_k=5,
    verbose=True,
    node_postprocessor=[MetadataReplacementPostProcessor("window")])
response = query_engine.query("Generate a summary about the abstract")
print(F"Response: \n {response}")


llama_print_timings:        load time =   46931.19 ms
llama_print_timings:      sample time =      60.04 ms /   205 runs   (    0.29 ms per token,  3414.33 tokens per second)
llama_print_timings: prompt eval time =   79987.56 ms /   854 tokens (   93.66 ms per token,    10.68 tokens per second)
llama_print_timings:        eval time =   30113.72 ms /   204 runs   (  147.62 ms per token,     6.77 tokens per second)
llama_print_timings:       total time =  111168.64 ms /  1058 tokens


Response: 
  The abstract discusses the performance comparison of Illumina and ion torrent next-generation sequencing platforms for 16S rRNA-based bacterial community profiling. The study found that Illumina had better accuracy and reproducibility compared to ion torrent. The authors also discuss the use of real-time PCR data analysis by the comparative C(T) method and its application in clinical practice. The article highlights the need for further investigation in the area of next-generation sequencing (NGS) for lung cancer diagnosis and treatment. The authors also discuss the use of NGS in the diagnosis of advanced, progressive non-small cell lung cancer (NSCLC) with different alterations such as ALK receptor tyrosine kinase and LUAD (lung adenocarcinoma). The study also mentions the use of beads, emulsions, amplification, and magnetics (BEAMing) for NGS analysis.


In [10]:
response = query_engine.query("Generate a summary about the Methodology")
print(F"Response: \n {response}")
response = query_engine.query("Generate a summary about the Results and conclusion")
print(F"Response: \n {response}")

Llama.generate: prefix-match hit

llama_print_timings:        load time =   46931.19 ms
llama_print_timings:      sample time =      48.45 ms /   160 runs   (    0.30 ms per token,  3302.31 tokens per second)
llama_print_timings: prompt eval time =   61116.41 ms /   653 tokens (   93.59 ms per token,    10.68 tokens per second)
llama_print_timings:        eval time =   23222.35 ms /   159 runs   (  146.05 ms per token,     6.85 tokens per second)
llama_print_timings:       total time =   85171.69 ms /   812 tokens
Llama.generate: prefix-match hit


Response: 
  The article discusses the importance of analyzing real-time PCR data using the comparative C(T) method, which was first described by Schmittgen and Livak in 2008. The article also emphasizes the importance of internal validation and ongoing proficiency testing of in-house methods (laboratory-developed tests) to avoid disparities in the reliability of every kind of platform. The article recommends that the liquid biopsy report should include the platform used and all the findings of the molecular analysis. Additionally, the article mentions the use of next-generation sequencing (NGS) in analyzing large panels of targetable genetic abnormalities, which can provide added value by obtaining further useful information from the same specimen.



llama_print_timings:        load time =   46931.19 ms
llama_print_timings:      sample time =      47.44 ms /   157 runs   (    0.30 ms per token,  3309.16 tokens per second)
llama_print_timings: prompt eval time =   53254.11 ms /   564 tokens (   94.42 ms per token,    10.59 tokens per second)
llama_print_timings:        eval time =   22840.56 ms /   156 runs   (  146.41 ms per token,     6.83 tokens per second)
llama_print_timings:       total time =   76910.82 ms /   720 tokens


Response: 
  The appraisal of the current state of liquid biopsy in oncology was conducted to review strengths and weaknesses, describe what is already in clinical practice, and identify areas in need of further investigation. The report of the molecular alterations investigated in ctDNA should be thorough, complete, yet easy to interpret for optimizing therapy. The liquid biopsy report should provide information on dozens of targetable genetic abnormalities simultaneously, giving added value by obtaining further useful information from the same specimen. This is particularly relevant considering the expected rise in the enrollment of basket trials or in expanded access programs requiring molecular assignment. The study also highlights the importance of circulating mutant DNA in assessing tumor dynamics and its potential use in oncology.
