In [15]:
import re
import warnings
from typing import List
 
import torch
from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.llms import HuggingFacePipeline
from langchain.schema import BaseOutputParser
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,
)
 
warnings.filterwarnings("ignore", category=UserWarning)

In [16]:
MODEL_NAME = "llama-2-7b-kbase-pangenome-epochs"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto"
)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model genration config
generation_config = model.generation_config
generation_config.temperature = 0
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = 3000
generation_config.use_cache = False
generation_config.repetition_penalty = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    task="text-generation",
    generation_config=generation_config,
)
 
llm = HuggingFacePipeline(pipeline=generation_pipeline)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
truth = """
# taxonomy
d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas_E;s__Pseudomonas_E syringae
# environments

# sequence type
GTDB
# start ordered protein list
U  30_UPI00200C438A 50_UPI00200C438A 70_UPI00200C438A 90_A0A0P9R368 PreQ0 transporter
U unannotated
U  30_J2RFE1 50_A0A010RQZ9 70_W0MUI5 90_A0A0P9ZR62 Cytochrome C assembly family protein (Fragment)
U  30_A0A4R1F0D9 50_A0A0B2DAA0 70_A0A5E7CXU7 90_A0A656GPL0 Signal recognition particle protein
U  30_A0A132HYN2 50_A0A2P6AVB6 70_A0A285P9N0 90_A0A285P9N0 Ribosome maturation factor RimM
U  30_A0A2G9ZS82 50_A0A1E4V626 70_Q886V1 90_Q886V1 tRNA (guanine-N(1)-)-methyltransferase
U  30_A0A7C9GXV8 50_A0A3A1YK24 70_Q02RL6 90_Q88MV4 50S ribosomal protein L19
U  30_K1ZJ70 50_A0A2H9T8K8 70_A0A8T3SGZ1 90_C3K1G8 Tyrosine recombinase XerD
U  30_A0A485EFJ8 50_A0A485EFJ8 70_A0A448DQI1 90_A0A379IAU0 Thiol:disulfide interchange protein
U  30_A0A1Q9R1N9 50_A0A1Q9R1N9 70_A0A1Q9R1N9 90_A0A3M5WCN7 Homoserine dehydrogenase
U  30_A0A7T4URI0 50_A0A7T4URI0 70_A0A481QN74 90_A0A3M6G4Q3 Threonine synthase
U  30_UPI0021AC3415 50_A0A1C3JPP1 70_A0A658K122 90_A0A658K122 histidine kinase (Fragment)
U  30_Q4ZWX9 50_Q4ZWX9 70_Q4ZWX9 90_F3G1T4 EAL:response regulator receiver
U  30_A0A2J7UK61 50_A0A0N8TGA2 70_A0A0N8TGA2 90_A0A2S4I431 DNA-binding response regulator, LuxR family
U  30_U6ZRM7 50_U6ZRM7 70_A0A244EP17 90_Q48LV1 TIGR02285 family protein
U  30_UPI0014123134 50_A0A0Q0C0P6 70_A0A0Q0C0P6 90_A0A0P9VE64 YaeQ
U  30_UPI0004916252 50_UPI00140D1F63 70_A0A917PTM9 90_A0A0Q0CTR3 Single-stranded-DNA-specific exonuclease RecJ
U  30_A0A1U9MH61 50_A0A3M3TGC5 70_A0A1Y6JS96 90_A0A1Y6JS96 NADPH dehydrogenase
U  30_UPI000FFC7370 50_A0A6N7CES7 70_A0A6M8M8T5 90_A0A0Q0DUE3 histidine kinase
U  30_A0A1X0N434 50_A0A1X0N434 70_A0A3M3WIY9 90_A0A3M3WIY9 Tellurite resistance TerB family protein
U  30_A0A2V1JYY2 50_UPI000F073E35 70_A0A3M4W4X2 RING-type E3 ubiquitin transferase
U  30_A0A0Q0EF41 50_A0A0Q0EF41 70_A0A8B4AJ84 90_A0A8B4AJ84 Histidine kinase, HAMP region:Bacterial chemotaxis sensory transducer
U  30_A0A4Q7ZAU3 50_A0A4Q7ZAU3 70_A0A3M5MDV5 90_Q4ZWW8 CheW-like protein
U  30_A0A1M7LSX3 50_A0A1M7LSX3 70_A0A1M7LSX3 90_Q4ZWW7 MCP methyltransferase, CheR-type
U  30_A0A0P9TLR3 50_A0A0P9TLR3 70_A0A0P9IFU5 90_A0A0N0VT88 CheW-like domain-containing protein
U  30_A0A3M3XFR6 50_A0A3M3XFR6 70_A0A3M3XFR6 histidine kinase
"""

In [14]:
%%time
print(llm("""I have some pangenome data, they are in the format # taxonomy, # environments, # sequence type and # some protein orders.
        There is some missing part marked by "???????" I need you to help me complete the full text
        the text is:
        # taxonomy
        d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas_E;s__Pseudomonas_E syringae
        # environments

        # sequence type
        GTDB
        # start ordered protein list
        U  30_UPI00200C438A 50_UPI00200C438A 70_UPI00200C438A 90_A0A0P9R368 ????
        U unannotated
        U  30_J2RFE1 50_A0A010RQZ9 70_W0MUI5 90_A0A0P9ZR62 ?????
        U  30_A0A4R1F0D9 50_A0A0B2DAA0 70_A0A5E7CXU7 90_A0A656GPL0 Signal recognition particle protein
        U  30_A0A485EFJ8 50_A0A485EFJ8 70_A0A448DQI1 90_A0A379IAU0 Thiol:disulfide interchange protein
        U  30_A0A1Q9R1N9 50_A0A1Q9R1N9 70_A0A1Q9R1N9 90_A0A3M5WCN7 Homoserine dehydrogenase
        U  30_A0A7T4URI0 50_A0A7T4URI0 70_A0A481QN74 90_A0A3M6G4Q3 Threonine synthase
        U  30_UPI0021AC3415 50_A0A1C3JPP1 70_A0A658K122 90_A0A658K122 histidine kinase (Fragment)
        U  30_Q4ZWX9 50_Q4ZWX9 70_Q4ZWX9 90_F3G1T4 EAL:response regulator receiver
        U  ????? DNA-binding response regulator, LuxR family
        U  30_U6ZRM7 50_U6ZRM7 70_A0A244EP17 90_Q48LV1 TIGR02285 family protein
        """))

?????
U  30_A0A1H2RRN9 50_A0A1H2RRN9 70_A0A1H2RRN9 90_A0A1H2RRN9 Uncharacterized protein
U  30_A0A1H2RRN9 50_A0A1H2RRN9 70_A0A1H2RRN9 90_A0A1H2RRN9 Uncharacterized protein
U  30_A0A1H2RRN9 50_A0A1H2RRN9 70_A0A1H2RRN9 90_A0A1H2RRN9 Uncharacterized protein
U  30_A0A1H2RRN9 50_A0A1H2RRN9 70_A0A1H2RRN9 90_A0A1H2RRN9 Uncharacterized protein
U  30_A0A1H2RRN9 50_A0A1H2RRN9 70_A0A1H2RRN9 90_A
CPU times: user 3min 41s, sys: 6.55 ms, total: 3min 41s
Wall time: 3min 41s
