In [3]:
import re
import warnings
from typing import List
 
import torch
from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.llms import HuggingFacePipeline
from langchain.schema import BaseOutputParser
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,
)
 
# Keep UserWarning messages out of the cell output.
warnings.filterwarnings("ignore", category=UserWarning)

# Checkpoint identifier handed to `from_pretrained` for both the model and the tokenizer.
MODEL_NAME = "pangenome-with-cluster-id"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto"
)
model = model.eval()  # inference only: puts dropout-style layers into eval mode
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Model generation config (mutates the config object attached to the model).
generation_config = model.generation_config
# Deterministic (greedy) decoding is requested explicitly via `do_sample`.
# `temperature = 0` is not a valid way to ask for it: the value is ignored when
# sampling is off and rejected ("strictly positive float") when sampling is on.
generation_config.do_sample = False
generation_config.temperature = 1.0  # neutral value; unused while do_sample is False
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = 1000
# Reuse past key/values between decoding steps. With the cache disabled every new
# token re-encodes the whole prefix, which is very slow for up to 1000 new tokens.
generation_config.use_cache = True
generation_config.repetition_penalty = 1.0  # 1.0 means "no penalty"
# Pad with EOS and stop on EOS.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # Return prompt + continuation from the pipeline.
    # NOTE(review): presumably the LangChain wrapper strips the prompt itself — confirm
    # against the installed langchain version before changing this flag.
    return_full_text=True,
    generation_config=generation_config,
)

# LangChain LLM wrapper around the transformers pipeline; called as `llm(prompt)`.
llm = HuggingFacePipeline(pipeline=generation_pipeline)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
%%time
print(llm("""Instruction: I have some pangenome data, they are in the format taxonomy, environments, and protein sequence.
        There is some missing part marked by "???" I need you to help me complete the full text
        the text is:
         "{'taxonomy': 'd__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas_E;s__Pseudomonas_E putida_B', 
         'environments': ['Host-associated:Plants:Phyllosphere:Unclassified:Unclassified', 'Host-associated:Plants:Roots:Rhizosphere:Unclassified'], 
         'ordered protein lists': [[{'pangenome id': ???, 'is core gene': False, 'function': 'methyl-accepting chemotaxis protein', 'annotations': ['UPI001C93362B', 'UPI001C93362B']}, 
         {'pangenome id': 'POCILYQJ_mmseqsCluster_3241', 'is core gene': True, 'function': 'methyl-accepting chemotaxis protein', 'annotations': ['UPI001C3EED92', 'UPI0016469710', 'UPI0016469710']}, 
         {'pangenome id': 'POCILYQJ_mmseqsCluster_5849', 'is core gene': True, 'function': ???, 'annotations': ['A0A2E5PLB0', 'A0A2E5PLB0', 'A0A109LDZ0', 'A0A089WPK3']}}
        
        The output should be
         "{'taxonomy': 'd__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas_E;s__Pseudomonas_E putida_B', 
         'environments': ['Host-associated:Plants:Phyllosphere:Unclassified:Unclassified', 'Host-associated:Plants:Roots:Rhizosphere:Unclassified'], 
         'ordered protein lists': [[{'pangenome id': 'POCILYQJ_mmseqsCluster_0833', 'is core gene': False, 'function': 'methyl-accepting chemotaxis protein', 'annotations': ['UPI001C93362B', 'UPI001C93362B']}, 
         {'pangenome id': 'POCILYQJ_mmseqsCluster_3241', 'is core gene': True, 'function': 'methyl-accepting chemotaxis protein', 'annotations': ['UPI001C3EED92', 'UPI0016469710', 'UPI0016469710']}, 
         {'pangenome id': 'POCILYQJ_mmseqsCluster_5849', 'is core gene': True, 'function': 'Peptide ABC transporter ATP-binding protein', 'annotations': ['A0A2E5PLB0', 'A0A2E5PLB0', 'A0A109LDZ0', 'A0A089WPK3']}}
        
        new text is:
        "{'taxonomy': 'd__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas_E;s__Pseudomonas_E sp003050925', 
        'environments': ['Environmental:Terrestrial:Soil:Agricultural land:Unclassified', 'Host-associated:Plants:Roots:Endosphere:Unclassified', 'Host-associated:Plants:Roots:Rhizosphere:Soil'], 
        'ordered protein lists': [[{'pangenome id': 'HNDJSHKM_mmseqsCluster_2334', 'is core gene': True, 'function': ???, 'annotations': ['A0A839T3N7', 'A0A839T3N7', 'UPI001C577F44', 'A0A5E6RAG6']},
        
        output Completed data: 
        """))

 "{'taxonomy': 'd__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas_E;s__Pseudomonas_E sp003050925', 
        'environments': ['Host-associated:Plants:Phyllosphere:Unclassified:Unclassified', 'Host-associated:Plants:Roots:Rhizosphere:Unclassified', 'Host-associated:Plants:Roots:Endosphere:Unclassified', 'Host-associated:Plants:Roots:Rhizosphere:Soil'], 
        'ordered protein lists': [[{'pangenome id': 'HNDJSHKM_mmseqsCluster_2334', 'is core gene': True, 'function':???, 'annotations': ['A0A839T3N7', 'A0A839T3N7', 'A0A5E6RAG6', 'A0A5E6RAG6']},
        {'pangenome id': 'POCILYQJ_mmseqsCluster_0833', 'is core gene': False, 'function':'methyl-accepting chemotaxis protein', 'annotations': ['UPI001C93362B', 'UPI001C93362B']},
        {'pangenome id': 'POCILYQJ_mmseqsCluster_3241', 'is core gene': True, 'function':'methyl-accepting chemotaxis protein', 'annotations': ['UPI001C3EED92', 'UPI0016469710', 'UPI0016469710']},
        {'pangen