In [1]:
import os
import sys
import json
import random
import re
import string
import tqdm
import argparse
import numpy as np
import pandas as pd
from multiprocessing import Pool
from rouge_score import rouge_scorer

import transformers
from transformers import AutoTokenizer, GenerationConfig, LlamaForCausalLM, LlamaTokenizer

import torch
from peft import PeftModel

random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

device = torch.device("cuda")
tokenizer = GPT2Tokenizer.from_pretrained("stanford-crfm/BioMedLM")
model = GPT2LMHeadModel.from_pretrained("stanford-crfm/BioMedLM").to(device)

In [3]:
args = {
    "batch_dir": "data/gpt3_generations/",
    "seed_tasks_path": "data/help-galaxy-qa.jsonl",
    "num_instructions_to_generate": 1,
    "num_prompt_instructions": 1,
    "request_batch_size": 20,
    "use_clf_seed_tasks_only": False
}

print(args)

def encode_prompt(prompt_instructions, classification=False):
    """Encode multiple prompt instructions into a single string."""
    if classification:
        prompt = "Come up with a series of classification tasks. Try to specify the possible output labels when possible.\n"
    else:
        #prompt = "Come up with a series of tasks:\n"
        prompt = "Come up with a series of tasks: \n"
        #"Come up with a new question using scientific keywords mentioned in existing tasks: \n"
        #"Come up with a series of new tasks using scientific keywords mentioned in existing tasks: \n" 
        #"Come up with a series of new tasks: \n"
    for idx, instruction in enumerate(prompt_instructions):
        instruction = re.sub(r"\s+", " ", instruction).strip().rstrip(":")
        prompt += f"{idx+1}. {instruction}\n"
    return prompt


def sample_machine_instructions(machine_instructions, similarities, n):
    """Sample n machine instructions from a list of machine instructions."""
    return random.sample(machine_instructions, min(n, len(machine_instructions)))


def find_word_in_string(w, s):
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search(s)

{'batch_dir': 'data/gpt3_generations/', 'seed_tasks_path': 'data/help-galaxy-qa.jsonl', 'num_instructions_to_generate': 1, 'num_prompt_instructions': 1, 'request_batch_size': 20, 'use_clf_seed_tasks_only': False}


In [4]:
def evaluate(
    instruction,
    input=None,
    temperature=0.1, #0.7, #0.1,
    top_p=0.75, #0.5, #0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128, #128,
    #max_tokens=1024,
    stream_output=False
):
    prompt = instruction
    print(prompt)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
    )

    generate_params = {
        "input_ids": input_ids,
        "generation_config": generation_config,
        "return_dict_in_generate": True,
        "output_scores": True,
        "max_new_tokens": max_new_tokens,
    }

    # Without streaming
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return output #prompter.get_response(output)

In [5]:
def evaluate_pubmedgpt(instruction):
    input_ids = tokenizer.encode(
        instruction, return_tensors="pt").to(device)
    sample_output = model.generate(input_ids, do_sample=True, max_length=128, temperature=0.7, top_k=50)
    print("Output:\n" + 100 * "-")
    return tokenizer.decode(sample_output[0])

In [6]:
seed_tasks = [json.loads(l) for l in open(args["seed_tasks_path"], "r")]
if args["use_clf_seed_tasks_only"]:
    seed_tasks = [t for t in seed_tasks if t["is_classification"]]
seed_instructions = [t["instruction"] for t in seed_tasks]
print(f"Loaded {len(seed_instructions)} human-written seed instructions")

os.makedirs(args["batch_dir"], exist_ok=True)

Loaded 5 human-written seed instructions


In [7]:
machine_instructions = []
batch_inputs = []
for _ in range(args["request_batch_size"]):
    prompt_instructions = []
    prompt_instructions += random.sample(seed_instructions, args["num_prompt_instructions"] - len(prompt_instructions))
    random.shuffle(prompt_instructions)
    prompt = encode_prompt(prompt_instructions, classification=args["use_clf_seed_tasks_only"])
    print(evaluate_pubmedgpt(prompt))
    print("---------")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I setup a bed file (see below), but it does not work. I wonder how to correct it? Thanks. HPV16 1024 1043 HPV_1_LEFT 1 + GCDCARGGDCAYAAYAATGG HPV16 1183 1207 HPV_1_RIGHT 1 - GAAAAATAAACTGTAAATCATATTC

2. I'm going to show you a file of the HPV16 reference genome (NC_001526.2) and a file of the HPV
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I setup a bed file (see below), but it does not work. I wonder how to correct it? Thanks. HPV16 1024 1043 HPV_1_LEFT 1 + GCDCARGGDCAYAAYAATGG HPV16 1183 1207 HPV_1_RIGHT 1 - GAAAAATAAACTGTAAATCATATTC

2. To analyze the next sequence you will need a list of features that contain the information you want to extract. This list can be found on the HPV webs
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I have just done a set of nanopore sequencing with PCR based cDNAseq. Any suggestion for analysis? Can minimap2 do this job. I have the genomic assembly/interested contigs.

2. I would like to see a discussion about the fact that the authors chose to use a reference genome assembly that is more distant in evolutionary history to the species you are looking at. Is that why they have to use a reference genome assembly that is more distant?

3. If you are going to make a comparison
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. The Shovill tool does not work: Loading tool toolshed.g2.bx.psu.edu/repos/iuc/shovill/shovill/1.1.0+galaxy1 failed: Error: Internal Server Error (500). I wonder how it can go back to normal?

2.  The Shovill tool does not work: Loading tool toolshed.g2.bx.psu.edu/repos/iuc
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. The Shovill tool does not work: Loading tool toolshed.g2.bx.psu.edu/repos/iuc/shovill/shovill/1.1.0+galaxy1 failed: Error: Internal Server Error (500). I wonder how it can go back to normal?

2.  \- What is the time required to run the Shovill tool?

3.  \- How would the Shovill tool perform on
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. The Shovill tool does not work: Loading tool toolshed.g2.bx.psu.edu/repos/iuc/shovill/shovill/1.1.0+galaxy1 failed: Error: Internal Server Error (500). I wonder how it can go back to normal?

3.  This is not a proper benchmark: A simple dataset is used, which is not representative of a real application.

4.  The results are not reported:
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
Input length of input_ids is 179, but `max_length` is set to 128. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I setup a bed file (see below), but it does not work. I wonder how to correct it? Thanks. HPV16 1024 1043 HPV_1_LEFT 1 + GCDCARGGDCAYAAYAATGG HPV16 1183 1207 HPV_1_RIGHT 1 - GAAAAATAAACTGTAAATCATATTC
2. I am not sure what to do with the output file that I got from the command line. Thanks. HPV_1_LEFT
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I am trying to run the second step of quantification (i.e. quantifier) following the mapper module of mirDeep2. In the mapping step everything is fine and I get the collapsed reads to be given to the quantifier module. Then, when trying to run the quantifier module, providing mature, precursor and star sequences, the run ends with an error which apparently lacks any suggestions on where the error is. Any clues on what’s going on here? I see a lot of unmapped sequences but this is somewhat expected from a biological point of view as the miRNA fraction is not usually the most abundant in the tissue I am investigating (zebrafish gonads)


---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
Input length of input_ids is 179, but `max_length` is set to 128. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I setup a bed file (see below), but it does not work. I wonder how to correct it? Thanks. HPV16 1024 1043 HPV_1_LEFT 1 + GCDCARGGDCAYAAYAATGG HPV16 1183 1207 HPV_1_RIGHT 1 - GAAAAATAAACTGTAAATCATATTC

2. My job is to check the quality of the data. The HPV16-genome has about 7000 bases.

I have a problem with the
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
Input length of input_ids is 187, but `max_length` is set to 128. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I am trying to run the second step of quantification (i.e. quantifier) following the mapper module of mirDeep2. In the mapping step everything is fine and I get the collapsed reads to be given to the quantifier module. Then, when trying to run the quantifier module, providing mature, precursor and star sequences, the run ends with an error which apparently lacks any suggestions on where the error is. Any clues on what’s going on here? I see a lot of unmapped sequences but this is somewhat expected from a biological point of view as the miRNA fraction is not usually the most abundant in the tissue I am investigating (zebrafish gonads)


---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Hello, galaxy runs properly from my server, but when I try to access ir from local network I got error: “No supported WebSocket library detected. Please use ‘pip install uvicorn[standard]’, or install ‘websockets’ or ‘wsproto’ manually.” I think I have a problem with proxy_pass parameter: http://unix:/srv/galaxy/var/gunicorn.sock; Can you help me to properly configure this value? My galaxy is located in directory /home/biodata/galaxy. Should I change http://unix: to http://localhost:8080 or other value?


---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I setup a bed file (see below), but it does not work. I wonder how to correct it? Thanks. HPV16 1024 1043 HPV_1_LEFT 1 + GCDCARGGDCAYAAYAATGG HPV16 1183 1207 HPV_1_RIGHT 1 - GAAAAATAAACTGTAAATCATATTC

2\. Then, I opened a new folder, in which I added a new sample name, "Unique" (you can change that
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I setup a bed file (see below), but it does not work. I wonder how to correct it? Thanks. HPV16 1024 1043 HPV_1_LEFT 1 + GCDCARGGDCAYAAYAATGG HPV16 1183 1207 HPV_1_RIGHT 1 - GAAAAATAAACTGTAAATCATATTC

2\. I put the program to run to generate a list of HPV16 variants, and it does not work. Thanks. HPV16 1024
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I setup a bed file (see below), but it does not work. I wonder how to correct it? Thanks. HPV16 1024 1043 HPV_1_LEFT 1 + GCDCARGGDCAYAAYAATGG HPV16 1183 1207 HPV_1_RIGHT 1 - GAAAAATAAACTGTAAATCATATTC

2. I will load the sequences of the HPV16 and HPV18 genomes into the Bowtie 2 version 2.3.4.1. I
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. The Shovill tool does not work: Loading tool toolshed.g2.bx.psu.edu/repos/iuc/shovill/shovill/1.1.0+galaxy1 failed: Error: Internal Server Error (500). I wonder how it can go back to normal?

2.  Rephrase the description of the Shovill tool:

3.  How do you use the tool?

4.  What does the tool
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
Input length of input_ids is 179, but `max_length` is set to 128. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. The Shovill tool does not work: Loading tool toolshed.g2.bx.psu.edu/repos/iuc/shovill/shovill/1.1.0+galaxy1 failed: Error: Internal Server Error (500). I wonder how it can go back to normal?

2.  The Shovill tool needs to be developed further: Loading tool toolshed.g2.bx.psu.edu/repos/
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
Input length of input_ids is 179, but `max_length` is set to 128. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I am trying to run the second step of quantification (i.e. quantifier) following the mapper module of mirDeep2. In the mapping step everything is fine and I get the collapsed reads to be given to the quantifier module. Then, when trying to run the quantifier module, providing mature, precursor and star sequences, the run ends with an error which apparently lacks any suggestions on where the error is. Any clues on what’s going on here? I see a lot of unmapped sequences but this is somewhat expected from a biological point of view as the miRNA fraction is not usually the most abundant in the tissue I am investigating (zebrafish gonads)


---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I am trying to run the second step of quantification (i.e. quantifier) following the mapper module of mirDeep2. In the mapping step everything is fine and I get the collapsed reads to be given to the quantifier module. Then, when trying to run the quantifier module, providing mature, precursor and star sequences, the run ends with an error which apparently lacks any suggestions on where the error is. Any clues on what’s going on here? I see a lot of unmapped sequences but this is somewhat expected from a biological point of view as the miRNA fraction is not usually the most abundant in the tissue I am investigating (zebrafish gonads)


---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
Input length of input_ids is 179, but `max_length` is set to 128. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I setup a bed file (see below), but it does not work. I wonder how to correct it? Thanks. HPV16 1024 1043 HPV_1_LEFT 1 + GCDCARGGDCAYAAYAATGG HPV16 1183 1207 HPV_1_RIGHT 1 - GAAAAATAAACTGTAAATCATATTC

2. I also want to change the format of the sequence and the header of the file. Thanks. HPV16 1024 1043 HPV_
---------
Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. I am trying to run the second step of quantification (i.e. quantifier) following the mapper module of mirDeep2. In the mapping step everything is fine and I get the collapsed reads to be given to the quantifier module. Then, when trying to run the quantifier module, providing mature, precursor and star sequences, the run ends with an error which apparently lacks any suggestions on where 