In [2]:
import os
import sys
import json
import random
import re
import string
import tqdm
import argparse
import numpy as np
import pandas as pd
from multiprocessing import Pool
#from functools import partial
from rouge_score import rouge_scorer
#from gpt3_api import make_requests as make_gpt3_requests
import transformers
from transformers import AutoTokenizer, GenerationConfig, LlamaForCausalLM, LlamaTokenizer

import torch
from peft import PeftModel

random.seed(42)

In [3]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

device = torch.device("cuda")
tokenizer = GPT2Tokenizer.from_pretrained("stanford-crfm/BioMedLM")
model = GPT2LMHeadModel.from_pretrained("stanford-crfm/BioMedLM").to(device)

In [4]:
args = {
    "batch_dir": "data/gpt3_generations/",
    "seed_tasks_path": "data/gtn-qa.jsonl",
    "num_instructions_to_generate": 1,
    "num_prompt_instructions": 1,
    "request_batch_size": 20,
    "use_clf_seed_tasks_only": False
}

print(args)

def encode_prompt(prompt_instructions, classification=False):
    """Encode multiple prompt instructions into a single string."""
    if classification:
        prompt = "Come up with a series of classification tasks. Try to specify the possible output labels when possible.\n"
    else:
        #prompt = "Come up with a series of tasks:\n"
        prompt = "Come up with a series of tasks: \n"
        #"Come up with a new question using scientific keywords mentioned in existing tasks: \n"
        #"Come up with a series of new tasks using scientific keywords mentioned in existing tasks: \n" 
        #"Come up with a series of new tasks: \n"
    for idx, instruction in enumerate(prompt_instructions):
        instruction = re.sub(r"\s+", " ", instruction).strip().rstrip(":")
        prompt += f"{idx+1}. {instruction}\n"
    return prompt


def sample_machine_instructions(machine_instructions, similarities, n):
    """Sample n machine instructions from a list of machine instructions."""
    return random.sample(machine_instructions, min(n, len(machine_instructions)))


def find_word_in_string(w, s):
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search(s)

{'batch_dir': 'data/gpt3_generations/', 'seed_tasks_path': 'data/gtn-qa.jsonl', 'num_instructions_to_generate': 1, 'num_prompt_instructions': 1, 'request_batch_size': 20, 'use_clf_seed_tasks_only': False}


In [5]:
def evaluate(
    instruction,
    input=None,
    temperature=0.1, #0.7, #0.1,
    top_p=0.75, #0.5, #0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128, #128,
    #max_tokens=1024,
    stream_output=False
):
    prompt = instruction
    print(prompt)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
    )

    generate_params = {
        "input_ids": input_ids,
        "generation_config": generation_config,
        "return_dict_in_generate": True,
        "output_scores": True,
        "max_new_tokens": max_new_tokens,
    }

    # Without streaming
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return output

In [6]:
def evaluate_pubmedgpt(instruction):
    input_ids = tokenizer.encode(
        instruction, return_tensors="pt").to(device)
    sample_output = model.generate(input_ids, do_sample=True, max_length=128, temperature=0.7, top_k=50)
    print("Output:\n" + 100 * "-")
    return tokenizer.decode(sample_output[0])

In [7]:
seed_tasks = [json.loads(l) for l in open(args["seed_tasks_path"], "r")]
if args["use_clf_seed_tasks_only"]:
    seed_tasks = [t for t in seed_tasks if t["is_classification"]]
seed_instructions = [t["instruction"] for t in seed_tasks]
print(f"Loaded {len(seed_instructions)} human-written seed instructions")

os.makedirs(args["batch_dir"], exist_ok=True)

Loaded 8 human-written seed instructions


In [None]:
machine_instructions = []
batch_inputs = []

for _ in range(args["request_batch_size"]):
    prompt_instructions = []
    prompt_instructions += random.sample(seed_instructions, args["num_prompt_instructions"] - len(prompt_instructions))
    random.shuffle(prompt_instructions)
    prompt = encode_prompt(prompt_instructions, classification=args["use_clf_seed_tasks_only"])
    print(evaluate_pubmedgpt(prompt))
    print("---------")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Are UMIs not specific to certain genes? Can the same UMI map to different genes?

2.  How can UMIs be more specific?

3.  Can UMIs be used to estimate the number of molecules of a gene in a sample?

4.  What is the most efficient way to count UMIs?

5.  Can UMIs be used for the absolute quantification of transcripts?

6.  Are UMIs able to accurately quantify transcripts with a low abundance?
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Why is it important to know which cell a read came from?

2. What are the characteristics of the cell?

3. What are the characteristics of the genome?

4. What are the characteristics of the transcriptome?

5. What are the characteristics of the proteome?

6. What is the purpose of the transcriptome?

7. How can you determine which cell's transcriptome is from which cell?

8. How can you determine which cell's genome is from which cell?

9. How can you
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. What do UMIs do?
---------------------

We have defined a set of UMIs for the capture of phenotypic information from electronic health records (EHRs) (see [Table 1](#t1-egems1238){ref-type="table"}). These UMIs define a set of data items and associated values that can be extracted from EHRs for the purpose of research. We have divided these UMIs into two groups: 1) data elements that are commonly collected in E
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Can the same UMI map to different mRNA molecules of the same gene?

2. Can different UMIs map to the same mRNA molecule?

3. Are all the UMIs in a single cell unique?

4. Are there multiple UMIs in a single cell which map to different mRNA molecules?

5. Can we use UMIs to obtain information about the number of mRNA molecules in a single cell?

6. Can we use UMIs to correct for PCR amplification bias?

7. Does
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Can the same UMI map to different mRNA molecules of the same gene?

2. Are the UMI sequences unique?

3. Is the UMI sequence in error (mismatch, deletion, insertion)?

4. Is the UMI present in the sequence read?

5. How many different mRNA molecules are in the sample?

6. Are there any biases in the cDNA synthesis?

7. Do the reads have the same length?

8. Do the UMI sequences match?

9
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Why do we need to barcode a read transcript too? Isn’t mapping it against the reference genome enough?

2. Which type of technology do you want to use?

3. What are the advantages and disadvantages of each technology?

4. What are the steps involved in the process of generating a barcode?

5. What are the costs for each technology?

6. What are your thoughts on how to make your technology available to the research community?


---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Are UMIs not specific to certain genes? Can the same UMI map to different genes?

2. Are there potential off-targets that can lead to artifacts?

3. Is there a way to determine the optimal number of PCR cycles?

4. How much sample is enough to get accurate results?

5. Is there a way to determine how much input material to use?

6. What is the optimal size of the PCR product?

7. Should the UMI be used for normalization
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Are UMIs not specific to certain genes? Can the same UMI map to different genes?

2. What is the error rate for UMIs? What is the percentage of UMIs that map to the wrong gene?

3. How many cells are there in each of the samples?

4. What are the number of UMIs that map to each gene?

5. Do the total numbers of genes expressed in each cell line match the expected number?

6. Is there any correlation with the
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. TB Variant Filter reads the VCF and output only SNPs that have, at least, 90% frequency. How can the software extract such information from the VCF datasets?

2.  TB Variant Filter output a series of SNPs and their corresponding frequencies. The user should be able to extract the most informative ones, according to the different criteria. For example, the user can choose to extract only non-synonymous SNPs, or only SNPs from exons or introns.

```{=html}
<!
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Why is it important to know which cell a read came from?
2. Where are the cells in the brain?

3. What is going on in the brain?

4. What are the functions of that brain?

5. What are the different cell types in the brain?

6. Where are they located?

7. Does it matter which cell type in the brain you investigate?

8. Which cells are you working with?

9. What is the most important question you want to answer
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Why is it important to know which cell a read came from?

2. How does your knowledge about a cell affect your ability to understand and interpret your data?

3. What are the different types of error in single-cell sequencing data?

Session IV: Understanding and interpreting single-cell sequencing data.

1. How do you interpret single-cell sequencing data?

2. How can data from different platforms be compared?

3. What is the relationship between different types of single-cell sequencing data?


---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Are UMIs not specific to certain genes? Can the same UMI map to different genes?

2. What is the quality of the raw data?

3. Is the protocol reproducible?

4. Is the protocol high-throughput?

5. Is the protocol affordable?

6. Is the protocol customizable?

7. Can the protocol be modified to target other types of genes or organisms?

8. Does the protocol enable single-cell genome sequencing?

9. Is there
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Can the same UMI map to different mRNA molecules of the same gene?

2. Can the same UMI map to different mRNA molecules of the same gene?

3. Can UMIs map to multiple genes?

To address these questions, we first constructed a custom UMI library from a human kidney reference RNA sample. We then sequenced 10 ng of this UMI library on the HiSeq4000 system to obtain single nucleotide resolution UMI counts.

In order to estimate the error rate of UMIs, we calculated the
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Can the same UMI map to different mRNA molecules of the same gene?

2. Does a single mRNA molecule give rise to different proteins?

3. How many different mRNA molecules can be generated from a single gene?

4. What is the frequency of occurrence of each of these mRNA molecules?

5. What is the frequency of occurrence of each amino acid within a protein?

6. What is the frequency of occurrence of each amino acid within a protein?

7. How do we know if a given protein sequence is
---------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


Output:
----------------------------------------------------------------------------------------------------
Come up with a series of tasks: 
1. Why is it important to know which cell a read came from?
2. What is your goal when processing a single cell?
---------
