In [1]:
import argparse
import math
from tqdm import tqdm
from Bio.SeqIO.FastaIO import Seq, SeqRecord
import lmdb

from typing import Union, List, Tuple, Sequence, Dict, Any, Optional, Collection
from copy import copy
from pathlib import Path
import pickle as pkl
import logging
import random

import json
import lmdb
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from scipy.spatial.distance import pdist, squareform


class LMDBDataset(Dataset):
    """Creates a dataset from an lmdb file.
    Args:
        data_file (Union[str, Path]): Path to lmdb file.
        in_memory (bool, optional): Whether to load the full dataset into memory.
            Default: False.
    """

    def __init__(self,
                 data_file: Union[str, Path],
                 in_memory: bool = False):

        data_file = Path(data_file)
        if not data_file.exists():
            raise FileNotFoundError(data_file)

        env = lmdb.open(str(data_file), max_readers=1, readonly=True,
                        lock=False, readahead=False, meminit=False)

        with env.begin(write=False) as txn:
            num_examples = pkl.loads(txn.get(b'num_examples'))

        if in_memory:
            cache = [None] * num_examples
            self._cache = cache

        self._env = env
        self._in_memory = in_memory
        self._num_examples = num_examples

    def __len__(self) -> int:
        return self._num_examples

    def __getitem__(self, index: int):
        if not 0 <= index < self._num_examples:
            raise IndexError(index)

        if self._in_memory and self._cache[index] is not None:
            item = self._cache[index]
        else:
            with self._env.begin(write=False) as txn:
                item = pkl.loads(txn.get(str(index).encode()))
                if 'id' not in item:
                    item['id'] = str(index)
                if self._in_memory:
                    self._cache[index] = item
        return item


In [20]:
# parser = argparse.ArgumentParser(description='Convert an lmdb file into a fasta file')
# parser.add_argument('lmdbfile', type=str, help='The lmdb file to convert')
# parser.add_argument('fastafile', type=str, help='The fasta file to output')
# args = parser.parse_args()
# lmdbfile = args.lmdbfile
# fastafile = args.fastafile

In [44]:
def calc(item):
    # protein_length = len(item['primary'])
    token_ids = item['primary']
    input_mask = np.ones_like(token_ids)
    valid_mask = item['valid_mask']
    contact_map = np.less(squareform(pdist(item['tertiary'])), 8.0).astype(np.int64)

    yind, xind = np.indices(contact_map.shape)
    invalid_mask = ~(valid_mask[:, None] & valid_mask[None, :])
    invalid_mask |= np.abs(yind - xind) < 6
    contact_map[invalid_mask] = -1

    return contact_map, squareform(pdist(item['tertiary']))

In [65]:
def process(task, split):
    Path(f'./output/{task}').mkdir(exist_ok=True)
    lmdbfile = f'./data/{task}/{task}_{split}.lmdb'
    fastafile = f'./output/{task}/{task}_{split}'
    dataset = LMDBDataset(lmdbfile)

    if not fastafile.endswith('.fasta'):
        fastafile += '.fasta'

    data = list()
    task_specific = list()
    id_fill = math.ceil(math.log10(len(dataset)))
    for i, element in enumerate(tqdm(dataset)):
        id_ = element.get('id', str(i).zfill(id_fill))
        print(id_)
        if isinstance(id_, bytes):
            id_ = id_.decode()
    
        primary = element['primary']
        seq = Seq(primary)
        print(element)
        data.append(element)

    with open(fastafile, 'w') as f:
        for idx, i in enumerate(data):
            seq = i['primary']
            if task == 'fluorescence':
                target = float(i['log_fluorescence'][0])
            elif task == 'stability':
                target = float(i['stability_score'][0])
            elif task == 'remote_homology':
                target = i['fold_label']
            elif task == 'proteinnet':
                target, dist_mat = calc(i)
            elif task == 'secondary_structure':
                target = np.asarray(i['ss3'], np.int64)
                target = np.pad(target, (1, 1), 'constant', constant_values=-1)
            task_specific.append([seq, target])
            # f.write(f">{str(idx).zfill(id_fill)}_{target}\n{seq}\n")
            f.write(f">{str(idx).zfill(id_fill)}\n{seq}\n")
    np.save(f'./output/labels/{task}_{split}', data)
    return task_specific, data

In [66]:
data, _ = process('fluorescence', 'test')

  2%|████▊                                                                                                                                                                                                                       | 589/27217 [00:00<00:09, 2908.31it/s]

0
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRDEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYDSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPVGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGTDEPYK', 'protein_length': 237, 'log_fluorescence': array([1.3010312], dtype=float32), 'num_mutations': 5, 'id': '0'}
1
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRDEVKFEGDTLVNRIELKGIDFKEDGNILGHKLENNYNSLNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLGFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3011891], dtype=float32), 'num_mutations': 4, 'id': '1'}
2
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRGEVKFEGDTLVNRIELKGIDFKEDGNILGHMLEYNYNSHNVYIMADKQKNGIKVNFKICHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDEPYK', 'protein_length':

  5%|██████████▏                                                                                                                                                                                                                | 1262/27217 [00:00<00:08, 3122.37it/s]


591
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDGGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADEQKNGIKVNFKIRHNIEDGSVQLADHYQQNFPIGDGPVLLPDNHYLCTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.4242625], dtype=float32), 'num_mutations': 4, 'id': '591'}
592
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDGGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQENGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDDHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDALYK', 'protein_length': 237, 'log_fluorescence': array([3.2611382], dtype=float32), 'num_mutations': 4, 'id': '592'}
593
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDGGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKQEYNYNSHSVYIMADKQKNGIKVNFKIRHNIGDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLPELVTAAGITHGMDELYK', 'prote

  7%|███████████████▋                                                                                                                                                                                                           | 1944/27217 [00:00<00:07, 3261.57it/s]

1270
{'primary': 'SKGEELFTGVVPILVELDGGVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIVFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIGDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGVTHGMDELCK', 'protein_length': 237, 'log_fluorescence': array([1.7027341], dtype=float32), 'num_mutations': 5, 'id': '1270'}
1271
{'primary': 'SKGEELFTGVVPILVELDGGVNGHKFSVSGEGEGEATYGKPTLKFICTTGKLPVPWPTLVATLSYDVQCFSRDPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSLNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDDHYLSTQSALGKDPNEERDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3011609], dtype=float32), 'num_mutations': 10, 'id': '1271'}
1272
{'primary': 'SKGEELFTGVVPILVELDGGVNGHKFSVSGEGEGGATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYTMADKQKNGTKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLESVTAAGITHGMDELYK', '

 10%|█████████████████████                                                                                                                                                                                                      | 2619/27217 [00:00<00:07, 3318.88it/s]

1962
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGNATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSCYPDHMKQHDFFKSAMPEGYVRERTIFLKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILEHKLGYNDNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.4497787], dtype=float32), 'num_mutations': 7, 'id': '1962'}
1963
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGNAAYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGSYKTRAEVKFEGDTLVNRIVLKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSLQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.7264707], dtype=float32), 'num_mutations': 5, 'id': '1963'}
1964
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGNASYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIVLKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQENGIKVNFKIRHDIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'p

 12%|██████████████████████████▍                                                                                                                                                                                                | 3286/27217 [00:01<00:07, 3332.88it/s]

2648
{'primary': 'SKGEELFTGVVPILVGLDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMMQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSRNVYIMADKQKNGIKVNFKIRHNIEDGSAQLADHYQQNTPIGDGPVLLPGNHYLSTQSALSRDPNEKRDHMVQLEFVTAAGITHGMDVLYK', 'protein_length': 237, 'log_fluorescence': array([1.6371038], dtype=float32), 'num_mutations': 8, 'id': '2648'}
2649
{'primary': 'SKGEELFTGVVPILVGLDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMRQHDFFKSAMPEGYVQERTIFFKGDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDASVQLADHYQQNTPIGDGPVLLPDDHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.1891537], dtype=float32), 'num_mutations': 5, 'id': '2649'}
2650
{'primary': 'SKGEELFTGVVPILVGLDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMRQHDFFKSAMPEGYVQERTILFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGKDELCK', 'p

 14%|███████████████████████████████▋                                                                                                                                                                                           | 3937/27217 [00:01<00:07, 3278.53it/s]

3325
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGGGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDLFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGFDFKEDGNILGHKLEYNYNSHNAYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.4425563], dtype=float32), 'num_mutations': 4, 'id': '3325'}
3326
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGGGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFSKSAMPEGYVQERTIFFKDDGNYKTRAEVKLEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMALLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.5806376], dtype=float32), 'num_mutations': 4, 'id': '3326'}
3327
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGGGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFSKSAMPEGYDQERTIFFKDDGNYKTRAEVKLEGDTLVNRIELKGIDFKEDGSILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALCEDPNEKRDHMVLLEFVTAAGITHGMDEPYK', 'p

 17%|████████████████████████████████████▉                                                                                                                                                                                      | 4588/27217 [00:01<00:06, 3248.48it/s]

{'primary': 'SKGGELFTGVEPILVELDGDVNGHEFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKYEGDTLVIRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSEDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3010637], dtype=float32), 'num_mutations': 6, 'id': '3979'}
3980
{'primary': 'SKGGELFTGVEPILVEQDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGDYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.5741676], dtype=float32), 'num_mutations': 4, 'id': '3980'}
3981
{'primary': 'SKGGELFTGVGPILVELDGDVNGHKFSVSGGGEGDATYGRLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDGGNYKTRAEVKFEGDTLVNRIELKGIVFKEDGNILGHKLEYNYNSHNVYIMAGKQSGIKVNFKIRLNIEDGSVQLADHYQQSTPIGDGPVLLPGNHYLSTQSALSIDPNEKRDHMVLLEFVTAAGITHGMDELDK', 'protein

 19%|██████████████████████████████████████████▏                                                                                                                                                                                | 5242/27217 [00:01<00:06, 3258.85it/s]

{'primary': 'SKGEGLFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGNVQERTIFFKDDGNYRTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHRLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPALLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3010302], dtype=float32), 'num_mutations': 5, 'id': '4631'}
4632
{'primary': 'SKGEKLFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFISTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFRSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTHIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.30107], dtype=float32), 'num_mutations': 4, 'id': '4632'}
4633
{'primary': 'SKGEKLFTGVVPILVELGGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLPYGVQCFSRYPGHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVDFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGIAHGMDELYK', 'protein_

 22%|███████████████████████████████████████████████▋                                                                                                                                                                           | 5919/27217 [00:01<00:06, 3315.34it/s]

{'primary': 'SKGEELFTGVVPILVELDGDVNGHKLSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMSEGYAQKRTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.2927632], dtype=float32), 'num_mutations': 4, 'id': '5302'}
5303
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKLSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMSEGFVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNTLGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGGVQLADHYQQNTPIGDGPVLLPDNHYPSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.568573], dtype=float32), 'num_mutations': 6, 'id': '5303'}
5304
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKLSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVLCFSRYPDHTKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVPLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHSMDELYK', 'protein

 24%|█████████████████████████████████████████████████████                                                                                                                                                                      | 6589/27217 [00:02<00:06, 3279.78it/s]

{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKSICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSATPEGYVQERTIFFKDDGNYKTRAEVEFEGDTLVNRIELKGIDFNEDGNILGHKLEYDYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHHLSTQSALSKDPNEKRDHVVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3010323], dtype=float32), 'num_mutations': 7, 'id': '5984'}
5985
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKSICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAVPEGYVQERTIFFKDNGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPSEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.5621992], dtype=float32), 'num_mutations': 4, 'id': '5985'}
5986
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKSICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGYYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNSKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKGPNEKRDHMVLLEFVTAAGITRGMDELYK', 'protei

 27%|██████████████████████████████████████████████████████████▎                                                                                                                                                                | 7245/27217 [00:02<00:06, 3257.26it/s]

6646
{'primary': 'SKGEELLTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCLSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIGFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALGKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.6240873], dtype=float32), 'num_mutations': 4, 'id': '6646'}
6647
{'primary': 'SKGEELLTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCLSRYPDHMKQHDFIKSAMPEGCVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYRQNTPIGDGPVLLPDNHYLSTQSAPSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.6101786], dtype=float32), 'num_mutations': 6, 'id': '6647'}
6648
{'primary': 'SKGEELLTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCLSRYPDYMKQHDFFKSAMPEGYVQERTIFSKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYRSTQSALSKDPNEKRDHMVLLEFVTAAGITHGVDELYK', 'p

 29%|███████████████████████████████████████████████████████████████▌                                                                                                                                                           | 7895/27217 [00:02<00:05, 3234.60it/s]

{'primary': 'SKGEELSTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGDYKTRAEVEFEGDTLVNRIELKGIYFKEDGNILGHKLEYNYNSHNVYTMADKQKDGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3071696], dtype=float32), 'num_mutations': 6, 'id': '7311'}
7312
{'primary': 'SKGEELSTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGSILGHKLEYNYNGHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPTGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.6018454], dtype=float32), 'num_mutations': 4, 'id': '7312'}
7313
{'primary': 'SKGEELSTGVVPILVELDGDVDGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYRTRAEVKFEGDSLVNRIELKGIDFKEGGNILGHKLEYNYNSHNVYIMADKRKNGIEVNFKIRHNIEDGSVQLADHYQQNTPFGDGPVLRPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protei

 31%|████████████████████████████████████████████████████████████████████▉                                                                                                                                                      | 8565/27217 [00:02<00:05, 3246.78it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 44%|███████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                                          | 11919/27217 [00:03<00:04, 3349.13it/s]

11333
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTVFFKDDGNYKTRAEVKFGGDTLVNRIELKGIDFKEDGNIPGHKLEYNYNSHNVYIMADKQKSGIKVNFKIRLNIEDGSVQLADHYQQNTPIGDGPVLLPDIHYLSTQSALSKDPNEKRDHMVLLEFVAAVGITHGMDGLYK', 'protein_length': 237, 'log_fluorescence': array([1.3010308], dtype=float32), 'num_mutations': 9, 'id': '11333'}
11334
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTVFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKADGNILGHKLEYNYNSHSVYIMADKPRNGIKVNFEIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.6167307], dtype=float32), 'num_mutations': 6, 'id': '11334'}
11335
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTVFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDSKEDGNILGHKLEYSYNSHSVYIMADKQMNGIKVNFEIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

 46%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                                     | 12601/27217 [00:03<00:04, 3366.25it/s]

12011
{'primary': 'SEGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVEFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKIGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKGPNEKRDRMVLLEFVTAAGITHGMDELFK', 'protein_length': 237, 'log_fluorescence': array([1.4701732], dtype=float32), 'num_mutations': 6, 'id': '12011'}
12012
{'primary': 'SEGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFEEDGNILGHKLEYNYNSHNVYIMADKLKNGIKVNFKIRHDIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEYVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.7000902], dtype=float32), 'num_mutations': 5, 'id': '12012'}
12013
{'primary': 'SEGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQRNGIKVNFKIRHNIEDGNVQPADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

 49%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                                               | 13298/27217 [00:04<00:04, 3425.42it/s]

12702
{'primary': 'SRGEELFTGVMPILVELDGDVNGHKFSVSGEGEGDATYGKLTLRFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAVPEGYVQERTIFSKDDGDYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHVVLLEFVTAAGITHGTDELYK', 'protein_length': 237, 'log_fluorescence': array([1.5518564], dtype=float32), 'num_mutations': 8, 'id': '12702'}
12703
{'primary': 'SRGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAELKFEGDALVNRIELKGIDFKEDGNILGHKLEYNYNSHSVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHRVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.4746188], dtype=float32), 'num_mutations': 5, 'id': '12703'}
12704
{'primary': 'SRGEELFTGVVPILAELDGDVNGHKFSVSGEGEGDATYGKLTLKFIRTTGKLPAPWPTLVTTLSYGVQCFSRYPDHMKQHDLFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

 51%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                                          | 13981/27217 [00:04<00:03, 3338.61it/s]

13397
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGELTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFEDDGNYKTRAEVKFEGDTLVNRVELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKTRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.4246664], dtype=float32), 'num_mutations': 4, 'id': '13397'}
13398
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGELTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFRDDGNYETRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.2339222], dtype=float32), 'num_mutations': 4, 'id': '13398'}
13399
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGELTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIEQKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNNEDGSVQLADHYQQNTPIGDGPVLLPDNRYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

 54%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                    | 14667/27217 [00:04<00:03, 3375.85it/s]

14060
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLRFICTTGKLPVPWPTLVTTLSYGVQCLSRYPDHMMQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYPSTQSALSKDPNEKRDHMVLLEFVTAAGTTHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.5516992], dtype=float32), 'num_mutations': 5, 'id': '14060'}
14061
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLRFICTTGKLPVPWPTLVTTLSYGVQCSSRYPDHMKQHDFFKSAMPGGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADEQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLGFVTAAGITRGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.30103], dtype=float32), 'num_mutations': 6, 'id': '14061'}
14062
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLRFICTTGKLPVPWPTLVTTLSYGVQCSSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQRNGIEVNFKIRHNIEDGSVQLADHYLQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK',

 56%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                               | 15362/27217 [00:04<00:03, 3366.12it/s]

14757
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMRQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAVVKFEGDTLVNRIELKGIDFKGDGNILAHKLEYNYNSRNVYIMADKQKNGIKVYFKIRHNIEDGSLQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.1540048], dtype=float32), 'num_mutations': 7, 'id': '14757'}
14758
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMRQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKKDGNILGHKLEYNYNSHNVYIMADEQKNGTKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMILLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.6912746], dtype=float32), 'num_mutations': 5, 'id': '14758'}
14759
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMRQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLGYNYNSHNVYIMADKQKNGIKVDFKIRHNIEDGSVQLAYHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVAAAGITHGMDELYK

 59%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                         | 16054/27217 [00:04<00:03, 3407.07it/s]

15448
{'primary': 'SKGEELFTGVVPIQVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHSIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK', 'protein_length': 236, 'log_fluorescence': array([3.1740057], dtype=float32), 'num_mutations': 4, 'id': '15448'}
15449
{'primary': 'SKGEELFTGVVPIQVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELRGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIENGSVQLADHYQQNTPVGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.6547544], dtype=float32), 'num_mutations': 4, 'id': '15449'}
15450
{'primary': 'SKGEELFTGVVPIQVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELRGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKFRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEYVTAAGITHGKDELYK'

 61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                    | 16729/27217 [00:05<00:03, 3303.88it/s]

16145
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKRTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFSKSAMPEGYVQERTIFFKDDGDYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVDFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.30103], dtype=float32), 'num_mutations': 4, 'id': '16145'}
16146
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKRTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFSKSAMPEGYVQERTIFFKDDGNYKTHAEVKFEGDTLVNRIEPKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3012462], dtype=float32), 'num_mutations': 4, 'id': '16146'}
16147
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKRTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTICFKDDGNYKTRAEVKFEGDTLVNRTELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQSTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLESVTAAGITNGKDELYK',

 64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                              | 17388/27217 [00:05<00:03, 3198.39it/s]

16796
{'primary': 'SKGEEPFTGVVPILVELDDDVNGHRFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLGTTLSYGVQCFSRYPDHMKQHDFFKPAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVIRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNLKIRHYIEDGSVRLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.4684049], dtype=float32), 'num_mutations': 9, 'id': '16796'}
16797
{'primary': 'SKGEEPFTGVVPILVELDGDVNGHKFSVSDEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIRVNFKIRHNIGDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3046042], dtype=float32), 'num_mutations': 4, 'id': '16797'}
16798
{'primary': 'SKGEEPFTGVVPILVELDGDVNGHKFSVSGEGEGDATYDKLTLKFFCTTGKLPVPWPALVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFLKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

 66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 18072/27217 [00:05<00:02, 3305.89it/s]

17427
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHLKQHGFFKSATPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDSHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.8561019], dtype=float32), 'num_mutations': 4, 'id': '17427'}
17428
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHLKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAVVKFEGDTLVNRTELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPTGDGPVLLPDNHYLSTQSAQSKDPNEKRDHMVLLEFVTAAGTTHGTDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3010299], dtype=float32), 'num_mutations': 7, 'id': '17428'}
17429
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHLKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIGLKGIDFKEDGSILGHKLEYNYNSHNVYIKADKQKNGIKVNFKTRHNIEDGSVQLADHHQQNTPIGDGPVLLPDNHYLSTQTALSKDPNEESDHMVLLEFVTAAGITHGMDELYK

 69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                   | 18760/27217 [00:05<00:02, 3370.53it/s]

18118
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYYSHNVYIVADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYQSTQSALSKDPNEKRDHIVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.6752479], dtype=float32), 'num_mutations': 4, 'id': '18118'}
18119
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYYSHNVYIMADKQKNGIKVDFKIRHNIGDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITPGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.6362205], dtype=float32), 'num_mutations': 4, 'id': '18119'}
18120
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHDVYFMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDRPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDVLHK

 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 19447/27217 [00:05<00:02, 3403.69it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 22884/27217 [00:06<00:01, 3417.19it/s]

22235
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPSLVTTLSYGVQCFSRYPDHMKQHDFFRSAMPEGYVQERTNFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDLKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIGDGSVQLADHYQQNTPIGDGPVLLPDDHYLSTQSALSKDPNEKRDHTVLLVFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.4526858], dtype=float32), 'num_mutations': 8, 'id': '22235'}
22236
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPSLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFEDDGNHKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPGLLPDNHYLSTQSALSKDPNEKRDHMVLLGFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.5667464], dtype=float32), 'num_mutations': 5, 'id': '22236'}
22237
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPSLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFNDDGNYKTRAEVKFEGDTLVSRIELKGIDFKEDGNILGHKLEYNYNSHYVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNYYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 23568/27217 [00:07<00:01, 3354.04it/s]


22927
{'primary': 'SKGEELFAGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVRCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLQPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDEPYK', 'protein_length': 237, 'log_fluorescence': array([1.5049438], dtype=float32), 'num_mutations': 4, 'id': '22927'}
22928
{'primary': 'SKGEELFAGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVRCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYDYHSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYPSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3010302], dtype=float32), 'num_mutations': 5, 'id': '22928'}
22929
{'primary': 'SKGEELFAGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFVGDTLVNRIELKGIDFKEDGNVLGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYQSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELY

 89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 24246/27217 [00:07<00:00, 3356.20it/s]

23597
{'primary': 'SKGEELFTGVAPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVATLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKYGIKVNFKIRHNIEDGSVQLADHYRQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3068138], dtype=float32), 'num_mutations': 4, 'id': '23597'}
23598
{'primary': 'SKGEELFTGVAPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVATLSYGVQCFSRHPDHMKQHDFFKTATPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHIIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3015873], dtype=float32), 'num_mutations': 6, 'id': '23598'}
23599
{'primary': 'SKGEELFTGVAPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVSTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKLEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLELVTAAGITHGMDELYK

 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 24921/27217 [00:07<00:00, 3354.05it/s]

24282
{'primary': 'SKGEELFTGVVPILIELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCSSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKRKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMALLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.8573325], dtype=float32), 'num_mutations': 4, 'id': '24282'}
24283
{'primary': 'SKGEELFTGVVPILIELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDLFKSAMPEGYVQERTIFFKGDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNIPGHKLEYNYNSHNVYIMPDKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.6574355], dtype=float32), 'num_mutations': 5, 'id': '24283'}
24284
{'primary': 'SKGEELFTGVVPILIELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTICFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQRNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMGELYK

 94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 25606/27217 [00:07<00:00, 3386.86it/s]

24963
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPAPWPTLVTTLSYGVQCISRYPDHMKQHDFFKSAMPEGYVQERTFFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPNGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVAAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.3692381], dtype=float32), 'num_mutations': 5, 'id': '24963'}
24964
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPAPWPTLVTTLSYGVQCLSRYPDHMKQHDFFKSAMPEGYVQERTIFSMDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYILADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.7426381], dtype=float32), 'num_mutations': 5, 'id': '24964'}
24965
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPAPWPTLVTTLSYGVQCLSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYMTRAEAKFEGDALVNRIELKSIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGISHGVDELYK

 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 25945/27217 [00:07<00:00, 3228.11it/s]

25659
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYIQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHYIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGTTRGMDELYK', 'protein_length': 237, 'log_fluorescence': array([3.5666826], dtype=float32), 'num_mutations': 4, 'id': '25659'}
25660
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYIQEHTIFFKDDGNYMTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIVADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPALLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.5912938], dtype=float32), 'num_mutations': 5, 'id': '25660'}
25661
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYIQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNNNSHNVYIMAGKQKNGIKVYFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK

 98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 26582/27217 [00:08<00:00, 3082.48it/s]

26223
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPRPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGFGFKEDGNILGHKLEYNYNSHNVYIMADKQKDGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITRGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.30103], dtype=float32), 'num_mutations': 5, 'id': '26223'}
26224
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPRPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTNFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQRNTPIGDGPVLLPDNQYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.7292032], dtype=float32), 'num_mutations': 4, 'id': '26224'}
26225
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPRPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTTFFKDDGNYKTRAEVKFEGDTLVNRIELEGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQAALSKDPNEKRDHMVLLEFATAAGITHGMDELYK',

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27217/27217 [00:08<00:00, 3307.52it/s]


26882
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSDGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVEFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNNHNVYIMADKQKDGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 237, 'log_fluorescence': array([1.2978274], dtype=float32), 'num_mutations': 4, 'id': '26882'}
26883
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSDGVQCFSRYPDHMKQHDFFSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRTELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKENFKIRHDIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK', 'protein_length': 236, 'log_fluorescence': array([1.7108358], dtype=float32), 'num_mutations': 5, 'id': '26883'}
26884
{'primary': 'SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSDGVQCFSRYPDHMKQHDFFKSAMPEGYVRERTIFFKDDGNYKTRAEVKFEGDTLVNRIEPKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLGTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK'

In [62]:
data

[['SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRDEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYDSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPVGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGTDEPYK',
  1.301031231880188],
 ['SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRDEVKFEGDTLVNRIELKGIDFKEDGNILGHKLENNYNSLNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLGFVTAAGITHGMDELYK',
  1.3011890649795532],
 ['SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRGEVKFEGDTLVNRIELKGIDFKEDGNILGHMLEYNYNSHNVYIMADKQKNGIKVNFKICHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDEPYK',
  3.582764148712158],
 ['SKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRGEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADEQKNGIKVNSKIRHNIEDGSVQLADHYQQNTPIGDGPVQPPDNHYLS

In [59]:
tasks = ['fluorescence', 'stability', 'proteinnet']
splits = ['train', 'valid', 'test']
for t in tasks:
    for s in splits:
        data, _ = process(t, s)
for s in ['train', 'valid', 'casp12', 'ts115', 'cb513']:
    data, _ = process('secondary_structure', s)

for s in ['train', 'valid', 'test_fold_holdout', 'test_family_holdout', 'test_superfamily_holdout']:
    data, _ = process('remote_homology', s)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21446/21446 [00:00<00:00, 107303.01it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5362/5362 [00:00<00:00, 106995.72it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27217/27217 [00:00<00:00, 111533.22it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

KeyboardInterrupt: 

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8678/8678 [01:06<00:00, 130.12it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2170/2170 [00:00<00:00, 27993.93it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 6652.60it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12312/12312 [00:42<00:00, 288.63it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 736/736 [00:00<00:00, 41117.34it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 718/718 [00:00<00:00, 45992.00it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████