In [17]:
import sqlite3
import pandas as pd
from tqdm import tqdm
import numpy as np
from tokenizers import Tokenizer
import json
from EfficientBPE.vocabulary_functions import get_mutated, get_parents, set_difference, set_intersection

## Load Tokenizers

In [161]:
# 'dataset': {'uniref50', 'uniref90'}
# 'is_pretokenizer': {True, False}
# 'subs_matrix': {'blosum45', 'blosum62', 'pam70', 'pam250'}
# 'mutation_cutoff': {0.7, 0.8, 0.9}
# 'min_mutation_freq': {0, 0.05, 0.005}
# 'min_mutation_len': {3}
# 'max_mutation_len': {12}
# 'vocab_size': list=[800, 1600, 3200, 6400, 12800, 25600, 51200]

# Vocabulary sizes loaded for every tokenizer configuration below.
vocab_sizes = [800, 3200, 12800]
# UniRef clustering level; used in the tokenizer dataset name and in DB table names.
uniref_id = "50"

# Options shared by all mutation-aware (mutBPE) configurations enabled in this run.
_mut_defaults = {
    'is_mut': True,
    'dataset': f'uniref{uniref_id}',
    'is_pretokenizer': False,
    'mutation_cutoff': 0.7,
    'min_mutation_freq': 0.05,
    'min_mutation_len': 3,
    'max_mutation_len': 12,
    'vocab_size': vocab_sizes,
}

tokenizer_opts_list = [
    # Disabled configurations, kept for reference (uncomment to load them too):
    # {'is_mut': False, 'dataset': f'uniref{uniref_id}', 'is_pretokenizer': False, 'vocab_size': vocab_sizes},
    # {'is_mut': False, 'dataset': f'uniref{uniref_id}', 'is_pretokenizer': True, 'vocab_size': vocab_sizes},
    # {**_mut_defaults, 'subs_matrix': 'blosum62', 'min_mutation_freq': 0},
    # {**_mut_defaults, 'subs_matrix': 'blosum62', 'mutation_cutoff': 0.9, 'min_mutation_freq': 0},
    {**_mut_defaults, 'subs_matrix': 'blosum62'},
    {**_mut_defaults, 'subs_matrix': 'pam70'},
    {**_mut_defaults, 'subs_matrix': 'blosum62', 'is_pretokenizer': True},
]

In [162]:
def generate_tokenizer_name(tokenizer_opts, vocab_size):
    """Build the display label used as dictionary key / column name for a tokenizer.

    Standard BPE:  "stdBPE[ pre] <vocab_size>"
    Mutation BPE:  "mutBPE[ pre] <subs_matrix> <mutation_cutoff> <min_mutation_freq> <vocab_size>"

    The " pre" tag is present when ``tokenizer_opts['is_pretokenizer']`` is truthy.
    """
    if not tokenizer_opts['is_mut']:
        pre_tag = ' pre' if tokenizer_opts['is_pretokenizer'] else ''
        return "stdBPE" + pre_tag + f" {vocab_size}"

    fields = [
        "mutBPE" + (' pre' if tokenizer_opts['is_pretokenizer'] else ''),
        tokenizer_opts['subs_matrix'],
        tokenizer_opts['mutation_cutoff'],
        tokenizer_opts['min_mutation_freq'],
        vocab_size,
    ]
    # f-string formatting keeps number rendering identical to a single f-string.
    return " ".join(f"{field}" for field in fields)

def generate_tokenizer_filename(tokenizer_opts, vocab_size):
    """Build the file stem (no folder, no extension) of a stored tokenizer.

    Standard BPE:  "<dataset>[pre]_bpe_<vocab_size>"
    Mutation BPE:  "<dataset>[pre]_mutbpe_<cutoff>_<min_len>_<max_len>_<min_freq>_<vocab_size>"

    The substitution matrix is not part of the stem.
    """
    is_mut = tokenizer_opts['is_mut']
    stem = f"{tokenizer_opts['dataset']}" + ('pre' if tokenizer_opts['is_pretokenizer'] else '')
    if not is_mut:
        return f"{stem}_bpe_{vocab_size}"

    suffix_fields = (
        tokenizer_opts['mutation_cutoff'],
        tokenizer_opts['min_mutation_len'],
        tokenizer_opts['max_mutation_len'],
        tokenizer_opts['min_mutation_freq'],
        vocab_size,
    )
    return f"{stem}_mutbpe_" + "_".join(f"{field}" for field in suffix_fields)


def load_tokenizer(tokenizer_opts, folder_path = "/cta/share/users/mutbpe/tokenizers", hf_or_vocab = 'hf'):
    """Load every vocabulary size of one tokenizer configuration.

    Args:
        tokenizer_opts: configuration dict; must contain 'is_mut' and 'vocab_size'
            (an iterable of sizes), plus whatever the name/filename builders read.
            'subs_matrix' is only read when 'is_mut' is truthy.
        folder_path: root folder that contains one sub-folder per substitution matrix.
        hf_or_vocab: 'hf' loads "hf_<file>.json" through ``Tokenizer.from_file``;
            any other value loads "<file>.json" as plain JSON.

    Returns:
        dict mapping the display name of each tokenizer to the loaded object
        (a ``Tokenizer`` in 'hf' mode, the parsed JSON otherwise).
    """
    # Standard (non-mutation) tokenizers are read from the 'blosum62' sub-folder.
    subfolder = tokenizer_opts['subs_matrix'] if tokenizer_opts['is_mut'] else 'blosum62'

    loaded = {}
    for vocab_size in tokenizer_opts['vocab_size']:
        tokenizer_name = generate_tokenizer_name(tokenizer_opts, vocab_size)
        file_name = generate_tokenizer_filename(tokenizer_opts, vocab_size)
        if hf_or_vocab == 'hf':
            file_path = f"{folder_path}/{subfolder}/hf_{file_name}"
            loaded[tokenizer_name] = Tokenizer.from_file(f"{file_path}.json")
        else:
            file_path = f"{folder_path}/{subfolder}/{file_name}"
            with open(f"{file_path}.json") as json_file:
                loaded[tokenizer_name] = json.load(json_file)

    return loaded

def load_tokenizers(tokenizer_opts_list, hf_or_vocab = 'hf'):
    """Load all configurations and merge them into one name -> object dict.

    Each configuration is loaded with ``load_tokenizer`` (default folder) in the
    given ``hf_or_vocab`` mode. If two configurations produce the same display
    name, the later one replaces the earlier one.
    """
    return {
        name: loaded
        for opts in tokenizer_opts_list
        for name, loaded in load_tokenizer(opts, hf_or_vocab=hf_or_vocab).items()
    }

# Load every configuration twice: as HuggingFace Tokenizer objects ('hf')
# and as the raw JSON vocabulary files ('vocab'). Both dicts share the same keys.
tokenizer_list = load_tokenizers(tokenizer_opts_list, 'hf')
inner_vocab_list = load_tokenizers(tokenizer_opts_list, 'vocab')

# Token strings of each HuggingFace tokenizer (order is not meaningful: it comes from a set).
vocab_list = {}
for name, tokenizer in tokenizer_list.items():
    vocab_list[name] = list(set(tokenizer.get_vocab()))

## Read Datasets

In [163]:
# Connect to DB
db_file = "/cta/share/users/uniprot/human/human.db"
conn = sqlite3.connect(db_file)
try:
    # Proteins whose accession is listed in the uniref{uniref_id}_distilled table.
    # A table name cannot be passed as a bound SQL parameter, so the notebook-level
    # `uniref_id` constant is interpolated into the query text.
    df_protein = pd.read_sql(f"""SELECT Entry as uniprot_id, Sequence as sequence
                          FROM proteins
                          WHERE Entry IN (SELECT uniprot_accession FROM uniref{uniref_id}_distilled)""", conn)
    # Keep only sequences shorter than 3000 residues.
    df_protein = df_protein[df_protein['sequence'].str.len() < 3000]

    # NOTE(review): if re-enabled, the second line below runs after the length filter
    # above, so its "> 3000" selection would always be empty — confirm the intent.
    # df_protein_pre = pd.read_sql(f"SELECT * FROM uniref{uniref_id}_domain_sliced_plddt70", conn)
    # df_protein_pre = df_protein_pre[~df_protein_pre['uniprot_id'].isin(df_protein[df_protein['sequence'].str.len() > 3000]['uniprot_id'].unique())]

    # InterPro entries of type 'domain' and TED entries with plddt >= 70, both
    # normalised to the columns (uniprot_id, source, start_index, end_index).
    df_interpro_domain = pd.read_sql("SELECT uniprot_id, interpro_id as source, start_index, end_index FROM interpro_entries_v2 WHERE type='domain'", conn)
    df_ted = pd.read_sql("SELECT uniprot_id, ted_id as source, start_index, end_index FROM ted_entries_summary WHERE plddt >= 70", conn)
finally:
    # Close the connection even when one of the reads above fails.
    conn.close()

In [181]:
# Stack the InterPro domain rows and the TED rows into one annotation table.
df_domains = pd.concat([df_interpro_domain, df_ted])

# Row masks by annotation origin: sources starting with "IPR" (InterPro) or "AF" (TED).
from_interpro = df_domains["source"].str.startswith("IPR")
from_ted = df_domains["source"].str.startswith("AF")

# Proteins that have at least one InterPro annotation.
interpro_ids = df_domains.loc[from_interpro, "uniprot_id"].unique()

# For those proteins, drop their TED rows.
covered_by_interpro = df_domains["uniprot_id"].isin(interpro_ids)
df_domains = df_domains[~(covered_by_interpro & from_ted)]

# Attach the full protein sequence; the inner join keeps only proteins present in both tables.
proteins_by_id = df_protein.set_index('uniprot_id')
domains_by_id = df_domains.set_index('uniprot_id')
df_domains = proteins_by_id.join(domains_by_id, how='inner').reset_index()


def _domain_slice(row):
    """Return sequence[start_index-1:end_index], i.e. coordinates read as 1-based, end inclusive."""
    return row['sequence'][row['start_index'] - 1: row['end_index']]


df_domains['domain_sequence'] = df_domains.apply(_domain_slice, axis=1)

# Keep rows whose slice is non-empty and whose source is an InterPro entry.
non_empty = df_domains['domain_sequence'].str.len() > 0
is_interpro = df_domains['source'].str.startswith('IPR')  # just keep interpro entries
df_domains = df_domains[non_empty & is_interpro]

df_domains = df_domains.loc[:, ['uniprot_id', 'source', 'domain_sequence']].reset_index(drop=True)

df_domains

Unnamed: 0,uniprot_id,source,domain_sequence
0,A0A087X296,IPR000742,PVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCT
1,A0A0K2S4Q6,IPR003599,PSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETGG...
2,A0A0K2S4Q6,IPR007110,PGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIW...
3,A0A0K2S4Q6,IPR013106,GPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETG...
4,A0A3B3ISZ0,IPR001206,AQVKKATVFLNPAACKGKARTLFEKNAAPILHLSGMDVTIVKTDYE...
...,...,...,...
99798,X6RHN7,IPR028889,KGLSNEPGQNSCFLNSALQVLWHLDIFRRSFRQLTTHKCMGDSCIF...
99799,X6RIL1,IPR005302,RPRRPHQIADLFRPKDQIAYSDTSPFLILSEASLADLNSRLEKKVK...
99800,X6RK39,IPR025946,LPRVLRVCSGVYFEGSIYEISGNECCLSTGDLIKVTQVRLQKVVCE...
99801,X6RK39,IPR025946,ILEVPEGRPIFLSPWVGSLQKGQRLCVYGLASPPWRVLASSKGRKV...


In [182]:
# Add one column per tokenizer holding the token strings of every domain sequence.
for name, tokenizer in tqdm(list(tokenizer_list.items())):
    encodings = tokenizer.encode_batch(df_domains['domain_sequence'])
    df_domains[name] = [encoding.tokens for encoding in encodings]

100%|██████████| 9/9 [00:11<00:00,  1.25s/it]


In [183]:
# Preview the first rows, now including one token-list column per tokenizer.
df_domains.head()

Unnamed: 0,uniprot_id,source,domain_sequence,mutBPE blosum62 0.7 0.05 800,mutBPE blosum62 0.7 0.05 3200,mutBPE blosum62 0.7 0.05 12800,mutBPE pam70 0.7 0.05 800,mutBPE pam70 0.7 0.05 3200,mutBPE pam70 0.7 0.05 12800,mutBPE pre blosum62 0.7 0.05 800,mutBPE pre blosum62 0.7 0.05 3200,mutBPE pre blosum62 0.7 0.05 12800
0,A0A087X296,IPR000742,PVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCT,"[PV, N, PC, C, YY, PC, QH, QG, I, CV, RF, GL, ...","[PV, N, PC, C, YY, PC, QH, QGI, CV, RF, GLD, R...","[PV, NPC, C, YY, PC, QH, QGI, CV, RF, GLD, RY,...","[PV, N, PC, C, YY, PC, QH, QG, I, CV, RF, GL, ...","[PV, N, PC, C, YY, PC, QH, QGI, CV, RF, GLD, R...","[PV, NPC, CYY, PC, QH, QGI, CV, RF, GLD, RY, Q...","[PV, N, PC, C, YY, PC, QH, QG, I, CV, RF, GL, ...","[PV, N, PC, C, YY, PC, QH, QGI, CV, RF, GLD, R...","[PV, NPC, C, YY, PC, QH, QGI, CV, RF, GLD, RY,..."
1,A0A0K2S4Q6,IPR003599,PSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETGG...,"[PST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, KT,...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, V, MG, AV, G, ESL, SV, QC, RY, EE, KY, K...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, KT,...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F..."
2,A0A0K2S4Q6,IPR007110,PGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIW...,"[PG, CL, TV, SG, PST, V, MG, AVG, ESL, SV, QC,...","[PG, CL, TV, SG, PST, VMG, AVG, ESL, SV, QC, R...","[PG, CL, TVSG, PST, VMG, AVG, ESL, SV, QC, RY,...","[PG, CL, TV, SG, PST, V, MG, AV, G, ESL, SV, Q...","[PG, CL, TV, SG, PST, VMG, AVG, ESL, SV, QC, R...","[PG, CL, TVSG, PST, VMG, AVG, ESL, SV, QC, RY,...","[PG, CL, TV, SG, PST, V, MG, AVG, ESL, SV, QC,...","[PG, CL, TV, SG, PST, VMG, AVG, ESL, SV, QC, R...","[PG, CL, TVSG, PST, VMG, AVG, ESL, SV, QC, RY,..."
3,A0A0K2S4Q6,IPR013106,GPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETG...,"[GP, ST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, ...","[GP, STV, MG, AVG, ESL, SV, QC, RY, EE, KY, KT...","[GP, STV, MG, AVG, ESL, SV, QC, RY, EE, KY, KT...","[G, PST, V, MG, AV, G, ESL, SV, QC, RY, EE, KY...","[G, PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT...","[GPST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, ...","[GP, ST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, ...","[GP, STV, MG, AVG, ESL, SV, QC, RY, EE, KY, KT...","[GP, STV, MG, AVG, ESL, SV, QC, RY, EE, KY, KT..."
4,A0A3B3ISZ0,IPR001206,AQVKKATVFLNPAACKGKARTLFEKNAAPILHLSGMDVTIVKTDYE...,"[A, QV, KK, A, TV, FL, NP, AA, C, KG, KA, R, T...","[AQV, KK, ATV, FL, NP, AAC, KG, KA, RTL, F, EK...","[AQV, KK, ATV, FL, NP, AAC, KG, KA, RTLF, EK, ...","[A, QV, KK, ATV, FL, NP, AA, C, KG, KA, RTL, F...","[AQV, KK, ATV, FL, NP, AA, CKG, KA, RTL, FEK, ...","[AQV, KK, ATV, FL, NPAA, CKG, KA, RTL, FEK, NA...","[A, QV, KK, A, TV, FL, NP, AA, C, KG, KA, R, T...","[AQV, KK, ATV, FL, NP, AA, C, KG, KA, RTL, FEK...","[AQV, KK, ATV, FL, NP, AAC, KG, KA, RTL, FEK, ..."


In [184]:
# Per tokenizer name:
#   inner_vocab_parents_list -> result of get_parents (used below as a dict keyed by parent token)
#   inner_vocab_mutated_list -> result of get_mutated (used below as token -> {'parent': ...})
#   inner_vocab_family_list  -> every parent token mapped to a zero count
inner_vocab_parents_list = {}
inner_vocab_mutated_list = {}
inner_vocab_family_list = {}
for method_name, inner_vocab in inner_vocab_list.items():
    parents = get_parents(inner_vocab)
    inner_vocab_parents_list[method_name] = parents
    inner_vocab_mutated_list[method_name] = get_mutated(inner_vocab)
    inner_vocab_family_list[method_name] = dict.fromkeys(parents.keys(), 0)

In [185]:
def families_in_domains(row):
    """Count, per tokenizer, how often each token family occurs in one row.

    For every name in ``tokenizer_list`` the tokens in ``row[name]`` are scanned:
    a token that is a parent counts towards its own family, and a token that is
    a mutated variant counts towards its parent's family (a token that is both
    contributes to both). Only families that actually occur are recorded.

    Returns a dict "<tokenizer name> families" -> {family: count}.
    """
    result = {}
    for method_name in tokenizer_list:
        # Collect one family label per hit, in token order.
        family_hits = []
        for token in row[method_name]:
            if token in inner_vocab_parents_list[method_name]:
                family_hits.append(token)
            if token in inner_vocab_mutated_list[method_name]:
                family_hits.append(inner_vocab_mutated_list[method_name][token]['parent'])

        tally = {}
        for family in family_hits:
            tally[family] = tally.get(family, 0) + 1
        result[f"{method_name} families"] = tally
    return result

In [186]:
# One record per domain row: "<tokenizer name> families" -> {family: count}.
family_records = list(df_domains.apply(families_in_domains, axis=1))
df_domains_family_counts = pd.DataFrame.from_dict(family_records)

In [189]:
# Prepend the identifying columns (everything up to and including 'domain_sequence').
# Only the "... families" columns are taken from the right-hand frame, so re-running
# this cell does not stack a second copy of the identifying columns.
df_domains_family_counts = pd.concat(
    [
        df_domains.loc[:, :'domain_sequence'],
        df_domains_family_counts.filter(regex=r' families$'),
    ],
    axis=1,
)

In [202]:
# Show the first five rows and first five columns of the combined table.
df_domains_family_counts.iloc[:5, :5]

Unnamed: 0,uniprot_id,source,domain_sequence,mutBPE blosum62 0.7 0.05 800 families,mutBPE blosum62 0.7 0.05 3200 families
0,A0A087X296,IPR000742,PVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCT,{},"{'EGV': 1, 'SLD': 1}"
1,A0A0K2S4Q6,IPR003599,PSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETGG...,"{'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'..."
2,A0A0K2S4Q6,IPR007110,PGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIW...,"{'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'..."
3,A0A0K2S4Q6,IPR013106,GPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETG...,"{'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL': 1}","{'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1, 'EVV'..."
4,A0A3B3ISZ0,IPR001206,AQVKKATVFLNPAACKGKARTLFEKNAAPILHLSGMDVTIVKTDYE...,"{'SAA': 1, 'PLL': 1, 'SGL': 1, 'ELL': 1, 'SET'...","{'SQV': 1, 'STV': 1, 'AAC': 1, 'RTL': 1, 'SAA'..."


In [271]:
import pandas as pd
from collections import Counter

# Toy input DataFrame used to try out the group-wise dictionary aggregation:
# six rows, a 'source' label with repeats, and two columns of {family: count} dicts.
data = {
    'uniprot_id': ['A1', 'A2', 'A3', 'A4', 'A5', 'A6'],
    'source': ['IPR000742', 'IPR003599', 'IPR013106', 'IPR003599', 'IPR013106', 'IPR001206'],
    # Per-row family counts for the first toy method (the first row is deliberately empty).
    'method_1': [
        {},
        {'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1},
        {'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1},
        {'PSS': 1, 'AAG': 1, 'EAL': 1},
        {'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL': 1},
        {'SAA': 1, 'PLL': 1, 'SGL': 1, 'ELL': 1}
    ],
    # Per-row family counts for the second toy method.
    'method_2': [
        {'EGV': 1, 'SLD': 1},
        {'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1},
        {'PSS': 1, 'VVG': 1, 'EAL': 1},
        {'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1},
        {'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1},
        {'SQV': 1, 'STV': 1, 'AAC': 1, 'RTL': 1}
    ]
}
df = pd.DataFrame(data)

# Function to sum dictionaries
def sum_dicts(dicts):
    """Add up an iterable of {key: count} dicts key by key.

    Counts are accumulated in place (one pass over all items) instead of
    building a new Counter for every addition. Keys whose total is not
    positive are dropped, as Counter addition does. Key order is first-seen
    order for positive counts.

    Returns a plain dict.
    """
    totals = Counter()
    for counts in dicts:
        totals.update(counts)
    # Unary plus keeps only strictly positive totals.
    return dict(+totals)

# Function to sum and sort dictionaries
def sum_and_sort_dicts(dicts):
    """Add up an iterable of {key: count} dicts and order the result by count.

    Counts are accumulated in place (one pass over all items) instead of
    building a new Counter for every addition, which matters for groups with
    thousands of rows. Keys whose total is not positive are dropped, as
    Counter addition does.

    Returns a plain dict sorted by total in descending order; keys with equal
    totals keep first-seen order.
    """
    totals = Counter()
    for counts in dicts:
        totals.update(counts)
    # Unary plus keeps only strictly positive totals; sorted() is stable, so ties keep first-seen order.
    return dict(sorted((+totals).items(), key=lambda item: item[1], reverse=True))

# Merge the per-row family counts of each toy method within every 'source' group.
method_aggregators = {column: sum_and_sort_dicts for column in ('method_1', 'method_2')}
result = df.groupby('source').agg(method_aggregators).reset_index()

result


Unnamed: 0,source,method_1,method_2
0,IPR000742,{},"{'EGV': 1, 'SLD': 1}"
1,IPR001206,"{'SAA': 1, 'PLL': 1, 'SGL': 1, 'ELL': 1}","{'SQV': 1, 'STV': 1, 'AAC': 1, 'RTL': 1}"
2,IPR003599,"{'PSS': 2, 'AAG': 2, 'EAL': 2, 'SSG': 1}","{'PSS': 2, 'VVG': 2, 'AAG': 2, 'EAL': 2}"
3,IPR013106,"{'AAG': 2, 'EAL': 2, 'SSG': 2, 'PSS': 1, 'ENL'...","{'EAL': 2, 'PSS': 1, 'VVG': 1, 'STV': 1, 'AAG'..."


In [230]:
# Same aggregation, additionally counting the rows per 'source' ('source' stays as the index).
df.groupby('source').agg(
    {'uniprot_id': len, **{column: sum_and_sort_dicts for column in ('method_1', 'method_2')}}
)

Unnamed: 0_level_0,uniprot_id,method_1,method_2
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IPR000742,1,{},"{'EGV': 1, 'SLD': 1}"
IPR001206,1,"{'SAA': 1, 'PLL': 1, 'SGL': 1, 'ELL': 1}","{'SQV': 1, 'STV': 1, 'AAC': 1, 'RTL': 1}"
IPR003599,2,"{'PSS': 2, 'AAG': 2, 'EAL': 2, 'SSG': 1}","{'PSS': 2, 'VVG': 2, 'AAG': 2, 'EAL': 2}"
IPR013106,2,"{'AAG': 2, 'EAL': 2, 'SSG': 2, 'PSS': 1, 'ENL'...","{'EAL': 2, 'PSS': 1, 'VVG': 1, 'STV': 1, 'AAG'..."


In [211]:
# List the display names under which the tokenizers were loaded.
tokenizer_list.keys()

dict_keys(['mutBPE blosum62 0.7 0.05 800', 'mutBPE blosum62 0.7 0.05 3200', 'mutBPE blosum62 0.7 0.05 12800', 'mutBPE pam70 0.7 0.05 800', 'mutBPE pam70 0.7 0.05 3200', 'mutBPE pam70 0.7 0.05 12800', 'mutBPE pre blosum62 0.7 0.05 800', 'mutBPE pre blosum62 0.7 0.05 3200', 'mutBPE pre blosum62 0.7 0.05 12800'])

In [208]:
# Preview the per-domain family counts for every tokenizer.
df_domains_family_counts.head()

Unnamed: 0,uniprot_id,source,domain_sequence,mutBPE blosum62 0.7 0.05 800 families,mutBPE blosum62 0.7 0.05 3200 families,mutBPE blosum62 0.7 0.05 12800 families,mutBPE pam70 0.7 0.05 800 families,mutBPE pam70 0.7 0.05 3200 families,mutBPE pam70 0.7 0.05 12800 families,mutBPE pre blosum62 0.7 0.05 800 families,mutBPE pre blosum62 0.7 0.05 3200 families,mutBPE pre blosum62 0.7 0.05 12800 families
0,A0A087X296,IPR000742,PVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCT,{},"{'EGV': 1, 'SLD': 1}","{'EPC': 1, 'EGV': 1, 'SLD': 1, 'AGY': 1}",{},"{'QGV': 1, 'SLD': 1, 'CSR': 1, 'YSG': 1}","{'KPC': 1, 'CFY': 1, 'QGV': 1, 'SLD': 1, 'CSR'...",{},"{'EGV': 1, 'SLD': 1, 'YSG': 1}","{'EPC': 1, 'EGV': 1, 'SLD': 1, 'YSG': 1}"
1,A0A0K2S4Q6,IPR003599,PSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETGG...,"{'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'...","{'PSS': 1, 'EAL': 1, 'SCL': 1, 'SEG': 1, 'ENL'...","{'PSS': 1, 'VLG': 1, 'AVG': 1, 'EAL': 1, 'SCL'...","{'PSS': 1, 'VLG': 1, 'AVG': 1, 'EAL': 1, 'SCL'...","{'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'..."
2,A0A0K2S4Q6,IPR007110,PGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIW...,"{'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'...","{'TASG': 1, 'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL...","{'PSS': 1, 'EAL': 1, 'SCL': 1, 'SEG': 1, 'ENL'...","{'PSS': 1, 'VLG': 1, 'AVG': 1, 'EAL': 1, 'SCL'...","{'TVSS': 1, 'PSS': 1, 'VLG': 1, 'AVG': 1, 'EAL...","{'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'...","{'TVSS': 1, 'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL..."
3,A0A0K2S4Q6,IPR013106,GPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETG...,"{'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL': 1}","{'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1, 'EVV'...","{'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1, 'EVV'...","{'PSS': 1, 'EAL': 1, 'SCL': 1, 'SEG': 1, 'ENL'...","{'PSS': 1, 'VLG': 1, 'AVG': 1, 'EAL': 1, 'SCL'...","{'SPST': 1, 'VLG': 1, 'AVG': 1, 'EAL': 1, 'SCL...","{'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL': 1}","{'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1, 'EVV'...","{'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1, 'EVV'..."
4,A0A3B3ISZ0,IPR001206,AQVKKATVFLNPAACKGKARTLFEKNAAPILHLSGMDVTIVKTDYE...,"{'SAA': 1, 'PLL': 1, 'SGL': 1, 'ELL': 1, 'SET'...","{'SQV': 1, 'STV': 1, 'AAC': 1, 'RTL': 1, 'SAA'...","{'SQV': 1, 'STV': 1, 'AAC': 1, 'RTLL': 1, 'SAA...","{'AAV': 1, 'RAL': 1, 'SAA': 1, 'SGL': 1, 'ELL'...","{'TQV': 1, 'AAV': 1, 'CRG': 1, 'RAL': 1, 'FEK'...","{'TQV': 1, 'AAV': 1, 'TPAA': 1, 'CRG': 1, 'RAL...","{'SAA': 1, 'PLL': 1, 'SGL': 1, 'ELL': 1, 'SET'...","{'SQV': 1, 'STV': 1, 'RTL': 1, 'FEK': 1, 'SAA'...","{'SQV': 1, 'STV': 1, 'AAC': 1, 'RTL': 1, 'FEK'..."


In [None]:
# Maps each tokenizer name to sum_and_sort_dicts (keys here lack the " families" column suffix).
{method_name:sum_and_sort_dicts for method_name in tokenizer_list.keys()}

In [235]:
# Per-source aggregation spec: count the rows (via the 'uniprot_id' column) and
# merge + sort the family counts of every tokenizer's "... families" column.
agg_dict = {
    'uniprot_id': len,
    **{f'{method_name} families': sum_and_sort_dicts for method_name in tokenizer_list},
}

df_domains_family_counts_agg = (
    df_domains_family_counts
    .groupby('source')
    .agg(agg_dict)
    .reset_index()
)

In [261]:
# Inspect every domain row annotated with the InterPro entry IPR001881.
df_domains_family_counts[df_domains_family_counts['source'] == 'IPR001881']

Unnamed: 0,uniprot_id,source,domain_sequence,mutBPE blosum62 0.7 0.05 800 families,mutBPE blosum62 0.7 0.05 3200 families,mutBPE blosum62 0.7 0.05 12800 families,mutBPE pam70 0.7 0.05 800 families,mutBPE pam70 0.7 0.05 3200 families,mutBPE pam70 0.7 0.05 12800 families,mutBPE pre blosum62 0.7 0.05 800 families,mutBPE pre blosum62 0.7 0.05 3200 families,mutBPE pre blosum62 0.7 0.05 12800 families
535,O00187,IPR001881,DIDECQVAPGEAPTCDHHCHNHLGGFYCSCRAGYVLHRNKRTCS,"{'SPG': 1, 'SHL': 1}","{'SPG': 1, 'SHL': 1, 'RAG': 1, 'YVL': 1, 'HRR'...","{'DEC': 1, 'SPG': 1, 'SHL': 1, 'FFC': 1, 'RAG'...","{'SPG': 1, 'SHL': 1}","{'SPG': 1, 'SHL': 1, 'RGG': 1, 'VLQ': 1, 'RKK'...","{'DEC': 1, 'SPG': 1, 'SHL': 1, 'CSC': 1, 'RGG'...","{'SPG': 1, 'SHL': 1}","{'DEA': 1, 'SPG': 1, 'SHL': 1, 'RAG': 1, 'YVL'...","{'DIDEC': 1, 'SPG': 1, 'SHL': 1, 'FFC': 1, 'RA..."
671,O00339,IPR001881,CSTLEHNCAHFCINIPGSYVCRCKQGYILNSDQTTCR,{'SSL': 1},"{'SSL': 1, 'KEG': 1, 'YVL': 1, 'NSS': 1}","{'SSL': 1, 'SCA': 1, 'KEG': 1, 'YVL': 1, 'NSS'...","{'SSL': 1, 'SSD': 1}","{'SSL': 1, 'FCL': 1, 'KQG': 1, 'YVL': 1, 'SSD'...","{'SSL': 1, 'CPH': 1, 'FCL': 1, 'KQG': 1, 'YVL'...",{'SSL': 1},"{'SSL': 1, 'KEG': 1, 'YVL': 1, 'NSS': 1}","{'SSL': 1, 'FCL': 1, 'KEG': 1, 'YVL': 1, 'NSS'..."
672,O00339,IPR001881,DHNCEQLCVNVPGSFVCQCYSGYALAEDGKRCV,"{'EEL': 1, 'SFL': 1}","{'EEL': 1, 'SFL': 1, 'SSY': 1, 'SLA': 1, 'EDG'...","{'EEL': 1, 'SFL': 1, 'SSY': 1, 'SLA': 1, 'EDG'...",{'EEL': 1},"{'EEL': 1, 'SFV': 1, 'SSY': 1, 'SLA': 1, 'EDG'...","{'EEL': 1, 'SFV': 1, 'SSY': 1, 'SLA': 1, 'EDG'...","{'EEL': 1, 'SFL': 1}","{'EEL': 1, 'SFL': 1, 'YSG': 1, 'SLA': 1, 'EDG'...","{'EEL': 1, 'SFL': 1, 'AQC': 1, 'YSG': 1, 'SLA'..."
673,O00339,IPR001881,AVDYCASENHGCEHECVNADGSYLCQCHEGFALNPDKKTCT,{'SFL': 1},"{'SEN': 1, 'ECV': 1, 'SFL': 1, 'FSL': 1, 'EPD'...","{'SEN': 1, 'QGC': 1, 'ECV': 1, 'SADG': 1, 'SFL...","{'SEN': 1, 'SFL': 1}","{'SEN': 1, 'ECL': 1, 'SFL': 1, 'FSL': 1, 'NPD'...","{'SEN': 1, 'HSC': 1, 'ECL': 1, 'DADG': 1, 'SFL...",{'SFL': 1},"{'SEN': 1, 'ECV': 1, 'SFL': 1, 'FSL': 1, 'EPD'...","{'SEN': 1, 'QGC': 1, 'ECV': 1, 'SFL': 1, 'AQC'..."
674,O00339,IPR001881,KIDYCASSNHGCQHECVNTDDSYSCHCLKGFTLNPDKKTCR,{},"{'SHG': 1, 'ECV': 1, 'RCL': 1, 'KAF': 1, 'EPD'...","{'SHG': 1, 'ECV': 1, 'RCL': 1, 'KAF': 1, 'EPD'...",{'SSD': 1},"{'SSD': 1, 'ECL': 1, 'FSL': 1, 'NPD': 1}","{'SSD': 1, 'CQQ': 1, 'ECL': 1, 'HCL': 1, 'FSL'...",{},"{'SHG': 1, 'ECV': 1, 'RCL': 1, 'FSL': 1, 'EPD'...","{'SHG': 1, 'ECV': 1, 'RCL': 1, 'FSL': 1, 'EPD'..."
...,...,...,...,...,...,...,...,...,...,...,...,...
98615,Q9NP01,IPR001881,DIDECEDNPNICDGGQCTNIPGEYRCLCYDGFMASEDMKTCV,{'SEE': 1},"{'SNV': 1, 'RCL': 1, 'FLA': 1, 'SEE': 1}","{'DEC': 1, 'SNV': 1, 'RCL': 1, 'FLA': 1, 'SEDL...",{'SEE': 1},"{'TSI': 1, 'RCL': 1, 'SEE': 1}","{'DEC': 1, 'TSI': 1, 'RCL': 1, 'FMA': 1, 'SEE'...","{'SGG': 1, 'SEE': 1}","{'DEA': 1, 'SLC': 1, 'SGG': 1, 'SNV': 1, 'RCL'...","{'DIDEC': 1, 'SLC': 1, 'SGG': 1, 'SNV': 1, 'RC..."
98616,Q9NP01,IPR001881,DVNECDLNPNICLSGTCENTKGSFICHCDMGYS,{'SFL': 1},"{'TEG': 1, 'SFL': 1, 'DVG': 1}","{'DEC': 1, 'TEG': 1, 'SFL': 1, 'DVG': 1}",{'SFL': 1},"{'SKG': 1, 'SFL': 1}","{'DEC': 1, 'SKG': 1, 'SFL': 1, 'MSY': 1}",{'SFL': 1},"{'TEG': 1, 'SFL': 1, 'DVG': 1}","{'DIDEC': 1, 'TEG': 1, 'SFL': 1, 'DVG': 1}"
98895,Q9UF98,IPR001881,AINACEISNGGCSAKADCKRTTPGRRVCTCKAGYTGDGIVCL,{},"{'CSS': 1, 'EAG': 1}","{'EAC': 1, 'SRGG': 1, 'CSS': 1, 'SSPG': 1, 'EA...",{},"{'CEV': 1, 'CSS': 1, 'KAG': 1, 'YSG': 1}","{'CEV': 1, 'SSGG': 1, 'CSS': 1, 'SSPG': 1, 'KA...",{},"{'CSS': 1, 'EAG': 1, 'YSG': 1}","{'EAC': 1, 'SRGG': 1, 'CSS': 1, 'PSRR': 1, 'EA..."
98896,Q9UF98,IPR001881,EINPCLENHGGCDKNAECTQTGPNQAACNCLPAYTGDGKVCT,"{'EAA': 1, 'SCL': 1}","{'EGG': 1, 'KSA': 1, 'EAA': 1, 'SCL': 1, 'PSY'...","{'EGG': 1, 'KSA': 1, 'EAA': 1, 'SCL': 1, 'PSY'...","{'EAA': 1, 'SCL': 1}","{'HGG': 1, 'EAA': 1, 'SCL': 1, 'YSG': 1}","{'HGG': 1, 'TEC': 1, 'EAA': 1, 'SCL': 1, 'YSG'...",{'EAA': 1},"{'RGG': 1, 'KSA': 1, 'EAA': 1, 'SCL': 1, 'YSG'...","{'RGG': 1, 'KSA': 1, 'EAA': 1, 'SCL': 1, 'YSG'..."


In [258]:
# Rank the sources by number of domain rows (after aggregation 'uniprot_id' holds the row count).
df_domains_family_counts_agg.sort_values('uniprot_id', ascending=False)

Unnamed: 0,source,uniprot_id,mutBPE blosum62 0.7 0.05 800 families,mutBPE blosum62 0.7 0.05 3200 families,mutBPE blosum62 0.7 0.05 12800 families,mutBPE pam70 0.7 0.05 800 families,mutBPE pam70 0.7 0.05 3200 families,mutBPE pam70 0.7 0.05 12800 families,mutBPE pre blosum62 0.7 0.05 800 families,mutBPE pre blosum62 0.7 0.05 3200 families,mutBPE pre blosum62 0.7 0.05 12800 families
1653,IPR013087,9346,"{'HTG': 4111, 'SSL': 1319, 'SHL': 728, 'SLL': ...","{'HTG': 4111, 'ESP': 3885, 'CGKAF': 2627, 'CGK...","{'ESP': 3885, 'HTG': 3485, 'FQC': 2490, 'CGKAF...","{'HTG': 3963, 'SSL': 1273, 'SHL': 572, 'SEL': ...","{'HTG': 3963, 'EKA': 3902, 'CGKA': 3129, 'SSL'...","{'EKA': 3902, 'HTG': 3113, 'YEC': 1702, 'CGKA'...","{'EKP': 4898, 'HTG': 3852, 'SSL': 1337, 'SHL':...","{'YKC': 4323, 'HTGEKP': 2851, 'KSF': 2048, 'CG...","{'YKC': 2678, 'CGKAF': 1373, 'CGKA': 1262, 'YK..."
140,IPR000742,2581,"{'SCL': 671, 'SPG': 413, 'SSG': 280, 'SGL': 23...","{'SCL': 671, 'SPG': 413, 'ECV': 409, 'EGG': 36...","{'SCL': 665, 'DEC': 576, 'ECV': 409, 'SPG': 38...","{'SGG': 415, 'SCL': 395, 'SPG': 295, 'SEG': 25...","{'SGG': 415, 'SCL': 395, 'RCL': 315, 'SPG': 29...","{'DEC': 583, 'SGG': 390, 'SCL': 376, 'RCL': 31...","{'SPG': 439, 'SGG': 302, 'SSG': 266, 'SGL': 23...","{'SCL': 668, 'SPG': 439, 'DEA': 397, 'ECV': 38...","{'SCL': 661, 'DIDEC': 472, 'SPG': 410, 'ECV': ..."
1084,IPR007110,2499,"{'SSG': 985, 'SGL': 970, 'SSL': 863, 'PSL': 53...","{'SSG': 982, 'SGL': 970, 'SSL': 840, 'PSL': 53...","{'SGL': 839, 'SSG': 826, 'SSL': 757, 'PSL': 50...","{'SSL': 750, 'SGL': 405, 'SVL': 400, 'SGG': 39...","{'SSL': 726, 'SGL': 405, 'SVL': 400, 'SGG': 39...","{'SSL': 610, 'EAL': 344, 'SGL': 342, 'SVL': 33...","{'SGL': 972, 'SSL': 889, 'SSG': 705, 'PSL': 61...","{'SGL': 972, 'SSL': 860, 'SSG': 705, 'PSL': 61...","{'SGL': 906, 'SSL': 776, 'SSG': 667, 'PSL': 61..."
620,IPR003599,2201,"{'SSG': 804, 'SGL': 774, 'SSL': 687, 'SPG': 43...","{'SSG': 800, 'SGL': 774, 'SSL': 654, 'SPG': 43...","{'SSG': 733, 'SGL': 656, 'SSL': 545, 'SPG': 42...","{'SSL': 594, 'SVL': 456, 'SGG': 425, 'SGL': 36...","{'SSL': 543, 'SVL': 456, 'SGG': 418, 'SGL': 36...","{'SSL': 455, 'SVL': 409, 'SGG': 391, 'SGL': 31...","{'SGL': 771, 'SSL': 718, 'SSG': 584, 'LLL': 43...","{'SGL': 771, 'SSL': 685, 'SSG': 584, 'SPG': 42...","{'SGL': 709, 'SSL': 582, 'SSG': 550, 'SPG': 41..."
366,IPR001881,1531,"{'SCL': 475, 'SPG': 247, 'SSG': 167, 'SGL': 12...","{'SCL': 475, 'ECV': 262, 'SPG': 247, 'EGG': 22...","{'DEC': 702, 'SCL': 472, 'ECV': 262, 'EGG': 22...","{'SCL': 266, 'SGG': 246, 'SPG': 194, 'SEG': 17...","{'SCL': 266, 'SGG': 246, 'SPG': 194, 'RCL': 19...","{'DEC': 706, 'SCL': 251, 'SGG': 232, 'RCL': 19...","{'SPG': 266, 'SGG': 181, 'SSG': 160, 'SGL': 12...","{'DEA': 490, 'SCL': 469, 'SPG': 266, 'ECV': 22...","{'DIDEC': 646, 'SCL': 465, 'SPG': 244, 'ECV': ..."
...,...,...,...,...,...,...,...,...,...,...,...
7390,IPR056310,1,"{'SEL': 2, 'EAL': 1, 'SLL': 1, 'ETL': 1, 'AAG'...","{'SEL': 2, 'TVY': 1, 'EAL': 1, 'SLL': 1, 'PEE'...","{'SEL': 1, 'DFG': 1, 'TVY': 1, 'EAL': 1, 'SLH'...","{'EAL': 2, 'SEV': 1, 'SLL': 1, 'SEL': 1}","{'EAL': 2, 'SEV': 1, 'EFG': 1, 'SLQ': 1, 'SLL'...","{'EAL': 2, 'SEV': 1, 'EFG': 1, 'TVY': 1, 'SLQ'...","{'SEL': 2, 'EAL': 1, 'SLL': 1, 'ETL': 1, 'AAG'...","{'SEL': 2, 'EAL': 1, 'SLH': 1, 'SLL': 1, 'PEE'...","{'MSP': 1, 'SEL': 1, 'DFG': 1, 'TVY': 1, 'EAL'..."
7395,IPR056328,1,"{'SHL': 1, 'RLL': 1, 'SEA': 1, 'EEL': 1}","{'NKL': 2, 'EDG': 1, 'SHL': 1, 'SLY': 1, 'RLL'...","{'NKL': 2, 'EDG': 1, 'MEA': 1, 'SHL': 1, 'SLY'...","{'RLL': 1, 'SEK': 1, 'EEL': 1, 'SKL': 1}","{'EDG': 1, 'QQL': 1, 'SLY': 1, 'RLL': 1, 'QSV'...","{'EDG': 1, 'MDA': 1, 'QQL': 1, 'SLY': 1, 'RLL'...","{'SHL': 1, 'RLL': 1, 'ELL': 1, 'SEA': 1, 'EEL'...","{'NKL': 2, 'QQQQ': 2, 'EDG': 1, 'SHL': 1, 'RLL...","{'NKL': 2, 'EDG': 1, 'MEA': 1, 'SHL': 1, 'ALAL..."
7396,IPR056331,1,"{'LLL': 1, 'ELL': 1}","{'PLLL': 1, 'ELL': 1, 'SSY': 1, 'ALV': 1}","{'KAKI': 1, 'PLLL': 1, 'ELL': 1, 'SSY': 1, 'DS...","{'LLL': 1, 'ELL': 1}","{'SLLL': 1, 'ELL': 1, 'DEN': 1, 'ALI': 1}","{'SLLL': 1, 'ELL': 1, 'DSY': 1, 'DEN': 1, 'ALI...","{'LLL': 1, 'ELL': 1}","{'PLLL': 1, 'ELL': 1, 'ESY': 1, 'ALV': 1}","{'KAKI': 1, 'PLLL': 1, 'ELL': 1, 'ESY': 1, 'DK..."
7397,IPR056332,1,"{'SEL': 2, 'RLL': 2, 'SSG': 2, 'EAA': 1, 'PSP'...","{'FGV': 2, 'EKV': 2, 'SEL': 2, 'RLL': 2, 'EGG'...","{'DGLL': 2, 'FGV': 2, 'EKV': 2, 'SEL': 2, 'RLL...","{'EKL': 2, 'SEL': 2, 'SVL': 2, 'RLL': 2, 'EAA'...","{'EKL': 2, 'SEL': 2, 'SVL': 2, 'RLL': 2, 'KLV'...","{'EKL': 2, 'SEL': 2, 'SESV': 2, 'SVL': 2, 'RLL...","{'SRL': 2, 'SEL': 2, 'RLL': 2, 'SGG': 2, 'SSG'...","{'FGV': 2, 'SRL': 2, 'EKV': 2, 'SEL': 2, 'SDV'...","{'DGLL': 2, 'FGV': 2, 'SRL': 2, 'GGLL': 2, 'EK..."


In [270]:
# First ten (highest-count) families of one tokenizer for the source at position 4
# when sources are ranked by number of domain rows.
entries_by_size = df_domains_family_counts_agg.sort_values('uniprot_id', ascending=False)
families_of_entry = entries_by_size.iloc[4]['mutBPE pre blosum62 0.7 0.05 12800 families']
list(families_of_entry.items())[:10]

[('DIDEC', 646),
 ('SCL', 465),
 ('SPG', 244),
 ('ECV', 223),
 ('PEG', 216),
 ('SGG', 180),
 ('YSG', 174),
 ('RCL', 163),
 ('SSG', 146),
 ('YKC', 125)]

In [265]:
# Mutated tokens whose recorded parent is 'DIDEC' in this tokenizer's vocabulary.
[
    child
    for child, info in inner_vocab_mutated_list['mutBPE pre blosum62 0.7 0.05 12800'].items()
    if info['parent'] == 'DIDEC'
]

['DVDEC',
 'DLDEC',
 'DMDEC',
 'DFDEC',
 'DINEC',
 'DVNEC',
 'ELDEC',
 'DLDSC',
 'SVDEC',
 'NVDDC']

In [266]:
# Mutated tokens whose recorded parent is 'SCL' in this tokenizer's vocabulary.
[
    child
    for child, info in inner_vocab_mutated_list['mutBPE pre blosum62 0.7 0.05 12800'].items()
    if info['parent'] == 'SCL'
]

['SCI',
 'SCM',
 'SCV',
 'ACL',
 'NCL',
 'TCL',
 'SCF',
 'DCL',
 'QCL',
 'ECL',
 'GCL',
 'KCL',
 'ACI',
 'NCI',
 'TCI']