In [50]:
import sqlite3
import pandas as pd
from tqdm import tqdm
import numpy as np
from tokenizers import Tokenizer
import json
from collections import Counter
from EfficientBPE.vocabulary_functions import get_mutated, get_parents, set_difference, set_intersection, load_tokenizers

ImportError: cannot import name 'load_tokenizers' from 'EfficientBPE.vocabulary_functions' (/cta/users/bsuyunu/github/evolutionary-subword-tokenization/Prog/EfficientBPE/vocabulary_functions.py)

## Load Tokenizers

In [36]:
# Keys that may appear in an entry of `tokenizer_opts_list`, with the values
# noted for each key:
#   'dataset':           {'uniref50', 'uniref90'}
#   'is_pretokenizer':   {True, False}
#   'subs_matrix':       {'blosum45', 'blosum62', 'pam70', 'pam250'}
#   'mutation_cutoff':   {0.7, 0.8, 0.9}
#   'min_mutation_freq': {0, 0.05, 0.005}
#   'min_mutation_len':  {3}
#   'max_mutation_len':  {12}
#   'vocab_size':        list=[800, 1600, 3200, 6400, 12800, 25600, 51200]

vocab_sizes = [800, 3200, 12800]
uniref_id = "50"


def _std_opts(is_pretokenizer):
    """Return the option dict for an entry with 'is_mut': False."""
    return {
        'is_mut': False,
        'dataset': f'uniref{uniref_id}',
        'is_pretokenizer': is_pretokenizer,
        'vocab_size': vocab_sizes,
    }


def _mut_opts(subs_matrix, mutation_cutoff, min_mutation_freq, is_pretokenizer=False):
    """Return the option dict for an entry with 'is_mut': True.

    The mutation length bounds are fixed at 3 and 12, the only values used in
    this notebook. Key order matches the hand-written dicts this replaces.
    """
    return {
        'is_mut': True,
        'dataset': f'uniref{uniref_id}',
        'is_pretokenizer': is_pretokenizer,
        'subs_matrix': subs_matrix,
        'mutation_cutoff': mutation_cutoff,
        'min_mutation_freq': min_mutation_freq,
        'min_mutation_len': 3,
        'max_mutation_len': 12,
        'vocab_size': vocab_sizes,
    }


# Every entry shares the same `vocab_sizes` list object.
tokenizer_opts_list = [
    _std_opts(is_pretokenizer=False),
    # Disabled configurations, kept for reference:
    # _std_opts(is_pretokenizer=True),
    # _mut_opts('blosum62', mutation_cutoff=0.7, min_mutation_freq=0),
    # _mut_opts('blosum62', mutation_cutoff=0.9, min_mutation_freq=0),
    _mut_opts('blosum62', mutation_cutoff=0.7, min_mutation_freq=0.05),
    _mut_opts('pam70', mutation_cutoff=0.7, min_mutation_freq=0.05),
    _mut_opts('blosum62', mutation_cutoff=0.7, min_mutation_freq=0.05, is_pretokenizer=True),
]

In [37]:
# Load the configured tokenizers in two forms. The second argument ('hf' /
# 'vocab') selects what load_tokenizers returns.
# NOTE(review): presumably 'hf' yields HuggingFace Tokenizer objects and 'vocab'
# the project's internal vocabulary structures -- confirm in EfficientBPE.
tokenizer_list = load_tokenizers(tokenizer_opts_list, 'hf')
inner_vocab_list = load_tokenizers(tokenizer_opts_list, 'vocab')

# Per tokenizer name: the de-duplicated token strings of its vocabulary.
vocab_list = {
    name: list(set([*tokenizer.get_vocab()]))
    for name, tokenizer in tokenizer_list.items()
}

## Read Datasets

In [38]:
# Read the protein sequences and the two domain-annotation tables from the
# local SQLite database.
# NOTE(review): absolute cluster path -- consider a configurable data directory.
db_file = "/cta/share/users/uniprot/human/human.db"
conn = sqlite3.connect(db_file)
try:
    # Table names cannot be bound as SQL parameters, so `uniref_id` (a notebook
    # constant) is interpolated into the sub-query's table name.
    df_protein = pd.read_sql(f"""SELECT Entry as uniprot_id, Sequence as sequence
                          FROM proteins
                          WHERE Entry IN (SELECT uniprot_accession FROM uniref{uniref_id}_distilled)""", conn)

    # df_protein_pre = pd.read_sql(f"SELECT * FROM uniref{uniref_id}_domain_sliced_plddt70", conn)
    # df_protein_pre = df_protein_pre[~df_protein_pre['uniprot_id'].isin(df_protein[df_protein['sequence'].str.len() > 3000]['uniprot_id'].unique())]

    # InterPro entries of type 'domain'; `source` holds the InterPro id.
    df_interpro_domain = pd.read_sql("SELECT uniprot_id, interpro_id as source, start_index, end_index FROM interpro_entries_v2 WHERE type='domain'", conn)
    # TED entries with plddt >= 70; `source` holds the TED id.
    df_ted = pd.read_sql("SELECT uniprot_id, ted_id as source, start_index, end_index FROM ted_entries_summary WHERE plddt >= 70", conn)
finally:
    # Close the connection even when one of the queries raises, so a failed
    # cell does not leave the database handle open in the kernel.
    conn.close()

# Keep only proteins whose sequence is shorter than 3000 characters.
df_protein = df_protein[df_protein['sequence'].str.len() < 3000]

In [39]:
# Stack the InterPro and TED annotation tables into a single frame.
df_domains = pd.concat([df_interpro_domain, df_ted])

# Proteins that have at least one annotation whose source starts with "IPR".
has_ipr_source = df_domains["source"].str.startswith("IPR")
interpro_ids = df_domains.loc[has_ipr_source, "uniprot_id"].unique()

# For those proteins, discard the rows whose source starts with "AF".
is_redundant_af = (df_domains["uniprot_id"].isin(interpro_ids)) & (df_domains["source"].str.startswith("AF"))
df_domains = df_domains[~is_redundant_af]

# Attach every remaining annotation to its protein sequence (inner join on uniprot_id).
proteins_by_id = df_protein.set_index('uniprot_id')
annotations_by_id = df_domains.set_index('uniprot_id')
df_domains = proteins_by_id.join(annotations_by_id, how='inner').reset_index()


def _slice_domain(row):
    """Cut the annotated span out of the row's full sequence.

    NOTE(review): the `-1` suggests start_index/end_index are 1-based and
    inclusive -- confirm against the source tables.
    """
    return row['sequence'][row['start_index']-1: row['end_index']]


df_domains['domain_sequence'] = df_domains.apply(_slice_domain, axis=1)

# Drop spans that came out empty.
has_sequence = df_domains['domain_sequence'].str.len()>0
df_domains = df_domains[has_sequence]

# just keep interpro entries
from_interpro = df_domains['source'].str.startswith('IPR')
df_domains = df_domains[from_interpro]

kept_columns = ['uniprot_id', 'source', 'domain_sequence']
df_domains = df_domains[kept_columns].reset_index(drop=True)
df_domains

Unnamed: 0,uniprot_id,source,domain_sequence
0,A0A087X296,IPR000742,PVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCT
1,A0A0K2S4Q6,IPR003599,PSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETGG...
2,A0A0K2S4Q6,IPR007110,PGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIW...
3,A0A0K2S4Q6,IPR013106,GPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETG...
4,A0A3B3ISZ0,IPR001206,AQVKKATVFLNPAACKGKARTLFEKNAAPILHLSGMDVTIVKTDYE...
...,...,...,...
99798,X6RHN7,IPR028889,KGLSNEPGQNSCFLNSALQVLWHLDIFRRSFRQLTTHKCMGDSCIF...
99799,X6RIL1,IPR005302,RPRRPHQIADLFRPKDQIAYSDTSPFLILSEASLADLNSRLEKKVK...
99800,X6RK39,IPR025946,LPRVLRVCSGVYFEGSIYEISGNECCLSTGDLIKVTQVRLQKVVCE...
99801,X6RK39,IPR025946,ILEVPEGRPIFLSPWVGSLQKGQRLCVYGLASPPWRVLASSKGRKV...


In [40]:
# Add one column per tokenizer, named after the tokenizer, holding the token
# list produced for each domain sequence.
for method_name, hf_tokenizer in tqdm(list(tokenizer_list.items())):
    encodings = hf_tokenizer.encode_batch(df_domains['domain_sequence'])
    df_domains[method_name] = [encoding.tokens for encoding in encodings]

100%|██████████| 12/12 [00:14<00:00,  1.21s/it]


In [41]:
# Preview the first five rows, now including one token-list column per tokenizer.
df_domains.head(n=5)

Unnamed: 0,uniprot_id,source,domain_sequence,stdBPE 800,stdBPE 3200,stdBPE 12800,mutBPE blosum62 0.7 0.05 800,mutBPE blosum62 0.7 0.05 3200,mutBPE blosum62 0.7 0.05 12800,mutBPE pam70 0.7 0.05 800,mutBPE pam70 0.7 0.05 3200,mutBPE pam70 0.7 0.05 12800,mutBPE pre blosum62 0.7 0.05 800,mutBPE pre blosum62 0.7 0.05 3200,mutBPE pre blosum62 0.7 0.05 12800
0,A0A087X296,IPR000742,PVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCT,"[PV, N, PC, C, YY, PC, QH, QG, I, CV, RF, GLD,...","[PVN, PCC, YY, PC, QH, QGI, CV, RF, GLD, RYQ, ...","[PVN, PCC, YY, PC, QH, QGI, CV, RF, GLD, RYQ, ...","[PV, N, PC, C, YY, PC, QH, QG, I, CV, RF, GL, ...","[PV, N, PC, C, YY, PC, QH, QGI, CV, RF, GLD, R...","[PV, NPC, C, YY, PC, QH, QGI, CV, RF, GLD, RY,...","[PV, N, PC, C, YY, PC, QH, QG, I, CV, RF, GL, ...","[PV, N, PC, C, YY, PC, QH, QGI, CV, RF, GLD, R...","[PV, NPC, CYY, PC, QH, QGI, CV, RF, GLD, RY, Q...","[PV, N, PC, C, YY, PC, QH, QG, I, CV, RF, GL, ...","[PV, N, PC, C, YY, PC, QH, QGI, CV, RF, GLD, R...","[PV, NPC, C, YY, PC, QH, QGI, CV, RF, GLD, RY,..."
1,A0A0K2S4Q6,IPR003599,PSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETGG...,"[PST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, KT,...","[PST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, KT,...","[PSTV, MG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, KT,...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, V, MG, AV, G, ESL, SV, QC, RY, EE, KY, K...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, KT,...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F...","[PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, F..."
2,A0A0K2S4Q6,IPR007110,PGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIW...,"[PG, CL, TV, SG, PST, V, MG, AVG, ESL, SV, QC,...","[PG, CL, TV, SG, PST, V, MG, AVG, ESL, SV, QC,...","[PG, CL, TVSG, PSTV, MG, AVG, ESL, SV, QC, RY,...","[PG, CL, TV, SG, PST, V, MG, AVG, ESL, SV, QC,...","[PG, CL, TV, SG, PST, VMG, AVG, ESL, SV, QC, R...","[PG, CL, TVSG, PST, VMG, AVG, ESL, SV, QC, RY,...","[PG, CL, TV, SG, PST, V, MG, AV, G, ESL, SV, Q...","[PG, CL, TV, SG, PST, VMG, AVG, ESL, SV, QC, R...","[PG, CL, TVSG, PST, VMG, AVG, ESL, SV, QC, RY,...","[PG, CL, TV, SG, PST, V, MG, AVG, ESL, SV, QC,...","[PG, CL, TV, SG, PST, VMG, AVG, ESL, SV, QC, R...","[PG, CL, TVSG, PST, VMG, AVG, ESL, SV, QC, RY,..."
3,A0A0K2S4Q6,IPR013106,GPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETG...,"[G, PST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, ...","[G, PST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, ...","[G, PSTV, MG, AVG, ESL, SV, QC, RY, EE, KY, KT...","[GP, ST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, ...","[GP, STV, MG, AVG, ESL, SV, QC, RY, EE, KY, KT...","[GP, STV, MG, AVG, ESL, SV, QC, RY, EE, KY, KT...","[G, PST, V, MG, AV, G, ESL, SV, QC, RY, EE, KY...","[G, PST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT...","[GPST, VMG, AVG, ESL, SV, QC, RY, EE, KY, KT, ...","[GP, ST, V, MG, AVG, ESL, SV, QC, RY, EE, KY, ...","[GP, STV, MG, AVG, ESL, SV, QC, RY, EE, KY, KT...","[GP, STV, MG, AVG, ESL, SV, QC, RY, EE, KY, KT..."
4,A0A3B3ISZ0,IPR001206,AQVKKATVFLNPAACKGKARTLFEKNAAPILHLSGMDVTIVKTDYE...,"[A, QV, KKA, TV, FL, NP, AA, C, KG, KA, RTL, F...","[A, QV, KKA, TV, FL, NP, AAC, KG, KA, RTL, F, ...","[A, QV, KKA, TVFL, NP, AAC, KGKA, RTL, F, EKN,...","[A, QV, KK, A, TV, FL, NP, AA, C, KG, KA, R, T...","[AQV, KK, ATV, FL, NP, AAC, KG, KA, RTL, F, EK...","[AQV, KK, ATV, FL, NP, AAC, KG, KA, RTLF, EK, ...","[A, QV, KK, ATV, FL, NP, AA, C, KG, KA, RTL, F...","[AQV, KK, ATV, FL, NP, AA, CKG, KA, RTL, FEK, ...","[AQV, KK, ATV, FL, NPAA, CKG, KA, RTL, FEK, NA...","[A, QV, KK, A, TV, FL, NP, AA, C, KG, KA, R, T...","[AQV, KK, ATV, FL, NP, AA, C, KG, KA, RTL, FEK...","[AQV, KK, ATV, FL, NP, AAC, KG, KA, RTL, FEK, ..."


In [42]:
# For every loaded inner vocabulary build three lookups keyed by method name:
#   * parents : result of get_parents(vocab)
#   * mutated : result of get_mutated(vocab)
#   * family  : every key of the parents lookup mapped to a zero count
inner_vocab_parents_list, inner_vocab_mutated_list, inner_vocab_family_list = {}, {}, {}
for method_name, inner_vocab in inner_vocab_list.items():
    parents = get_parents(inner_vocab)
    inner_vocab_parents_list[method_name] = parents
    inner_vocab_mutated_list[method_name] = get_mutated(inner_vocab)
    inner_vocab_family_list[method_name] = dict.fromkeys(parents.keys(), 0)

In [43]:
def families_in_domains(row):
    """Count, per tokenizer, how often each token family occurs in one row.

    For every method name in the module-level `tokenizer_list`, the tokens in
    `row[method_name]` are scanned. A token found in that method's parents
    lookup adds one to its own family. A token found in the mutated lookup
    adds one to the family named by its 'parent' entry. The two checks are
    independent, so a token present in both lookups contributes to both.

    Parameters
    ----------
    row : mapping-like
        Must provide `row[method_name]` as an iterable of tokens for every
        key of `tokenizer_list`.

    Returns
    -------
    dict
        Maps "<method_name> families" to a dict of family -> count, with
        families in order of first occurrence.
    """
    per_method = {}
    for method_name in tokenizer_list.keys():
        parents = inner_vocab_parents_list[method_name]
        mutated = inner_vocab_mutated_list[method_name]
        tally = {}
        for token in row[method_name]:
            # Collect the families this token contributes to, parent check first.
            hits = []
            if token in parents:
                hits.append(token)
            if token in mutated:
                hits.append(mutated[token]['parent'])
            for family in hits:
                tally[family] = tally.get(family, 0) + 1
        per_method[f"{method_name} families"] = tally
    return per_method

# One record per domain row: {"<method> families": {family: count}, ...}.
family_records = list(df_domains.apply(families_in_domains, axis=1))
df_domains_family_counts = pd.DataFrame(family_records)

# Prepend the identifying columns (everything up to and including
# 'domain_sequence'). The column-wise concat aligns on the index, so it relies
# on df_domains carrying a default 0..n-1 index.
id_columns = df_domains.loc[:, :'domain_sequence']
df_domains_family_counts = pd.concat([id_columns, df_domains_family_counts], axis=1)
df_domains_family_counts.head()

Unnamed: 0,uniprot_id,source,domain_sequence,stdBPE 800 families,stdBPE 3200 families,stdBPE 12800 families,mutBPE blosum62 0.7 0.05 800 families,mutBPE blosum62 0.7 0.05 3200 families,mutBPE blosum62 0.7 0.05 12800 families,mutBPE pam70 0.7 0.05 800 families,mutBPE pam70 0.7 0.05 3200 families,mutBPE pam70 0.7 0.05 12800 families,mutBPE pre blosum62 0.7 0.05 800 families,mutBPE pre blosum62 0.7 0.05 3200 families,mutBPE pre blosum62 0.7 0.05 12800 families
0,A0A087X296,IPR000742,PVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCT,{},{},{},{},"{'EGV': 1, 'SLD': 1}","{'EPC': 1, 'EGV': 1, 'SLD': 1, 'AGY': 1}",{},"{'QGV': 1, 'SLD': 1, 'CSR': 1, 'YSG': 1}","{'KPC': 1, 'CFY': 1, 'QGV': 1, 'SLD': 1, 'CSR'...",{},"{'EGV': 1, 'SLD': 1, 'YSG': 1}","{'EPC': 1, 'EGV': 1, 'SLD': 1, 'YSG': 1}"
1,A0A0K2S4Q6,IPR003599,PSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETGG...,{},{},{},"{'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'...","{'PSS': 1, 'EAL': 1, 'SCL': 1, 'SEG': 1, 'ENL'...","{'PSS': 1, 'VLG': 1, 'AVG': 1, 'EAL': 1, 'SCL'...","{'PSS': 1, 'VLG': 1, 'AVG': 1, 'EAL': 1, 'SCL'...","{'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'..."
2,A0A0K2S4Q6,IPR007110,PGCLTVSGPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIW...,{},{},{},"{'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'...","{'TASG': 1, 'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL...","{'PSS': 1, 'EAL': 1, 'SCL': 1, 'SEG': 1, 'ENL'...","{'PSS': 1, 'VLG': 1, 'AVG': 1, 'EAL': 1, 'SCL'...","{'TVSS': 1, 'PSS': 1, 'VLG': 1, 'AVG': 1, 'EAL...","{'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL'...","{'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1, 'RSP'...","{'TVSS': 1, 'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL..."
3,A0A0K2S4Q6,IPR013106,GPSTVMGAVGESLSVQCRYEEKYKTFNKYWCRQPCLPIWHEMVETG...,{},{},{},"{'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL': 1}","{'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1, 'EVV'...","{'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1, 'EVV'...","{'PSS': 1, 'EAL': 1, 'SCL': 1, 'SEG': 1, 'ENL'...","{'PSS': 1, 'VLG': 1, 'AVG': 1, 'EAL': 1, 'SCL'...","{'SPST': 1, 'VLG': 1, 'AVG': 1, 'EAL': 1, 'SCL...","{'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL': 1}","{'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1, 'EVV'...","{'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1, 'EVV'..."
4,A0A3B3ISZ0,IPR001206,AQVKKATVFLNPAACKGKARTLFEKNAAPILHLSGMDVTIVKTDYE...,{},{},{},"{'SAA': 1, 'PLL': 1, 'SGL': 1, 'ELL': 1, 'SET'...","{'SQV': 1, 'STV': 1, 'AAC': 1, 'RTL': 1, 'SAA'...","{'SQV': 1, 'STV': 1, 'AAC': 1, 'RTLL': 1, 'SAA...","{'AAV': 1, 'RAL': 1, 'SAA': 1, 'SGL': 1, 'ELL'...","{'TQV': 1, 'AAV': 1, 'CRG': 1, 'RAL': 1, 'FEK'...","{'TQV': 1, 'AAV': 1, 'TPAA': 1, 'CRG': 1, 'RAL...","{'SAA': 1, 'PLL': 1, 'SGL': 1, 'ELL': 1, 'SET'...","{'SQV': 1, 'STV': 1, 'RTL': 1, 'FEK': 1, 'SAA'...","{'SQV': 1, 'STV': 1, 'AAC': 1, 'RTL': 1, 'FEK'..."


In [44]:
import pandas as pd
from collections import Counter

# Toy input for trying out the aggregation helpers: one row per
# (uniprot_id, source) with a family-count dict for each of two methods.
_toy_columns = ['uniprot_id', 'source', 'method_1', 'method_2']
_toy_rows = [
    ('A1', 'IPR000742',
     {},
     {'EGV': 1, 'SLD': 1}),
    ('A2', 'IPR003599',
     {'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1},
     {'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1}),
    ('A3', 'IPR013106',
     {'PSS': 1, 'AAG': 1, 'EAL': 1, 'SSG': 1},
     {'PSS': 1, 'VVG': 1, 'EAL': 1}),
    ('A4', 'IPR003599',
     {'PSS': 1, 'AAG': 1, 'EAL': 1},
     {'PSS': 1, 'VVG': 1, 'AAG': 1, 'EAL': 1}),
    ('A5', 'IPR013106',
     {'AAG': 1, 'EAL': 1, 'SSG': 1, 'ENL': 1},
     {'STV': 1, 'AAG': 1, 'EAL': 1, 'RSP': 1}),
    ('A6', 'IPR001206',
     {'SAA': 1, 'PLL': 1, 'SGL': 1, 'ELL': 1},
     {'SQV': 1, 'STV': 1, 'AAC': 1, 'RTL': 1}),
]

# Transpose the rows into the column -> list-of-values mapping.
data = {
    column: [row[position] for row in _toy_rows]
    for position, column in enumerate(_toy_columns)
}
df = pd.DataFrame(data)

def sum_dicts(dicts):
    """Add up several count mappings key by key and return a plain dict.

    Each element of `dicts` is turned into a `Counter` and added to a running
    total with Counter addition, which keeps only keys whose running count is
    positive. Keys appear in order of first occurrence.
    """
    total = Counter()
    for counts in dicts:
        total = total + Counter(counts)
    return dict(total)

def sum_and_sort_dicts(dicts):
    """Add up several count collections and return them sorted by count.

    Each element of `dicts` is passed to `Counter`, so it may be a mapping of
    key -> count or any iterable of hashable items to be counted. The totals
    are combined with Counter addition (only positive counts are kept) and
    returned as a plain dict ordered by descending count. Ties keep their
    first-occurrence order.
    """
    totals = Counter()
    for counts in dicts:
        totals = totals + Counter(counts)
    # most_common() with no argument is a stable descending sort on the count.
    return dict(totals.most_common())

# For each 'source', merge the count dicts of both method columns and order
# every merged dict by descending count.
method_columns = ['method_1', 'method_2']
per_source = df.groupby('source').agg(dict.fromkeys(method_columns, sum_and_sort_dicts))
result = per_source.reset_index()

result


Unnamed: 0,source,method_1,method_2
0,IPR000742,{},"{'EGV': 1, 'SLD': 1}"
1,IPR001206,"{'SAA': 1, 'PLL': 1, 'SGL': 1, 'ELL': 1}","{'SQV': 1, 'STV': 1, 'AAC': 1, 'RTL': 1}"
2,IPR003599,"{'PSS': 2, 'AAG': 2, 'EAL': 2, 'SSG': 1}","{'PSS': 2, 'VVG': 2, 'AAG': 2, 'EAL': 2}"
3,IPR013106,"{'AAG': 2, 'EAL': 2, 'SSG': 2, 'PSS': 1, 'ENL'...","{'EAL': 2, 'PSS': 1, 'VVG': 1, 'STV': 1, 'AAG'..."


In [46]:
# Per 'source': the number of rows in the group (len over 'uniprot_id') and,
# for every tokenizer column, the summed token counts sorted by frequency.
agg_dict = {
    'uniprot_id': len,
    **{f'{method_name}': sum_and_sort_dicts for method_name in tokenizer_list.keys()},
}

domains_by_source = df_domains.groupby('source')
df_domains_counts_agg = domains_by_source.agg(agg_dict).reset_index()
df_domains_counts_agg.head()

Unnamed: 0,source,uniprot_id,stdBPE 800,stdBPE 3200,stdBPE 12800,mutBPE blosum62 0.7 0.05 800,mutBPE blosum62 0.7 0.05 3200,mutBPE blosum62 0.7 0.05 12800,mutBPE pam70 0.7 0.05 800,mutBPE pam70 0.7 0.05 3200,mutBPE pam70 0.7 0.05 12800,mutBPE pre blosum62 0.7 0.05 800,mutBPE pre blosum62 0.7 0.05 3200,mutBPE pre blosum62 0.7 0.05 12800
0,IPR000001,93,"{'C': 194, 'RN': 93, 'Y': 92, 'RG': 87, 'W': 7...","{'RN': 90, 'C': 85, 'RG': 76, 'PW': 68, 'CY': ...","{'RG': 68, 'RN': 54, 'CY': 53, 'PW': 42, 'NYC'...","{'C': 280, 'Y': 157, 'W': 107, 'RG': 93, 'RN':...","{'C': 131, 'RN': 92, 'RG': 86, 'W': 65, 'PW': ...","{'RN': 85, 'RG': 63, 'CY': 55, 'PW': 47, 'TT':...","{'C': 218, 'W': 100, 'Y': 97, 'RN': 93, 'RG': ...","{'C': 110, 'RN': 88, 'PW': 69, 'CY': 55, 'W': ...","{'PW': 69, 'CY': 55, 'CRN': 52, 'NY': 46, 'RG'...","{'C': 249, 'Y': 149, 'W': 104, 'RG': 93, 'RN':...","{'C': 107, 'RN': 92, 'PW': 60, 'W': 60, 'CY': ...","{'RN': 84, 'CY': 51, 'YRG': 43, 'PD': 43, 'PW'..."
1,IPR000007,13,"{'T': 26, 'I': 24, 'Y': 22, 'R': 19, 'RG': 15,...","{'QA': 14, 'PQ': 13, 'RG': 13, 'EL': 13, 'SY':...","{'RG': 12, 'KN': 10, 'RD': 8, 'MD': 8, 'YLI': ...","{'R': 28, 'N': 28, 'T': 22, 'Y': 22, 'I': 21, ...","{'RG': 13, 'KN': 13, 'SR': 12, 'TV': 11, 'RV':...","{'RG': 13, 'MD': 10, 'KN': 10, 'RV': 10, 'TV':...","{'R': 26, 'T': 25, 'Y': 22, 'I': 21, 'D': 17, ...","{'RG': 14, 'RV': 14, 'PQ': 13, 'SV': 13, 'KN':...","{'PQ': 13, 'KN': 13, 'RG': 11, 'SY': 11, 'SV':...","{'R': 28, 'Y': 26, 'M': 24, 'D': 22, 'I': 21, ...","{'RG': 13, 'KN': 13, 'SR': 12, 'PQ': 11, 'TV':...","{'RG': 13, 'MD': 10, 'PT': 10, 'KN': 10, 'RV':..."
2,IPR000008,501,"{'F': 567, 'N': 519, 'I': 468, 'D': 457, 'TL':...","{'TL': 269, 'PV': 246, 'EV': 245, 'TV': 218, '...","{'PV': 148, 'PYV': 143, 'KL': 140, 'FD': 133, ...","{'W': 656, 'F': 609, 'Y': 586, 'N': 581, 'D': ...","{'TL': 362, 'W': 298, 'EV': 280, 'TV': 242, 'F...","{'KV': 173, 'KL': 167, 'TV': 162, 'TL': 161, '...","{'F': 605, 'W': 541, 'V': 527, 'D': 514, 'Y': ...","{'TL': 348, 'KV': 252, 'TV': 251, 'EV': 246, '...","{'KV': 222, 'TL': 186, 'EV': 176, 'TV': 175, '...","{'F': 586, 'W': 583, 'R': 563, 'D': 546, 'K': ...","{'TL': 374, 'EV': 267, 'W': 265, 'TV': 243, 'K...","{'TV': 173, 'KV': 171, 'TL': 170, 'KL': 161, '..."
3,IPR000010,30,"{'N': 42, 'F': 35, 'Y': 32, 'D': 32, 'C': 31, ...","{'VV': 16, 'ET': 15, 'N': 13, 'NA': 13, 'TL': ...","{'NA': 13, 'SF': 11, 'ET': 10, 'ND': 9, 'VV': ...","{'N': 39, 'D': 37, 'C': 34, 'F': 33, 'Y': 33, ...","{'C': 20, 'N': 16, 'NA': 16, 'TL': 13, 'Y': 13...","{'NA': 15, 'DC': 12, 'ET': 12, 'SN': 11, 'NC':...","{'D': 38, 'C': 36, 'N': 35, 'Y': 33, 'F': 31, ...","{'C': 22, 'D': 16, 'NA': 16, 'N': 15, 'Y': 14,...","{'KV': 12, 'C': 10, 'ET': 10, 'NY': 9, 'PW': 9...","{'C': 42, 'D': 37, 'Q': 36, 'Y': 35, 'N': 34, ...","{'C': 17, 'W': 13, 'D': 13, 'Q': 13, 'N': 13, ...","{'ET': 13, 'DC': 12, 'TN': 11, 'SN': 10, 'SF':..."
4,IPR000014,110,"{'H': 93, 'Y': 77, 'FL': 76, 'F': 73, 'N': 72,...","{'DG': 57, 'EL': 51, 'H': 47, 'FL': 44, 'IL': ...","{'DG': 50, 'FV': 33, 'FL': 33, 'PQ': 32, 'SF':...","{'H': 117, 'V': 105, 'Y': 92, 'F': 82, 'Q': 80...","{'Y': 58, 'DG': 55, 'FV': 50, 'H': 48, 'EL': 4...","{'FV': 47, 'DG': 39, 'PQ': 36, 'EL': 35, 'SE':...","{'H': 110, 'Y': 85, 'F': 84, 'D': 83, 'FL': 78...","{'DG': 55, 'FL': 48, 'FV': 47, 'H': 46, 'SE': ...","{'FV': 43, 'FL': 39, 'YL': 37, 'DG': 34, 'PQ':...","{'H': 109, 'V': 104, 'M': 90, 'D': 90, 'Y': 90...","{'DG': 55, 'FV': 51, 'H': 42, 'SV': 42, 'IL': ...","{'FV': 47, 'DG': 40, 'PQ': 34, 'TG': 34, 'EL':..."


In [47]:
# Per 'source': the number of rows in the group (len over 'uniprot_id') and,
# for every "<method> families" column, the summed family counts sorted by
# frequency.
agg_dict = {
    'uniprot_id': len,
    **{f'{method_name} families': sum_and_sort_dicts for method_name in tokenizer_list.keys()},
}

family_counts_by_source = df_domains_family_counts.groupby('source')
df_domains_family_counts_agg = family_counts_by_source.agg(agg_dict).reset_index()
df_domains_family_counts_agg.head()

Unnamed: 0,source,uniprot_id,stdBPE 800 families,stdBPE 3200 families,stdBPE 12800 families,mutBPE blosum62 0.7 0.05 800 families,mutBPE blosum62 0.7 0.05 3200 families,mutBPE blosum62 0.7 0.05 12800 families,mutBPE pam70 0.7 0.05 800 families,mutBPE pam70 0.7 0.05 3200 families,mutBPE pam70 0.7 0.05 12800 families,mutBPE pre blosum62 0.7 0.05 800 families,mutBPE pre blosum62 0.7 0.05 3200 families,mutBPE pre blosum62 0.7 0.05 12800 families
0,IPR000001,93,{},{},{},"{'SGL': 37, 'SSL': 24, 'SSG': 23, 'SLG': 17, '...","{'PEG': 47, 'SGL': 37, 'TPR': 27, 'SSY': 25, '...","{'SYC': 62, 'PEG': 47, 'SGL': 34, 'TPR': 27, '...","{'SGL': 37, 'SSL': 24, 'SLG': 16, 'SGG': 15, '...","{'FRG': 41, 'PDG': 38, 'SGL': 37, 'SSL': 24, '...","{'CRN': 53, 'FRG': 41, 'PDG': 38, 'SGL': 34, '...","{'SGL': 44, 'SSL': 24, 'SLG': 18, 'SSG': 15, '...","{'YRG': 51, 'PEG': 48, 'SGL': 44, 'TPR': 26, '...","{'YRG': 51, 'PEG': 48, 'SYC': 35, 'SGL': 30, '..."
1,IPR000007,13,{},{},{},"{'SSL': 9, 'AAL': 6, 'PGL': 5, 'SSG': 5, 'SLL'...","{'YLV': 14, 'SSL': 9, 'PSY': 7, 'RPA': 6, 'KEG...","{'YLV': 11, 'SYLL': 8, 'PSY': 7, 'FQLL': 7, 'R...","{'SSL': 9, 'SLL': 6, 'SKL': 5, 'SVL': 4, 'ENL'...","{'YLV': 13, 'RKK': 10, 'SSL': 9, 'RAA': 7, 'TK...","{'YLV': 13, 'RKK': 10, 'RAA': 7, 'TKD': 7, 'QG...","{'SSL': 9, 'FLL': 7, 'LLG': 6, 'AAL': 6, 'SSG'...","{'YLV': 14, 'SSL': 9, 'FLL': 7, 'FPL': 7, 'RPA...","{'YLV': 11, 'SSL': 9, 'SYLL': 8, 'KFG': 8, 'FL..."
2,IPR000008,501,{},{},{},"{'EEL': 157, 'SGL': 139, 'SFL': 125, 'LLG': 12...","{'PYL': 161, 'EEL': 151, 'SGL': 139, 'SFL': 12...","{'PYL': 161, 'EEL': 139, 'SGL': 127, 'LLG': 12...","{'FLL': 109, 'EEL': 106, 'SVL': 102, 'SEL': 95...","{'PYL': 158, 'FLG': 127, 'FLL': 109, 'SVL': 10...","{'PYL': 158, 'FLG': 127, 'FLL': 104, 'SVL': 98...","{'EEL': 156, 'SGL': 141, 'SLG': 128, 'LLG': 12...","{'EEL': 155, 'SGL': 141, 'SLG': 128, 'LLG': 12...","{'EEL': 138, 'SLG': 128, 'SGL': 125, 'LLG': 12..."
3,IPR000010,30,{},{},{},"{'EEL': 12, 'SLL': 8, 'SGL': 7, 'KKL': 7, 'AAL...","{'EEL': 12, 'AGV': 8, 'SLL': 8, 'SGL': 7, 'EVV...","{'EEL': 9, 'AGV': 8, 'SGL': 7, 'EVV': 7, 'SLL'...","{'EEL': 8, 'SLL': 8, 'SGL': 7, 'PSS': 7, 'SEV'...","{'EEL': 8, 'SLL': 8, 'SGL': 7, 'CSR': 7, 'PSS'...","{'EEL': 8, 'SGL': 7, 'CSR': 7, 'SEV': 6, 'RSQ'...","{'EEL': 12, 'SGL': 7, 'SLL': 7, 'KKL': 7, 'PSL...","{'EEL': 12, 'QVL': 8, 'YFL': 8, 'FSL': 8, 'SGL...","{'EEL': 9, 'QVL': 8, 'YFL': 8, 'FSL': 8, 'SGL'..."
4,IPR000014,110,{},{},{},"{'SFL': 58, 'EEL': 39, 'ELL': 39, 'SSG': 33, '...","{'SFL': 58, 'ELL': 39, 'EEL': 35, 'SSG': 33, '...","{'SFL': 58, 'SSG': 31, 'SHL': 29, 'ELL': 28, '...","{'ELL': 34, 'SFL': 33, 'SLL': 29, 'SEV': 29, '...","{'ELL': 34, 'SFL': 33, 'SEV': 29, 'SLL': 27, '...","{'SFL': 33, 'ELL': 31, 'SEV': 29, 'FVV': 24, '...","{'SFL': 62, 'EEL': 43, 'ELL': 39, 'SHL': 24, '...","{'SFL': 62, 'EEL': 43, 'ELL': 39, 'SHL': 24, '...","{'SFL': 62, 'EEL': 26, 'SHL': 24, 'LLG': 24, '..."
