In [2]:
import pandas as pd
import numpy as np
import sqlite3
from tqdm import tqdm
import pickle
from pandarallel import pandarallel
from time import time
from tokenizers import Tokenizer
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy.stats import linregress
from goatools.obo_parser import GODag
from goatools.mapslim import mapslim
import requests
import os
import re
from vocabulary_functions import get_mutated, get_parents, set_difference, set_intersection, load_tokenizers, calc_agreement, calc_dice_idx_only

In [3]:
pandarallel.initialize(nb_workers=20, progress_bar=True)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# Download GO DAG (ontology structure)
obo_url = "http://purl.obolibrary.org/obo/go/go-basic.obo"
obo_path = "go-basic.obo"

# Download OBO file if not already downloaded
if not os.path.exists(obo_path):
    with open(obo_path, "w") as f:
        f.write(requests.get(obo_url).text)

# Parse GO DAG
go_dag = GODag(obo_path)

# Download GO Slim generic terms
goslim_url = "http://current.geneontology.org/ontology/subsets/goslim_generic.obo"
goslim_path = "goslim_generic.obo"

if not os.path.exists(goslim_path):
    with open(goslim_path, "w") as f:
        f.write(requests.get(goslim_url).text)

# Parse GO Slim DAG
goslim_dag = GODag(goslim_path)
goslim_terms = set(goslim_dag.keys())

go-basic.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms
goslim_generic.obo: fmt(1.2) rel(go/2025-03-16/subsets/goslim_generic.owl) 206 Terms


In [5]:
# 'dataset': {'uniref50', 'uniref90'}
# 'is_pretokenizer': {True, False}
# 'subs_matrix': {'blosum45', 'blosum62', 'pam70', 'pam250'}
# 'mutation_cutoff': {0.7, 0.8, 0.9}
# 'min_mutation_freq': {0, 0.05,. 0.005}
# 'min_mutation_len': {3}
# 'max_mutation_len': {12}
# 'vocab_size': list=[800, 1600, 3200, 6400, 12800, 25600, 51200]

vocab_sizes = [800, 1600, 3200, 6400, 12800, 25600]
vocab_sizes = [1600, 6400, 25600]
uniref_id = "50"

tokenizer_opts_list = [
    {
        'is_mut': True,
        'dataset': f'uniref{uniref_id}',
        'is_pretokenizer': False,
        'subs_matrix': 'blosum62',
        'mutation_cutoff': 0.7,
        'min_mutation_freq': 0.05,
        'min_mutation_len': 3,
        'max_mutation_len': 12,
        'vocab_size': vocab_sizes
    },
    # {
    #     'is_mut': True,
    #     'dataset': f'uniref{uniref_id}',
    #     'is_pretokenizer': False,
    #     'subs_matrix': 'pam70',
    #     'mutation_cutoff': 0.7,
    #     'min_mutation_freq': 0.05,
    #     'min_mutation_len': 3,
    #     'max_mutation_len': 12,
    #     'vocab_size': vocab_sizes
    # },
    # {
    #     'is_mut': True,
    #     'dataset': f'uniref{uniref_id}',
    #     'is_pretokenizer': True,
    #     'subs_matrix': 'blosum62',
    #     'mutation_cutoff': 0.7,
    #     'min_mutation_freq': 0.05,
    #     'min_mutation_len': 3,
    #     'max_mutation_len': 12,
    #     'vocab_size': vocab_sizes
    # },
    {
        'is_mut': True,
        'dataset': f'uniref{uniref_id}',
        'is_pretokenizer': True,
        'subs_matrix': 'pam70',
        'mutation_cutoff': 0.7,
        'min_mutation_freq': 0.05,
        'min_mutation_len': 3,
        'max_mutation_len': 12,
        'vocab_size': vocab_sizes
    },
]

In [6]:
tokenizer_list = load_tokenizers(tokenizer_opts_list, 'hf')
inner_vocab_list = load_tokenizers(tokenizer_opts_list, 'vocab')

vocab_list = {}
for name, tokenizer in tokenizer_list.items():
    vocab_list[name] = list(set([token for token, idx in tokenizer.get_vocab().items()]))

In [7]:
methods = [method_name[:-len(str(vocab_sizes[0]))-1] for method_name in list(tokenizer_list.keys())[::len(vocab_sizes)]]
methods2names = {mn:mn.replace('mut', 'evo').replace('std', '').replace('blosum', 'BLOSUM').replace('pam', 'PAM').replace('pre', 'Pre') for mn in methods}
methods2names = {k: ' '.join(v.split()[:-2]) if 'evoBPE' in v else v for k, v in methods2names.items()}
methods2names

{'mutBPE blosum62 0.7 0.05': 'evoBPE BLOSUM62',
 'mutBPE pre pam70 0.7 0.05': 'evoBPE Pre PAM70'}

In [8]:
vocab_lineage_list = {}
for k, v in inner_vocab_list.items():
    template_dict = {
        
    }
    vocab_lineage_list[k] = {token:{
                                'frequency': -1,
                                'order': -1,
                                'parent_pair': [],
                                'parent_mutation': "",
                                'parent_mutation_similarity': -1,
                                'partner_pair_self': False,
                                'partner_pair_left': [],
                                'partner_pair_right': [],
                                'child_pair': [],
                                'child_mutation': []
                            } for token in v.keys()}

for method_name, vocab in tqdm(inner_vocab_list.items()):
    for token, inner_vocab_elements in vocab.items():
        vocab_lineage_list[method_name][token]['frequency'] = inner_vocab_elements['frequency']
        vocab_lineage_list[method_name][token]['order'] = inner_vocab_elements['order']
        vocab_lineage_list[method_name][token]['parent_pair'] = inner_vocab_elements['pair'] if 'pair' in inner_vocab_elements else []
        vocab_lineage_list[method_name][token]['parent_mutation'] = inner_vocab_elements['parent'] if 'parent' in inner_vocab_elements else ""
        vocab_lineage_list[method_name][token]['parent_mutation_similarity'] = inner_vocab_elements['similarity'] if 'similarity' in inner_vocab_elements else -1

        if 'pair' in inner_vocab_elements:
            if inner_vocab_elements['pair'][0] == inner_vocab_elements['pair'][1]:
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][0]]['partner_pair_self'] = True
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][0]]['child_pair'].append(token)
            else:
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][0]]['partner_pair_right'].append(inner_vocab_elements['pair'][1])
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][1]]['partner_pair_left'].append(inner_vocab_elements['pair'][0])
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][0]]['child_pair'].append(token)
                vocab_lineage_list[method_name][inner_vocab_elements['pair'][1]]['child_pair'].append(token)
        if 'parent' in inner_vocab_elements:
                vocab_lineage_list[method_name][inner_vocab_elements['parent']]['child_mutation'].append(token)

100%|██████████| 6/6 [00:00<00:00, 61.08it/s]


In [10]:
vocab_lineage_list['mutBPE blosum62 0.7 0.05 1600']['GII']

{'frequency': 292,
 'order': 974,
 'parent_pair': ['G', 'II'],
 'parent_mutation': 'GLV',
 'parent_mutation_similarity': 0.7857142857142857,
 'partner_pair_self': False,
 'partner_pair_left': [],
 'partner_pair_right': [],
 'child_pair': [],
 'child_mutation': []}

In [11]:
# Connect to DB
db_file = "/cta/share/users/uniprot/human/human.db"
conn = sqlite3.connect(db_file)

df_protein = pd.read_sql(f"""SELECT Entry as uniprot_id, Sequence as sequence, "Gene Ontology (GO)" as go_all,
                          "Gene Ontology (biological process)" as go_bp, "Gene Ontology (cellular component)" as go_cc, "Gene Ontology (molecular function)" as go_mf
                          FROM proteins
                          WHERE Entry IN (SELECT uniprot_accession FROM uniref{uniref_id}_distilled)""", conn)
df_protein = df_protein[df_protein['sequence'].str.len() < 3000].reset_index(drop=True)

df_protein_sliced = pd.read_sql(f"SELECT uniprot_id, sequence FROM uniref{uniref_id}_domain_sliced_plddt70", conn)
df_protein_sliced = df_protein_sliced[df_protein_sliced['uniprot_id'].isin(df_protein['uniprot_id'])].reset_index(drop=True)

conn.close()

In [12]:
df_protein = df_protein.fillna('')
df_protein['go_ids'] = df_protein['go_all'] + df_protein['go_bp'] + df_protein['go_cc'] + df_protein['go_mf']
df_protein['go_ids'] = df_protein['go_ids'].apply(lambda x: list(set(re.findall(r'GO:\d+', (x)))))
df_protein

Unnamed: 0,uniprot_id,sequence,go_all,go_bp,go_cc,go_mf,go_ids
0,A0A087WZT3,MELSAEYLREKLQRDLEAEHVLPSPGGVGQVRGETAASETQLGS,,,,,[]
1,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,cytoplasm [GO:0005737]; intracellular membrane...,arachidonate metabolic process [GO:0019369]; x...,cytoplasm [GO:0005737]; intracellular membrane...,aromatase activity [GO:0070330]; heme binding ...,"[GO:0020037, GO:0043231, GO:0005737, GO:001671..."
2,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,endoplasmic reticulum membrane [GO:0005789]; G...,prostaglandin biosynthetic process [GO:0001516...,endoplasmic reticulum membrane [GO:0005789]; G...,dioxygenase activity [GO:0051213]; heme bindin...,"[GO:0001516, GO:0020037, GO:0043231, GO:000466..."
3,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,mitochondrial outer membrane [GO:0005741]; mit...,regulation of endoplasmic reticulum unfolded p...,mitochondrial outer membrane [GO:0005741]; mit...,,"[GO:0006986, GO:1900101, GO:0005741, GO:0005739]"
4,A0A0C5B5G6,MRWQEMGYIFYPRKLR,extracellular space [GO:0005615]; mitochondrio...,activation of protein kinase activity [GO:0032...,extracellular space [GO:0005615]; mitochondrio...,DNA binding [GO:0003677]; DNA-binding transcri...,"[GO:0005615, GO:0032147, GO:0003677, GO:007252..."
...,...,...,...,...,...,...,...
70687,X6RL83,MLQEWLAAVGDDYAAVVWRPEGEPRFYPDEEGPKHWTKERHQFLME...,,,,,[]
70688,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...,cytosol [GO:0005829],,cytosol [GO:0005829],,[GO:0005829]
70689,X6RLR1,MAGLTDLQRLQARVEELERWVYGPGGARGSRKVADGLVKVQVALGN...,cytosol [GO:0005829]; dynactin complex [GO:000...,cytoskeleton-dependent cytokinesis [GO:0061640],cytosol [GO:0005829]; dynactin complex [GO:000...,,"[GO:0005829, GO:0005730, GO:0005869, GO:0061640]"
70690,X6RLV5,MSGYSSDRDRGRDRGFGAPRFGGSRAGPLSGKKFGNPGEKLVKKKW...,,,,,[]


In [13]:
# Map full GO terms to GO slim terms
def map_to_goslim(go_terms):
    slim_mapping = {}
    slim_list = []
    for go_id in go_terms:
        if go_id in go_dag:
            slim_terms = mapslim(go_id, go_dag, goslim_dag)
            slim_mapping[go_id] = list(slim_terms[0])
            slim_list.extend(list(slim_terms[0]))
    return list(set(slim_list))#, slim_mapping

# map_to_goslim(['GO:0005764', 'GO:0016020']) --> ['GO:0005764']

df_protein['go_slim_ids'] = df_protein['go_ids'].apply(map_to_goslim)
df_protein = df_protein.drop(columns=['go_all', 'go_bp', 'go_cc', 'go_mf', 'go_ids'])
df_protein = df_protein[df_protein['go_slim_ids'].str.len() > 0].reset_index(drop=True)
df_protein

Unnamed: 0,uniprot_id,sequence,go_slim_ids
0,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"[GO:0006629, GO:0043226, GO:0005739, GO:0016491]"
1,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,"[GO:0016491, GO:0043226, GO:0006629, GO:000579..."
2,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,[GO:0005739]
3,A0A0C5B5G6,MRWQEMGYIFYPRKLR,"[GO:0005615, GO:0003677, GO:0030154, GO:000573..."
4,A0A0K2S4Q6,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,"[GO:0005576, GO:0005886, GO:0048870, GO:0002376]"
...,...,...,...
43520,X6RJD6,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPAGLWTWLRNSLRPSP...,"[GO:0016209, GO:0016491]"
43521,X6RK96,TSVNYLDSAFRNIRNLGIVSVTSTDISSLYAKAQHVARRHYGCNIV...,"[GO:0140098, GO:0016740, GO:0003723, GO:0006399]"
43522,X6RL45,MVRCYVEIVEKLPERRPDPATIEGCAQLKPNNYLLAWHTPFNEKGS...,[GO:0016787]
43523,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...,[GO:0005829]


In [14]:
df_protein['go_slim_names'] = df_protein['go_slim_ids'].apply(lambda lst: [go_dag[go_id].name for go_id in lst])
df_protein['go_slim_types'] = df_protein['go_slim_ids'].apply(lambda lst: [go_dag[go_id].namespace for go_id in lst])
df_protein

Unnamed: 0,uniprot_id,sequence,go_slim_ids,go_slim_names,go_slim_types
0,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"[GO:0006629, GO:0043226, GO:0005739, GO:0016491]","[lipid metabolic process, organelle, mitochond...","[biological_process, cellular_component, cellu..."
1,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,"[GO:0016491, GO:0043226, GO:0006629, GO:000579...","[oxidoreductase activity, organelle, lipid met...","[molecular_function, cellular_component, biolo..."
2,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,[GO:0005739],[mitochondrion],[cellular_component]
3,A0A0C5B5G6,MRWQEMGYIFYPRKLR,"[GO:0005615, GO:0003677, GO:0030154, GO:000573...","[extracellular space, DNA binding, cell differ...","[cellular_component, molecular_function, biolo..."
4,A0A0K2S4Q6,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,"[GO:0005576, GO:0005886, GO:0048870, GO:0002376]","[extracellular region, plasma membrane, cell m...","[cellular_component, cellular_component, biolo..."
...,...,...,...,...,...
43520,X6RJD6,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPAGLWTWLRNSLRPSP...,"[GO:0016209, GO:0016491]","[antioxidant activity, oxidoreductase activity]","[molecular_function, molecular_function]"
43521,X6RK96,TSVNYLDSAFRNIRNLGIVSVTSTDISSLYAKAQHVARRHYGCNIV...,"[GO:0140098, GO:0016740, GO:0003723, GO:0006399]","[catalytic activity, acting on RNA, transferas...","[molecular_function, molecular_function, molec..."
43522,X6RL45,MVRCYVEIVEKLPERRPDPATIEGCAQLKPNNYLLAWHTPFNEKGS...,[GO:0016787],[hydrolase activity],[molecular_function]
43523,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...,[GO:0005829],[cytosol],[cellular_component]


In [15]:
df_protein_sliced = df_protein_sliced[df_protein_sliced['uniprot_id'].isin(df_protein['uniprot_id'])].reset_index(drop=True)
df_protein_sliced

Unnamed: 0,uniprot_id,sequence
0,A8CLL2,VMDNPLVMHQLRCNGVLEGIRICRKGFPNRILYGDFRQ
1,Q68DN1,MELTPGAQQQGINYQELTSGWQDVKSMMLVPEPTRKFPSGPLLTSV...
2,Q156A1,MQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ...
3,A0A410SEU7,TELTFNYNLECLGNGKTVCK
4,A0A994J774,MATAPYNYSYIFKYIIIGDMGVGKSCLLHQFTEKK
...,...,...
181075,A0A0A0N0L1,M
181076,A0A0A0N0L1,AEKQKHDGRVKIGHYVLGDTLGVGTFGKVKNTR
181077,Q16330,M
181078,Q16330,LNSPTICQSYVGQAIEPTPKKFSQCYIIH


In [16]:
all_gos = [go for gos in df_protein['go_slim_types'] for go in gos]
len(Counter(all_gos)), len(all_gos), Counter(all_gos)

(3,
 151058,
 Counter({'cellular_component': 52485,
          'molecular_function': 51394,
          'biological_process': 47179}))

In [17]:
all_gos = [go for gos in df_protein['go_slim_names'] for go in gos]
len(Counter(all_gos)), len(all_gos), Counter(all_gos)

(132,
 151058,
 Counter({'nucleus': 9177,
          'plasma membrane': 7137,
          'cytosol': 6595,
          'transferase activity': 5938,
          'catalytic activity, acting on a protein': 5616,
          'nucleoplasm': 5291,
          'hydrolase activity': 5147,
          'organelle': 4776,
          'regulation of DNA-templated transcription': 4393,
          'DNA binding': 4354,
          'anatomical structure development': 3420,
          'extracellular region': 3228,
          'transporter activity': 2866,
          'molecular function regulator activity': 2836,
          'RNA binding': 2821,
          'immune system process': 2691,
          'transcription regulator activity': 2687,
          'molecular transducer activity': 2315,
          'mitochondrion': 2314,
          'cell differentiation': 2303,
          'lipid metabolic process': 2062,
          'oxidoreductase activity': 1916,
          'cytoskeletal protein binding': 1904,
          'vesicle-mediated transport'

In [18]:
for name, tokenizer in tqdm(list(tokenizer_list.items())):
    if 'pre' in name:
        df_protein_sliced[name] = [enc.tokens for enc in tokenizer.encode_batch(df_protein_sliced['sequence'])]
    else:
        df_protein[name] = [enc.tokens for enc in tokenizer.encode_batch(df_protein['sequence'])]

100%|██████████| 6/6 [00:10<00:00,  1.69s/it]


In [19]:
df_protein_sliced = df_protein_sliced.groupby('uniprot_id').sum().reset_index()
df_protein = df_protein.set_index(['uniprot_id', 'sequence']).join(df_protein_sliced.set_index(['uniprot_id', 'sequence'])).reset_index()
df_protein

Unnamed: 0,uniprot_id,sequence,go_slim_ids,go_slim_names,go_slim_types,mutBPE blosum62 0.7 0.05 1600,mutBPE blosum62 0.7 0.05 6400,mutBPE blosum62 0.7 0.05 25600,mutBPE pre pam70 0.7 0.05 1600,mutBPE pre pam70 0.7 0.05 6400,mutBPE pre pam70 0.7 0.05 25600
0,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,"[GO:0006629, GO:0043226, GO:0005739, GO:0016491]","[lipid metabolic process, organelle, mitochond...","[biological_process, cellular_component, cellu...","[M, GL, EAL, V, PL, AMI, VA, IF, LLLV, DLM, H,...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DLM, HRH, Q...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DLM, HRH, Q...","[MGL, EAL, V, PL, AMI, VA, IF, LLLV, DL, MH, R...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DL, MH, RH,...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DL, MH, RH,..."
1,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,"[GO:0016491, GO:0043226, GO:0006629, GO:000579...","[oxidoreductase activity, organelle, lipid met...","[molecular_function, cellular_component, biolo...","[M, SR, S, LLL, W, FLL, FLL, LL, PPL, PV, LL, ...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, ...","[MSR, SLLL, WFLL, FLLLL, PPL, PVLL, AD, PGA, P...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PV, LL, AD...","[MSR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, PG...","[MSR, SLLL, WFLL, FLL, LL, PPL, PVLL, AD, PGA,..."
2,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,[GO:0005739],[mitochondrion],[cellular_component],"[M, FR, RL, T, FA, QLL, FA, T, VLG, IA, GGV, Y...","[M, FR, RL, TFA, QLL, FA, T, VLG, IA, GGV, YI,...","[M, FRRL, TFA, QLL, FAT, VLG, IA, GGV, YI, FQ,...","[M, FR, RL, T, FA, QLL, FA, TVL, G, IA, GGV, Y...","[M, FR, RL, TFA, QLL, FA, TVL, GIA, GGV, YI, F...","[M, FRRL, TFA, QLL, FA, TVL, GIA, GGV, YI, FQ,..."
3,A0A0C5B5G6,MRWQEMGYIFYPRKLR,"[GO:0005615, GO:0003677, GO:0030154, GO:000573...","[extracellular space, DNA binding, cell differ...","[cellular_component, molecular_function, biolo...","[M, RW, QE, MG, YI, FY, PR, KL, R]","[M, RW, QE, MG, YI, FY, PR, KL, R]","[MRW, QEMG, YI, FY, PRKL, R]","[M, RW, QE, MG, YI, FY, PR, KL, R]","[M, RW, QE, MG, YI, FY, PR, KL, R]","[M, RW, QE, MG, YI, FY, PRKL, R]"
4,A0A0K2S4Q6,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,"[GO:0005576, GO:0005886, GO:0048870, GO:0002376]","[extracellular region, plasma membrane, cell m...","[cellular_component, cellular_component, biolo...","[M, TQ, R, AGAA, ML, PSA, LLLL, CV, PG, CL, TV...","[M, TQR, AGAA, ML, PSA, LLLL, CV, PG, CL, TV, ...","[M, TQR, AGAA, ML, PSA, LLLL, CV, PGCL, TVSG, ...","[M, TQ, R, AGAA, ML, PSA, LLLL, CV, PG, CL, TV...","[M, TQR, AGAA, ML, PSA, LLLL, CV, PG, CL, TV, ...","[M, TQR, AGAA, ML, PSA, LLLL, CVPG, CL, TVSG, ..."
...,...,...,...,...,...,...,...,...,...,...,...
43520,X6RJD6,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPAGLWTWLRNSLRPSP...,"[GO:0016209, GO:0016491]","[antioxidant activity, oxidoreductase activity]","[molecular_function, molecular_function]","[M, SR, S, LLL, W, FLL, FLL, LL, PPL, PV, LL, ...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, ...","[MSR, SLLL, WFLL, FLLLL, PPL, PVLL, AD, PGA, P...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PV, LL, AD...","[MSR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, PG...","[MSR, SLLL, WFLL, FLL, LL, PPL, PVLL, AD, PGA,..."
43521,X6RK96,TSVNYLDSAFRNIRNLGIVSVTSTDISSLYAKAQHVARRHYGCNIV...,"[GO:0140098, GO:0016740, GO:0003723, GO:0006399]","[catalytic activity, acting on RNA, transferas...","[molecular_function, molecular_function, molec...","[TSV, N, YL, D, SA, FR, NI, R, NLG, IV, SV, TS...","[TSV, NYL, DSA, FR, NI, R, NLG, IV, SV, TST, D...","[TSV, NYL, DSA, FRNI, R, NLG, IVSV, TST, DI, S...","[TSV, N, YL, D, SA, FR, NI, R, NLG, IV, SV, TS...","[TSV, NYL, DSA, FR, NI, R, NLG, IV, SV, TST, D...","[TSV, NYL, DSA, FRNI, R, NLG, IV, SV, TST, DI,..."
43522,X6RL45,MVRCYVEIVEKLPERRPDPATIEGCAQLKPNNYLLAWHTPFNEKGS...,[GO:0016787],[hydrolase activity],[molecular_function],"[MV, RC, YV, EIV, EKL, PE, RR, PD, PA, TI, EG,...","[MV, RC, YV, EIV, EKL, PE, RR, PD, PA, TI, EG,...","[MV, RC, YV, EIV, EKL, PERR, PD, PATI, EG, CA,...","[MV, RC, YV, EIV, EKL, PE, RR, PD, PA, TI, EG,...","[MV, RC, YV, EIV, EKL, PE, RR, PD, PA, TI, EG,...","[MV, RC, YV, EIV, EKL, PERR, PDPA, TIEG, CA, Q..."
43523,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...,[GO:0005829],[cytosol],[cellular_component],"[EV, KGL, FK, SEN, C, P, KVI, SC, E, FA, HN, S...","[EV, KGL, FK, SEN, C, P, KVI, SC, EFA, HN, SN,...","[EV, KGL, FK, SEN, C, PKVI, SC, EFA, HN, SN, W...","[EV, KGL, FK, SEN, C, P, KVI, SC, E, FA, HN, S...","[EV, KGL, FK, SEN, C, P, KVI, SC, EFA, HN, SNW...","[EV, KGL, FK, SEN, CP, KVI, SC, EFA, HN, SNW, ..."


In [124]:
df_protein_go = df_protein.explode(['go_slim_ids','go_slim_names','go_slim_types']).reset_index(drop=True)
df_protein_go = df_protein_go[df_protein_go['go_slim_types'] == 'cellular_component'].reset_index(drop=True)
df_protein_go

Unnamed: 0,uniprot_id,sequence,go_slim_ids,go_slim_names,go_slim_types,mutBPE blosum62 0.7 0.05 1600,mutBPE blosum62 0.7 0.05 6400,mutBPE blosum62 0.7 0.05 25600,mutBPE pre pam70 0.7 0.05 1600,mutBPE pre pam70 0.7 0.05 6400,mutBPE pre pam70 0.7 0.05 25600
0,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,GO:0043226,organelle,cellular_component,"[M, GL, EAL, V, PL, AMI, VA, IF, LLLV, DLM, H,...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DLM, HRH, Q...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DLM, HRH, Q...","[MGL, EAL, V, PL, AMI, VA, IF, LLLV, DL, MH, R...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DL, MH, RH,...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DL, MH, RH,..."
1,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,GO:0005739,mitochondrion,cellular_component,"[M, GL, EAL, V, PL, AMI, VA, IF, LLLV, DLM, H,...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DLM, HRH, Q...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DLM, HRH, Q...","[MGL, EAL, V, PL, AMI, VA, IF, LLLV, DL, MH, R...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DL, MH, RH,...","[MGL, EALV, PL, AMI, VA, IF, LLLV, DL, MH, RH,..."
2,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,GO:0043226,organelle,cellular_component,"[M, SR, S, LLL, W, FLL, FLL, LL, PPL, PV, LL, ...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, ...","[MSR, SLLL, WFLL, FLLLL, PPL, PVLL, AD, PGA, P...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PV, LL, AD...","[MSR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, PG...","[MSR, SLLL, WFLL, FLL, LL, PPL, PVLL, AD, PGA,..."
3,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,GO:0005794,Golgi apparatus,cellular_component,"[M, SR, S, LLL, W, FLL, FLL, LL, PPL, PV, LL, ...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, ...","[MSR, SLLL, WFLL, FLLLL, PPL, PVLL, AD, PGA, P...","[M, SR, SLLL, W, FLL, FLL, LL, PPL, PV, LL, AD...","[MSR, SLLL, W, FLL, FLL, LL, PPL, PVLL, AD, PG...","[MSR, SLLL, WFLL, FLL, LL, PPL, PVLL, AD, PGA,..."
4,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,GO:0005739,mitochondrion,cellular_component,"[M, FR, RL, T, FA, QLL, FA, T, VLG, IA, GGV, Y...","[M, FR, RL, TFA, QLL, FA, T, VLG, IA, GGV, YI,...","[M, FRRL, TFA, QLL, FAT, VLG, IA, GGV, YI, FQ,...","[M, FR, RL, T, FA, QLL, FA, TVL, G, IA, GGV, Y...","[M, FR, RL, TFA, QLL, FA, TVL, GIA, GGV, YI, F...","[M, FRRL, TFA, QLL, FA, TVL, GIA, GGV, YI, FQ,..."
...,...,...,...,...,...,...,...,...,...,...,...
52480,X6RC15,VLLKHQASINELKRTLKEPNSKLIHRDRDWERERRLPSSPASPSPK...,GO:0005856,cytoskeleton,cellular_component,"[VLL, KH, QA, SI, NEL, KR, TL, KE, PN, SKL, IH...","[VLL, KH, QA, SI, NEL, KR, TL, KE, PN, SKL, IH...","[VLL, KH, QASI, NEL, KRTL, KE, PN, SKL, IH, RD...","[V, LL, KH, QA, SI, NEL, KR, TL, KE, PN, SKL, ...","[V, LL, KH, QA, SI, NEL, KR, TL, KE, PN, SKL, ...","[V, LL, KH, QASI, NEL, KRTL, KE, PN, SKL, IH, ..."
52481,X6RH80,MEKRTCALCPKDVEYNVLYFAQSENIAAHENCLLYSSGLVECEDQD...,GO:0005654,nucleoplasm,cellular_component,"[M, EK, RT, C, AL, C, PK, DV, EY, NVL, Y, FA, ...","[MEK, RT, CAL, C, PK, DV, EY, NVL, Y, FA, Q, S...","[MEK, RT, CAL, C, PKDV, EY, NVL, YFA, Q, SENI,...","[ME, K, RT, CAL, C, PK, DV, EY, NVL, Y, FAQ, S...","[ME, KRT, CAL, C, PK, DV, EY, NVL, Y, FAQ, SEN...","[ME, KRT, CAL, C, PKDV, EY, NVL, Y, FAQ, SENI,..."
52482,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...,GO:0005829,cytosol,cellular_component,"[EV, KGL, FK, SEN, C, P, KVI, SC, E, FA, HN, S...","[EV, KGL, FK, SEN, C, P, KVI, SC, EFA, HN, SN,...","[EV, KGL, FK, SEN, C, PKVI, SC, EFA, HN, SN, W...","[EV, KGL, FK, SEN, C, P, KVI, SC, E, FA, HN, S...","[EV, KGL, FK, SEN, C, P, KVI, SC, EFA, HN, SNW...","[EV, KGL, FK, SEN, CP, KVI, SC, EFA, HN, SNW, ..."
52483,X6RLR1,MAGLTDLQRLQARVEELERWVYGPGGARGSRKVADGLVKVQVALGN...,GO:0005829,cytosol,cellular_component,"[MA, GLT, DL, QRL, QA, RV, EEL, ER, WV, YG, PG...","[MA, GLT, DL, QRL, QA, RV, EEL, ER, WV, YG, PG...","[MA, GLT, DL, QRL, QARV, EEL, ER, WV, YG, PGG,...","[MA, GL, TDL, QRL, QA, RV, EEL, ER, WV, YG, PG...","[MA, GL, TDL, QRL, QA, RV, EEL, ER, WV, YG, PG...","[MAGL, TDL, QRL, QARV, EEL, ER, WV, YG, PGG, A..."


In [127]:
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction

import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, LabelBinarizer
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
import matplotlib.pyplot as plt
from collections import Counter
from typing import List, Dict, Tuple, Set, Any
import re

In [83]:
# Protein unitlerini "doküman" formatına dönüştürme
def create_unit_documents(df: pd.DataFrame, tokenizer_col: str) -> List[str]:
    """Her protein için tokenizer çıktılarını boşluklarla ayrılmış doküman formatına dönüştürür"""
    return df[tokenizer_col].apply(lambda units: ' '.join(units)).tolist()

# GO terimlerini etiketlere dönüştürme
def create_go_labels(df: pd.DataFrame, go_col: str = 'go_slim_ids') -> List[List[str]]:
    """Her protein için GO terimlerini liste formatında döndürür"""
    return df[go_col].tolist()

In [235]:
# BERTopic modeli oluşturma
def create_bertopic_model(documents: List[str], go_labels: List[str]) -> Tuple[BERTopic, np.ndarray]:
    """Manuel topic modelleme için BERTopic modeli oluşturur"""
    
    # GO terimleri için one-hot encoding
    lb = LabelBinarizer()
    go_binary = lb.fit_transform(go_labels)
    
    # CountVectorizer ayarları - protein unit'ler için
    vectorizer_model = CountVectorizer(
        lowercase=False,
        token_pattern=r"(?u)\b\w\w\w\w+\b",
        stop_words=None,  # Protein unit'lerde stop word kullanmıyoruz
        ngram_range=(1, 2),  # Tek ve çift protein unit'leri de değerlendirelim
        min_df=5,  # En az 5 proteinde görünen unit'leri alalım
        max_df=0.7,  # Proteinlerin %70'inden fazlasında görünenleri çıkaralım
    )
    
    # Protein unit'lerin önemi için özel TF-IDF
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    
    # Terimlerin temsil edilmesi için model
    representation_model = KeyBERTInspired()
    
    empty_embedding_model = BaseEmbedder()
    empty_dimensionality_model = BaseDimensionalityReduction()
    empty_cluster_model = BaseCluster()

    # BERTopic model oluşturma - embedding modeli belirtmiyoruz (manuel topic modelleme)
    topic_model = BERTopic(
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        embedding_model=empty_embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        # representation_model=representation_model,
        # min_topic_size=10,  # En az 10 protein içeren topic'leri kabul edelim
        calculate_probabilities=True,
        verbose=True
    )
    
    # Topic'leri manuel olarak GO terimleriyle eşleştir
    topics = np.argmax(go_binary, axis=1)
    
    # Eğer bir protein için hiçbir GO terimi yoksa, -1 (outlier) olarak işaretle
    topics[np.sum(go_binary, axis=1) == 0] = -1
    
    # Topic isimlerini go_slim_ids değerleriyle eşleştir
    topic_labels = {i: lb.classes_[i] for i in range(len(lb.classes_))}
    topic_labels[-1] = "Outlier"  # Outlier topic'i için etiket
    
    # Belgeleri ve topic'leri kullanarak modeli eğit
    topic_model.fit_transform(documents, y=topics)
    topic_model.set_topic_labels(topic_labels)
    
    return topic_model, topics

In [342]:
# Ana çalıştırma fonksiyonu
def main(df: pd.DataFrame, tokenizer_col: str = 'tokenizer1', go_col: str = 'go_slim_ids'):
    """Protein unit'lerinin GO terimleri ile ilişkisini analiz eden ana fonksiyon"""
    
    print(f"Tokenizer kullanılıyor: {tokenizer_col}")
    documents = create_unit_documents(df, tokenizer_col)
    go_labels = create_go_labels(df, go_col)
    
    print("BERTopic modeli oluşturuluyor...")
    topic_model, topics = create_bertopic_model(documents, go_labels)
    
    # print("Protein unit'leri analiz ediliyor...")
    # unit_topic_df = analyze_protein_units(topic_model, df, tokenizer_col)
    
    # print("Protein unit family'leri analiz ediliyor...")
    # family_topic_df = analyze_unit_families(df, unit_topic_df, tokenizer_col)
    
    # print("Sonuçlar görselleştiriliyor...")
    # visualize_results(topic_model, unit_topic_df, family_topic_df)
    
    # # Sonuçları kaydet
    # topic_model.save("protein_unit_topic_model")
    # unit_topic_df.to_csv("protein_unit_topic_analysis.csv", index=False)
    # family_topic_df.to_csv("protein_unit_family_analysis.csv", index=False)
    
    # print("Analiz tamamlandı. Sonuçlar dosyalara kaydedildi.")
    
    return topic_model #, unit_topic_df, family_topic_df

In [343]:
# İki farklı tokenizer için analiz yapılabilir
# Tokenizer 1 kullanarak analiz
# topic_model1, unit_df1, family_df1 = main(df_protein_go, 'mutBPE blosum62 0.7 0.05 6400')
topic_model1 = main(df_protein_go, 'mutBPE blosum62 0.7 0.05 6400', 'go_slim_names')

Tokenizer kullanılıyor: mutBPE blosum62 0.7 0.05 6400
BERTopic modeli oluşturuluyor...


2025-04-28 22:43:35,292 - BERTopic - Embedding - Transforming documents to embeddings.
2025-04-28 22:43:35,294 - BERTopic - Embedding - Completed ✓
2025-04-28 22:43:35,294 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-28 22:43:35,295 - BERTopic - Dimensionality - Completed ✓
2025-04-28 22:43:35,295 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-28 22:43:35,298 - BERTopic - Cluster - Completed ✓
2025-04-28 22:43:35,304 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-28 22:43:36,349 - BERTopic - Representation - Completed ✓


In [344]:
topic_model1.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,0,9177,0_CGKAF_CGKA_HTGEKPYEC_TGEKPY,Golgi apparatus,"[CGKAF, CGKA, HTGEKPYEC, TGEKPY, HTGEKPYKC, HQ...",[MAL TQG PL TF RDV AI EF SQ EEW KSL DPV QK AL ...
1,1,7137,1_TPML_PKML_NGAG_NPIL,chromosome,"[TPML, PKML, NGAG, NPIL, SFRL, TRSR, AIAL, SLA...",[MD P SGV KVL ETA ED IQ ERR QQVL DRY HRF KEL S...
2,2,6595,2_HRDI_QLLL KELL_EKEQ_GGGG GGGG,cilium,"[HRDI, QLLL KELL, EKEQ, GGGG GGGG, KLLG, NRRI,...",[MSG TSS H ESF YD SL SDM QEE SKN TD FF PGL SA ...
3,3,5291,3_SRSR SRSR_GGGG GGGG_QQQQQQQQ QQQQQQQQ_ERER ERER,cytoplasmic vesicle,"[SRSR SRSR, GGGG GGGG, QQQQQQQQ QQQQQQQQ, ERER...",[MA SNSS SC PTP GGG HL NGY PVPP YA FF F PPML G...
4,4,4776,4_EADI_QELM_GGGG GGGG_EQEE EQEE,cytoskeleton,"[EADI, QELM, GGGG GGGG, EQEE EQEE, HRDI, SVIG,...",[M HV SLA EAL EV RGG PL QEE EI WA VL NQ SA ESL...
5,5,3228,5_GLPG GLPG_DSSDSS_GPPG GLPG_GLPG GPPG,cytosol,"[GLPG GLPG, DSSDSS, GPPG GLPG, GLPG GPPG, TTTT...",[MA VL PGPL QLLG VLL TI SL SSI RL IQ AG AY YGI...
6,6,2314,6_VAGG_PNAG_SDAA_SLAA SIIL,endoplasmic reticulum,"[VAGG, PNAG, SDAA, SLAA SIIL, EEEQ ERKL, QEEL ...",[MN RI RI HVL PT NRG RI TPV PR SQE PL SC A FT ...
7,7,1740,7_GLPG GLPG_GPPG GLPG_GLPG GPPG_PPGP GLPG,endosome,"[GLPG GLPG, GPPG GLPG, GLPG GPPG, PPGP GLPG, G...",[MKL RGV SLAA GL FLL ALSL WG QPA EAAA C YG C S...
8,8,1598,8_NRRI_SIVG RGIL_TTTT EKRI_SELG ELHK,extracellular matrix,"[NRRI, SIVG RGIL, TTTT EKRI, SELG ELHK, RGIL P...",[MR RLI C KRI C DY KSF DDEE SV DGN RP SS AASA ...
9,9,1460,9_SPAL PPPG_PEIM PAII_PALF PEIM_KKRR PALF,extracellular region,"[SPAL PPPG, PEIM PAII, PALF PEIM, KKRR PALF, P...",[MD EPP FSE AAL EQ ALG EPC DL DAA LL TDI EDML ...


In [345]:
topic_model1.topic_representations_

{0: [('CGKAF', np.float64(0.3463076858481108)),
  ('CGKA', np.float64(0.3348191366438999)),
  ('HTGEKPYEC', np.float64(0.3224021807470337)),
  ('TGEKPY', np.float64(0.31916907925212845)),
  ('HTGEKPYKC', np.float64(0.31847171475906694)),
  ('HQRI', np.float64(0.31517321017923844)),
  ('CGKSF', np.float64(0.314599870589447)),
  ('HQRT', np.float64(0.29881260559962924)),
  ('HTGEKPY', np.float64(0.2811131761343044)),
  ('HTGEKPYVC', np.float64(0.27815581368252135))],
 1: [('TPML', np.float64(0.21903793480455916)),
  ('PKML', np.float64(0.19939388090986812)),
  ('NGAG', np.float64(0.17054125773736714)),
  ('NPIL', np.float64(0.16817532662125204)),
  ('SFRL', np.float64(0.16566632362134345)),
  ('TRSR', np.float64(0.16055364149553863)),
  ('AIAL', np.float64(0.15913161543911797)),
  ('SLAC', np.float64(0.15830331126842567)),
  ('HRDL PEAL', np.float64(0.15657113359510524)),
  ('SKAA', np.float64(0.15594768133745157))],
 2: [('HRDI', np.float64(0.16383644719093549)),
  ('QLLL KELL', np.floa

In [346]:
topic_model1.get_topic(20)

[('PPSA PPSA', np.float64(0.5799589434970324)),
 ('SSSQ NAAL', np.float64(0.3528571413982184)),
 ('NAAL EEEL', np.float64(0.3468734769015095)),
 ('EALL STAL', np.float64(0.33668074172799134)),
 ('EEEE KKII', np.float64(0.33668074172799134)),
 ('SLAG EERG', np.float64(0.2936462054311537)),
 ('KKKR KKKK', np.float64(0.2936462054311537)),
 ('QEEI EEEI', np.float64(0.2936462054311537)),
 ('RRAC', np.float64(0.29351721798730523)),
 ('EEEL QEEI', np.float64(0.28810664950757653))]

In [347]:
topic_model1.get_document_info(create_unit_documents(df_protein_go, 'mutBPE blosum62 0.7 0.05 6400'))

Unnamed: 0,Document,Topic,Name,CustomName,Representation,Representative_Docs,Top_n_words,Representative_document
0,MGL EALV PL AMI VA IF LLLV DLM HRH QR WAA RY P...,4,4_EADI_QELM_GGGG GGGG_EQEE EQEE,cytoskeleton,"[EADI, QELM, GGGG GGGG, EQEE EQEE, HRDI, SVIG,...",[M HV SLA EAL EV RGG PL QEE EI WA VL NQ SA ESL...,EADI - QELM - GGGG GGGG - EQEE EQEE - HRDI - S...,False
1,MGL EALV PL AMI VA IF LLLV DLM HRH QR WAA RY P...,6,6_VAGG_PNAG_SDAA_SLAA SIIL,endoplasmic reticulum,"[VAGG, PNAG, SDAA, SLAA SIIL, EEEQ ERKL, QEEL ...",[MN RI RI HVL PT NRG RI TPV PR SQE PL SC A FT ...,VAGG - PNAG - SDAA - SLAA SIIL - EEEQ ERKL - Q...,False
2,M SR SLLL W FLL FLL LL PPL PVLL AD PGA PTPV NP...,4,4_EADI_QELM_GGGG GGGG_EQEE EQEE,cytoskeleton,"[EADI, QELM, GGGG GGGG, EQEE EQEE, HRDI, SVIG,...",[M HV SLA EAL EV RGG PL QEE EI WA VL NQ SA ESL...,EADI - QELM - GGGG GGGG - EQEE EQEE - HRDI - S...,False
3,M SR SLLL W FLL FLL LL PPL PVLL AD PGA PTPV NP...,10,10_PPGM HRDL_SDLV PPGM_SSTA SDLV_EKAL ESKL,extracellular space,"[PPGM HRDL, SDLV PPGM, SSTA SDLV, EKAL ESKL, E...",[M PSS SDT AL GGGG GL SW A EKKL EERR KRR RFL S...,PPGM HRDL - SDLV PPGM - SSTA SDLV - EKAL ESKL ...,False
4,M FR RL TFA QLL FA T VLG IA GGV YI FQ PVF EQ Y...,6,6_VAGG_PNAG_SDAA_SLAA SIIL,endoplasmic reticulum,"[VAGG, PNAG, SDAA, SLAA SIIL, EEEQ ERKL, QEEL ...",[MN RI RI HVL PT NRG RI TPV PR SQE PL SC A FT ...,VAGG - PNAG - SDAA - SLAA SIIL - EEEQ ERKL - Q...,False
...,...,...,...,...,...,...,...,...
52480,VLL KH QA SI NEL KR TL KE PN SKL IH RD RDW ERE...,8,8_NRRI_SIVG RGIL_TTTT EKRI_SELG ELHK,extracellular matrix,"[NRRI, SIVG RGIL, TTTT EKRI, SELG ELHK, RGIL P...",[MR RLI C KRI C DY KSF DDEE SV DGN RP SS AASA ...,NRRI - SIVG RGIL - TTTT EKRI - SELG ELHK - RGI...,False
52481,MEK RT CAL C PK DV EY NVL Y FA Q S ENI AA HEN ...,3,3_SRSR SRSR_GGGG GGGG_QQQQQQQQ QQQQQQQQ_ERER ERER,cytoplasmic vesicle,"[SRSR SRSR, GGGG GGGG, QQQQQQQQ QQQQQQQQ, ERER...",[MA SNSS SC PTP GGG HL NGY PVPP YA FF F PPML G...,SRSR SRSR - GGGG GGGG - QQQQQQQQ QQQQQQQQ - ER...,False
52482,EV KGL FK SEN C P KVI SC EFA HN SN WY IT FQ SD...,2,2_HRDI_QLLL KELL_EKEQ_GGGG GGGG,cilium,"[HRDI, QLLL KELL, EKEQ, GGGG GGGG, KLLG, NRRI,...",[MSG TSS H ESF YD SL SDM QEE SKN TD FF PGL SA ...,HRDI - QLLL KELL - EKEQ - GGGG GGGG - KLLG - N...,False
52483,MA GLT DL QRL QA RV EEL ER WV YG PGG ARG SR KV...,2,2_HRDI_QLLL KELL_EKEQ_GGGG GGGG,cilium,"[HRDI, QLLL KELL, EKEQ, GGGG GGGG, KLLG, NRRI,...",[MSG TSS H ESF YD SL SDM QEE SKN TD FF PGL SA ...,HRDI - QLLL KELL - EKEQ - GGGG GGGG - KLLG - N...,False


In [348]:
print(topic_model1.get_topic_tree(topic_model1.hierarchical_topics(create_unit_documents(df_protein_go, 'mutBPE blosum62 0.7 0.05 6400')), tight_layout=True))

100%|██████████| 24/24 [00:00<00:00, 357.13it/s]

.
├─GLPG GLPG_GPPG GLPG_GLPG GPPG_GFPG_GLPG PPGP
│ ├─RPIL GLPA_HRDL PEAL_DDQD_ESVV_ESVV RPIL
│ │ ├─EAEA ERLL_SPEP EAEA_DDQD EAEA_DDQD_SQSS RKRI
│ │ │ ├─■──SPEP EAEA_EAEA ERLL_RKRI EDAL_PSDL RKRI_ERLL DEEL ── Topic: 23
│ │ │ └─DDQD EAEA_SQSS RKRI_NLRL EAAV_EAEA EAKA_EGIL NLRL
│ │ │   ├─■──DEKL EEEM_PPSG DEKL_SLLL DVSS_AAAV PPSG_RTVL SGEG ── Topic: 18
│ │ │   └─DDQD EAEA_EAEA EAKA_EAKA ELKK_RPIL GLPA_HRDL PEAL
│ │ │     ├─■──EAEA EAKA_SIIL EEEQ_EEEQ ERKL_SLAA SIIL_DDQD EAEA ── Topic: 14
│ │ │     └─■──RPIL GLPA_HRDL PEAL_KDLL KKFL_PASV EEEF_EEAG PASV ── Topic: 11
│ │ └─VLVA KEVL_RKRM_ESVV RPIL_NLLG_RPIL GLPA
│ │   ├─■──PPGM HRDL_SDLV PPGM_SSTA SDLV_EKAL ESKL_EDLL QAEL ── Topic: 10
│ │   └─■──SPAL PPPG_PEIM PAII_PALF PEIM_KKRR PALF_PAII SQLL ── Topic: 9
│ └─GLPG GLPG_GPPG GLPG_GLPG GPPG_GFPG_GLPG PPGP
│   ├─GLPG GLPG_GPPG GLPG_GLPG GPPG_GFPG_GLPG PPGP
│   │ ├─■──GLPG GLPG_GPPG GLPG_GLPG GPPG_PPGP GLPG_GFPG ── Topic: 7
│   │ └─■──GLPG GLPG_DSSDSS_GPPG GLPG_GLPG GPPG_TTTT TTTT ── Topic: 5
│




In [None]:
topic_model1.vectorizer_model.get_feature_names_out()

array(['AAAA AAAG', 'AAAA AAAL', 'AAAA AAAT', ..., 'YRDL PEVV',
       'YRDL PILL', 'YRDL SAGL'], shape=(19514,), dtype=object)

In [None]:
np.where(topic_model1.vectorizer_model.get_feature_names_out() == 'TPML')

(array([19054]),)

In [None]:
topic_model1.get_topic(1)

[('TPML', np.float64(0.21903793480455916)),
 ('PKML', np.float64(0.19939388090986812)),
 ('NGAG', np.float64(0.17054125773736714)),
 ('NPIL', np.float64(0.16817532662125204)),
 ('SFRL', np.float64(0.16566632362134345)),
 ('TRSR', np.float64(0.16055364149553863)),
 ('AIAL', np.float64(0.15913161543911797)),
 ('SLAC', np.float64(0.15830331126842567)),
 ('HRDL PEAL', np.float64(0.15657113359510524)),
 ('SKAA', np.float64(0.15594768133745157))]

In [376]:
topic_model1.c_tf_idf_.todense()[:,19054]

matrix([[0.08146518],
        [0.21903793],
        [0.08162293],
        [0.07216195],
        [0.06379468],
        [0.09807832],
        [0.15002979],
        [0.05963285],
        [0.0635555 ],
        [0.        ],
        [0.        ],
        [0.13571916],
        [0.        ],
        [0.        ],
        [0.06019114],
        [0.08832972],
        [0.        ],
        [0.        ],
        [0.10067918],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.18339134]])