# Pre-tokenized Dataset Generation

In NLP, breaking text into smaller, meaningful pieces (pre-tokenization) is super important. In languages like English, spaces and punctuation naturally separate words, making it easy to split text into understandable chunks. This helps keep the tokens aligned with how we naturally use language.

But protein sequences don’t have these built-in separation points, which makes things tricky. A common approach is to treat an entire protein sequence as one long string—like a really long word with no spaces. While simple, this method ignores the important structural and functional details that matter in biology.

To fix this, we use a different pre-tokenization method by splitting protein sequences at domain boundaries. Just like words and phrases carry meaning in language, protein domains are biologically meaningful sections. So, by cutting sequences at these points, we create segments that better reflect real biological structures and functions. If different InterPro sources had conflicting domain annotations for the same protein, we picked the set that covered the most amino acids without overlaps.

In [119]:
import sqlite3
import pandas as pd
from tqdm import tqdm
import numpy as np

In [None]:
# Open the human UniProt SQLite database. `conn` is reused by the TED query in a
# later cell and closed once both tables have been read.
db_file = "/cta/share/users/uniprot/human/human.db"
conn = sqlite3.connect(db_file)

# Load every InterPro entry of type 'domain' with its residue span.
df_interpro_domain = pd.read_sql(
    sql="SELECT uniprot_id, interpro_id, description, start_index, end_index FROM interpro_entries_v2 WHERE type='domain'",
    con=conn,
)
df_interpro_domain

Unnamed: 0,uniprot_id,interpro_id,description,start_index,end_index
0,B4DNH0,IPR002126,Cadherin-like,53,203
1,B4DNH0,IPR002126,Cadherin-like,205,329
2,B4DNH0,IPR002126,Cadherin-like,319,429
3,B4DNH0,IPR014868,Cadherin prodomain,27,116
4,P49257,IPR005052,Legume-like lectin,44,268
...,...,...,...,...,...
491566,A0A0A7C699,IPR001039,"MHC class I alpha chain, alpha1 alpha2 domains",58,87
491567,A0A0A7C699,IPR001039,"MHC class I alpha chain, alpha1 alpha2 domains",90,106
491568,A0A0A7C699,IPR001039,"MHC class I alpha chain, alpha1 alpha2 domains",111,128
491569,A0A0A7C699,IPR001039,"MHC class I alpha chain, alpha1 alpha2 domains",156,174


In [121]:
# Load all TED domain entries, then keep only those with a pLDDT of at least 70.
df_ted = pd.read_sql(sql="SELECT * FROM ted_entries_summary", con=conn)
# Optional stricter filter (disabled): keep only high-consensus domains.
# df_ted = df_ted[df_ted['consensus_level'] == 'high']
df_ted = df_ted.loc[df_ted['plddt'].ge(70)]
df_ted

Unnamed: 0,uniprot_id,ted_id,consensus_level,plddt,cath_label,start_index,end_index
0,A0A024R1X5,AF-A0A024R1X5-F1-model_v4_TED01,high,83.2282,1.10.418.40,278,448
1,A0A024R274,AF-A0A024R274-F1-model_v4_TED01,high,93.7552,3.90.520.10,15,134
2,A0A024R274,AF-A0A024R274-F1-model_v4_TED02,high,95.0662,2.60.200.10,288,296
3,A0A024R274,AF-A0A024R274-F1-model_v4_TED02,high,95.0662,2.60.200.10,315,442
4,A0A024R274,AF-A0A024R274-F1-model_v4_TED02,high,95.0662,2.60.200.10,494,539
...,...,...,...,...,...,...,...
288512,X6RLL4,AF-X6RLL4-F1-model_v4_TED02,medium,88.1399,-,149,296
288513,X6RLN4,AF-X6RLN4-F1-model_v4_TED01,high,92.1171,-,3,54
288514,X6RLR1,AF-X6RLR1-F1-model_v4_TED01,medium,81.0586,1.20.5,118,168
288515,X6RLY7,AF-X6RLY7-F1-model_v4_TED01,high,92.0818,3.30.450,33,89


In [122]:
# Both source tables are in memory now, so release the SQLite connection.
conn.close()

## Find the longest non-overlapping domain combination for each protein

In [6]:
# Function to find optimal domain combination for each protein
def find_optimal_domains(df):
    """Select, per protein, the non-overlapping domains that cover the most residues.

    The domains of each ``uniprot_id`` are sorted by ``(start_index, end_index)``
    and a dynamic program picks the chain of pairwise non-overlapping domains whose
    summed length (``end_index - start_index + 1``) is largest. Two domains are
    non-overlapping only when the earlier one ends strictly before the later one
    starts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``uniprot_id``, ``start_index`` and ``end_index`` columns.
        If a ``domain_index`` column is present, its values identify the domains
        in the output; otherwise the frame's own index labels are used.

    Returns
    -------
    pandas.DataFrame
        One row per protein with columns ``uniprot_id`` and ``selected_domains``
        (list of domain identifiers ordered by position along the sequence).
    """
    result = []

    # Group by uniprot_id
    for uniprot_id, group in tqdm(df.groupby('uniprot_id')):
        # Sort by start_index, and then by end_index
        group = group.sort_values(by=['start_index', 'end_index'])

        # Extract the columns once as plain lists: the DP below reads them
        # O(n^2) times and per-cell DataFrame lookups are slow.
        starts = group['start_index'].tolist()
        ends = group['end_index'].tolist()
        if 'domain_index' in group.columns:
            labels = group['domain_index'].tolist()
        else:
            labels = group.index.tolist()
        lengths = [end - start + 1 for start, end in zip(starts, ends)]

        # dp[i]: best total coverage of a chain that ends with domain i.
        # prev[i]: predecessor of i in that chain (-1 when i starts the chain).
        n = len(lengths)
        dp = list(lengths)
        prev = [-1] * n
        for i in range(n):
            for j in range(i):
                # Strict '<' so domains sharing a boundary residue are not combined;
                # strict '>' keeps the earliest predecessor on ties.
                if ends[j] < starts[i] and dp[j] + lengths[i] > dp[i]:
                    dp[i] = dp[j] + lengths[i]
                    prev[i] = j

        # Reconstruct the optimal solution by walking the predecessor links
        max_index = dp.index(max(dp))
        selected_domains = []
        while max_index != -1:
            selected_domains.append(labels[max_index])
            max_index = prev[max_index]

        selected_domains.reverse()

        # Append the result for the current protein
        result.append({'uniprot_id': uniprot_id, 'selected_domains': selected_domains})

    # Convert result to DataFrame
    return pd.DataFrame(result)

# Define the input DataFrame
# data = {
#     'uniprot_id': ["A0A292FP66", "A0A292FP66", "A0A292FP66", "A0A292FP66", "A0A292FP66", "A0A292FP66", "A0A292FP66",
#                    "A0A7I2V505", "A0A7I2V505", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0"],
#     'start_index': [58, 90, 111, 156, 194, 184, 1, 21, 296, 83, 115, 136, 181, 218, 209, 337, 25],
#     'end_index': [87, 106, 128, 174, 268, 272, 178, 266, 454, 112, 131, 153, 199, 293, 220, 364, 203]
# }
# df = pd.DataFrame(data)

# df.groupby("uniprot_id").apply(lambda x: x.sort_values(by=["start_index", 'end_index']))

# Add a column for domain indices
# df['domain_index'] = df.index
# optimal_domains_df = find_optimal_domains(df)
# optimal_domains_df

In [7]:
# Store each row's index label in a 'domain_index' column so find_optimal_domains can
# report which rows it picked, then choose the best non-overlapping domains per protein.
# NOTE: this adds the column to df_interpro_domain in place.
df_interpro_domain['domain_index'] = df_interpro_domain.index
df_interpro_optimal_domains = find_optimal_domains(df_interpro_domain)
df_interpro_optimal_domains

100%|██████████| 144152/144152 [02:04<00:00, 1160.99it/s]


Unnamed: 0,uniprot_id,selected_domains
0,A0A023HJ61,[266181]
1,A0A023I7F4,"[33061, 33062]"
2,A0A023I7H2,"[272139, 272140, 272141]"
3,A0A023I7J4,"[62472, 62473]"
4,A0A023I7N8,"[323608, 323609, 323610]"
...,...,...
144147,X6RLJ0,[382494]
144148,X6RLP6,[142406]
144149,X6RLU5,[130480]
144150,X6RLX0,[53610]


In [8]:
# Flatten the per-protein lists of selected index labels, pull those rows out of
# df_interpro_domain, and drop the helper 'domain_index' column.
# NOTE(review): this rebinds df_interpro_optimal_domains from the
# (uniprot_id, selected_domains) summary to the selected rows themselves, so this
# cell cannot be re-run on its own; 'selected_domains' no longer exists afterwards.
df_interpro_optimal_domains = df_interpro_domain.loc[list(df_interpro_optimal_domains['selected_domains'].explode())].drop(columns=['domain_index'])
df_interpro_optimal_domains

Unnamed: 0,uniprot_id,interpro_id,description,start_index,end_index
266181,A0A023HJ61,IPR005225,Small GTP-binding domain,15,97
33061,A0A023I7F4,IPR005797,"Cytochrome b/b6, N-terminal domain",1,209
33062,A0A023I7F4,IPR005798,"Cytochrome b/b6, C-terminal",210,380
272139,A0A023I7H2,IPR001516,"NADH-Ubiquinone oxidoreductase (complex I), ch...",68,118
272140,A0A023I7H2,IPR001750,"NADH:quinone oxidoreductase/Mrp antiporter, tr...",134,418
...,...,...,...,...,...
382494,X6RLJ0,IPR001073,C1q domain,108,220
142406,X6RLP6,IPR000504,RNA recognition motif domain,27,102
130480,X6RLU5,IPR013680,"Voltage-dependent calcium channel, alpha-2/del...",111,189
53610,X6RLX0,IPR019018,Rab-binding domain FIP-RBD,1050,1112


In [9]:
# df_ted['domain_index'] = df_ted.index
# df_ted_optimal_domains = find_optimal_domains(df_ted)
# df_ted_optimal_domains

In [10]:
# df_ted_optimal_domains = df_ted.loc[list(df_ted_optimal_domains['selected_domains'].explode())].drop(columns=['domain_index'])
# df_ted_optimal_domains

In [None]:
# TED entries are assumed not to overlap, so the selection and row-lookup steps used
# for InterPro are skipped and the filtered TED table is copied as-is.
# NOTE(review): the check_overlap call for TED is commented out in this notebook;
# confirm the no-overlap assumption still holds.
df_ted_optimal_domains = df_ted.copy()

In [12]:
def check_overlap(df):
    """Report, per protein, whether its domain intervals are free of overlaps.

    Returns a list with one boolean per ``uniprot_id`` group, in groupby order.
    An entry is ``True`` when every interval starts strictly after the previous
    one ends (the first interval is compared against ``-1``), and ``False`` as
    soon as one does not. Despite the function name, ``True`` means *no* overlap.
    """
    no_overlap_flags = []
    for _, rows in tqdm(df.groupby('uniprot_id')):
        ordered = rows.sort_values(by=['start_index', 'end_index'])
        previous_end = -1
        disjoint = True
        for start, end in zip(ordered['start_index'], ordered['end_index']):
            if not start > previous_end:
                # This interval begins at or before the previous end.
                disjoint = False
                break
            previous_end = end
        no_overlap_flags.append(disjoint)
    return no_overlap_flags

In [13]:
# checker = check_overlap(df_interpro_optimal_domains)
# len(checker) == sum(checker)

In [14]:
# checker = check_overlap(df_ted_optimal_domains)
# len(checker) == sum(checker)

## Pre-tokenize (Split) protein sequences based on domain intervals

In [None]:
uniref_id = '90' # 50 or 90

# Read the protein sequences and the UniRef cluster representatives.
# try/finally guarantees the connection is closed even if a query fails
# (for example when the uniref{uniref_id}_distilled table does not exist).
db_file = "/cta/share/users/uniprot/human/human.db"
conn = sqlite3.connect(db_file)
try:
    df_proteins = pd.read_sql("SELECT Entry as uniprot_id, Sequence as sequence FROM proteins", conn)
    df_uniref = pd.read_sql(f"SELECT uniprot_accession as uniprot_id FROM uniref{uniref_id}_distilled", conn)
finally:
    conn.close()

In [160]:
# Left-join the sequences onto the UniRef representatives. Selecting only the key
# column on the left keeps this cell re-runnable: a second run no longer fails on
# an already-present 'sequence' column.
df_uniref = df_uniref[['uniprot_id']].merge(df_proteins, on='uniprot_id', how='left')

In [161]:
# Collapse the selected InterPro domains to one row per protein; every remaining
# column becomes a list ordered by start_index.
df_interpro_domains_grouped = (
    df_interpro_optimal_domains
    .sort_values(['uniprot_id', 'start_index'])
    .groupby('uniprot_id')
    .agg(list)
    .reset_index()
)

# Pair each start with its end to get a list of (start, end) tuples per protein.
df_interpro_domains_grouped['interpro_domain_boundaries'] = pd.Series(
    [
        list(zip(starts, ends))
        for starts, ends in zip(
            df_interpro_domains_grouped['start_index'],
            df_interpro_domains_grouped['end_index'],
        )
    ],
    index=df_interpro_domains_grouped.index,
    dtype=object,
)

# Replace the separate start/end lists with the paired boundaries, attach the
# protein sequence, and record its length.
df_interpro_domains_grouped = (
    df_interpro_domains_grouped
    .drop(columns=['start_index', 'end_index'])
    .set_index('uniprot_id')
    .join(df_proteins.set_index('uniprot_id'))
    .reset_index()
    .assign(sequence_len=lambda frame: frame['sequence'].str.len())
)
df_interpro_domains_grouped

Unnamed: 0,uniprot_id,interpro_id,description,interpro_domain_boundaries,sequence,sequence_len
0,A0A023HJ61,[IPR005225],[Small GTP-binding domain],"[(15, 97)]",MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI...,121
1,A0A023I7F4,"[IPR005797, IPR005798]","[Cytochrome b/b6, N-terminal domain, Cytochrom...","[(1, 209), (210, 380)]",MTPMRKINPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQIT...,380
2,A0A023I7H2,"[IPR001516, IPR001750, IPR010934]","[NADH-Ubiquinone oxidoreductase (complex I), c...","[(68, 118), (134, 418), (422, 602)]",MTMHTTMTTLTLTSLIPPILTTLVNPNKKNSYPHYVKSIVASTFII...,603
3,A0A023I7J4,"[IPR001750, IPR010933]","[NADH:quinone oxidoreductase/Mrp antiporter, t...","[(23, 285), (290, 344)]",MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTK...,347
4,A0A023I7N8,"[IPR001516, IPR001750, IPR010934]","[NADH-Ubiquinone oxidoreductase (complex I), c...","[(68, 118), (134, 419), (422, 602)]",MTMHTTMTTLTLTSLIPPILTTLVNPNKKNSYPHYVKSIVASTFII...,603
...,...,...,...,...,...,...
144147,X6RLJ0,[IPR001073],[C1q domain],"[(108, 220)]",MEGPRGWLVLCVLAISLASMVTEDLCRAPDGKKGEAGRPGRRGRPG...,220
144148,X6RLP6,[IPR000504],[RNA recognition motif domain],"[(27, 102)]",MAAPEQPLAISRGCTSSSSLSPPRGDRTLLVRHLPAELTAEEKEDL...,185
144149,X6RLU5,[IPR013680],"[Voltage-dependent calcium channel, alpha-2/de...","[(111, 189)]",XELVREVLFDAVVTAPMEAYWTALALNMSEESEHVVDMAFLGTRAG...,192
144150,X6RLX0,[IPR019018],[Rab-binding domain FIP-RBD],"[(1050, 1112)]",MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...,1120


In [162]:
# Collapse the TED domains to one row per protein; every remaining column becomes
# a list ordered by start_index.
df_ted_domains_grouped = (
    df_ted_optimal_domains
    .sort_values(['uniprot_id', 'start_index'])
    .groupby('uniprot_id')
    .agg(list)
    .reset_index()
)

# Pair each start with its end to get a list of (start, end) tuples per protein.
df_ted_domains_grouped['ted_domain_boundaries'] = pd.Series(
    [
        list(zip(starts, ends))
        for starts, ends in zip(
            df_ted_domains_grouped['start_index'],
            df_ted_domains_grouped['end_index'],
        )
    ],
    index=df_ted_domains_grouped.index,
    dtype=object,
)

# Replace the separate start/end lists with the paired boundaries, attach the
# protein sequence, and record its length.
df_ted_domains_grouped = (
    df_ted_domains_grouped
    .drop(columns=['start_index', 'end_index'])
    .set_index('uniprot_id')
    .join(df_proteins.set_index('uniprot_id'))
    .reset_index()
    .assign(sequence_len=lambda frame: frame['sequence'].str.len())
)
df_ted_domains_grouped

Unnamed: 0,uniprot_id,ted_id,consensus_level,plddt,cath_label,ted_domain_boundaries,sequence,sequence_len
0,A0A023HJ61,[AF-A0A023HJ61-F1-model_v4_TED01],[high],[75.9913],[3.40.50.300],"[(18, 92)]",MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI...,121
1,A0A023I7F4,"[AF-A0A023I7F4-F1-model_v4_TED01, AF-A0A023I7F...","[medium, medium, medium, medium]","[98.0415, 98.0415, 98.0415, 97.8591]","[1.20.810.10, 1.20.810.10, 1.20.810.10, 1.10.287]","[(28, 102), (111, 203), (220, 269), (270, 380)]",MTPMRKINPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQIT...,380
2,A0A023I7H2,"[AF-A0A023I7H2-F1-model_v4_TED01, AF-A0A023I7H...","[medium, medium, medium]","[92.7556, 92.7556, 94.8363]","[-, -, 1.20.5.110]","[(2, 476), (483, 513), (514, 576)]",MTMHTTMTTLTLTSLIPPILTTLVNPNKKNSYPHYVKSIVASTFII...,603
3,A0A023I7H5,[AF-A0A023I7H5-F1-model_v4_TED01],[high],[84.2129],[-],"[(2, 226)]",MNENLFASFIAPTILGLPAAVLIILFPPLLIPTSKYLINNRLITTQ...,226
4,A0A023I7J4,"[AF-A0A023I7J4-F1-model_v4_TED01, AF-A0A023I7J...","[medium, medium]","[94.6769, 94.0559]","[-, -]","[(2, 124), (151, 347)]",MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTK...,347
...,...,...,...,...,...,...,...,...
150312,X6RLR1,[AF-X6RLR1-F1-model_v4_TED01],[medium],[81.0586],[1.20.5],"[(118, 168)]",MAGLTDLQRLQARVEELERWVYGPGGARGSRKVADGLVKVQVALGN...,176
150313,X6RLX0,"[AF-X6RLX0-F1-model_v4_TED01, AF-X6RLX0-F1-mod...","[medium, medium]","[85.947, 83.4708]","[1.20.5.340, -]","[(815, 952), (1011, 1120)]",MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...,1120
150314,X6RLY7,"[AF-X6RLY7-F1-model_v4_TED01, AF-X6RLY7-F1-mod...","[high, high]","[92.0818, 92.0818]","[3.30.450, 3.30.450]","[(33, 89), (155, 230)]",MKLEFLQRKFWAATRQCSTVDGPCTQSCEDSDLDCFVIDNNGFILI...,282
150315,X6RM00,"[AF-X6RM00-F1-model_v4_TED01, AF-X6RM00-F1-mod...","[medium, medium]","[75.4984, 82.0507]","[1.10.287.210, -]","[(146, 188), (814, 950)]",MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...,976


### Fill the intervals between a protein's domains with the `out_of_domain` tag

In [None]:
def fill_intervals(intervals, max_value):
    """Insert gap intervals so the result spans positions ``1..max_value``.

    Intervals are ``(start, end)`` pairs treated as 1-based and inclusive. The
    input is sorted, and a gap interval is added before the first interval,
    between consecutive intervals, and after the last one wherever positions
    are left uncovered. Input intervals are passed through unchanged.

    Parameters
    ----------
    intervals : iterable of (start, end)
        Annotated intervals; may be empty.
    max_value : number
        Last position to cover (the sequence length).

    Returns
    -------
    list of (start, end)
        Input intervals interleaved with the generated gaps, in sorted order.
        For empty input this is ``[(1, max_value)]`` when ``max_value >= 1``,
        otherwise ``[]``.
    """
    intervals = sorted(intervals)
    if not intervals:
        # No annotated interval at all: the whole range is a single gap.
        return [(1, max_value)] if max_value >= 1 else []

    result = []
    first_start = intervals[0][0]
    if first_start > 1:
        result.append((1, first_start - 1))

    # Walk consecutive pairs; add a gap only when at least one position separates them.
    for current, following in zip(intervals, intervals[1:]):
        result.append(current)
        if current[1] + 1 < following[0]:
            result.append((current[1] + 1, following[0] - 1))

    last = intervals[-1]
    result.append(last)
    if last[1] < max_value:
        result.append((last[1] + 1, max_value))
    return result

def update_intervals(row, boundary_column, other_columns):
    """Expand one protein row so its intervals include the out-of-domain gaps.

    ``row[boundary_column]`` is replaced by the gap-filled interval list from
    ``fill_intervals`` (using ``row["sequence_len"]`` as the upper bound). Each
    column in ``other_columns`` is rebuilt to stay aligned with it: original
    intervals keep their value, inserted gaps get the string ``'out_of_domain'``.
    The row is modified in place and returned.
    """
    annotated = row[boundary_column]
    completed = fill_intervals(annotated, row["sequence_len"])

    expanded = {col: [] for col in other_columns}
    for interval in completed:
        # Position of this interval among the original annotations, or None for a gap.
        try:
            position = annotated.index(interval)
        except ValueError:
            position = None
        for col in other_columns:
            value = 'out_of_domain' if position is None else row[col][position]
            expanded[col].append(value)

    row[boundary_column] = completed
    for col, values in expanded.items():
        row[col] = values
    return row

# Example dataframe: the first protein has uncovered residues on both sides of its
# single domain; the second is fully covered by two adjacent domains.
data = {
    "uniprot_id": ["A0A023HJ61", "A0A023I7F4"],
    "interpro_id": [["IPR005225"], ["IPR005797", "IPR005798"]],
    "description": [["Small GTP-binding domain"], ["Cytochrome b/b6, N-terminal domain", "Cytochrome b/b6, C-terminal domain"]],
    "interpro_domain_boundaries": [[(15, 97)], [(1, 209), (210, 380)]],
    "sequence_len": [121, 380],
}

df = pd.DataFrame(data)

# Apply the update function row by row; the extra positional args are the boundary
# column and the annotation columns that must stay aligned with it.
df.apply(update_intervals, args=('interpro_domain_boundaries', ('interpro_id', 'description')), axis=1)



Unnamed: 0,uniprot_id,interpro_id,description,interpro_domain_boundaries,sequence_len
0,A0A023HJ61,"[out_of_domain, IPR005225, out_of_domain]","[out_of_domain, Small GTP-binding domain, out_...","[(1, 14), (15, 97), (98, 121)]",121
1,A0A023I7F4,"[IPR005797, IPR005798]","[Cytochrome b/b6, N-terminal domain, Cytochrom...","[(1, 209), (210, 380)]",380


In [164]:
# Insert 'out_of_domain' gap intervals into every protein row, keeping the listed
# annotation columns aligned with the expanded boundary lists.
df_interpro_domains_grouped_interp = df_interpro_domains_grouped.apply(update_intervals, args=('interpro_domain_boundaries', ('interpro_id', 'description')), axis=1)
df_ted_domains_grouped_interp = df_ted_domains_grouped.apply(update_intervals, args=('ted_domain_boundaries', ('ted_id', 'consensus_level', 'plddt', 'cath_label')), axis=1)

In [165]:
# Turn the per-protein lists into one row per segment (domain or gap).
df_interpro_domains_grouped_interp = (
    df_interpro_domains_grouped_interp
    .explode(['interpro_id', 'description', 'interpro_domain_boundaries'])
    .reset_index(drop=True)
)

# Cut each segment out of the full sequence. Boundaries are 1-based and inclusive,
# hence the `start - 1` when slicing.
df_interpro_domains_grouped_interp['sequence_sliced'] = pd.Series(
    [
        full_sequence[start - 1:end]
        for full_sequence, (start, end) in zip(
            df_interpro_domains_grouped_interp['sequence'],
            df_interpro_domains_grouped_interp['interpro_domain_boundaries'],
        )
    ],
    index=df_interpro_domains_grouped_interp.index,
    dtype=object,
)
df_interpro_domains_grouped_interp

Unnamed: 0,uniprot_id,interpro_id,description,interpro_domain_boundaries,sequence,sequence_len,sequence_sliced
0,A0A023HJ61,out_of_domain,out_of_domain,"(1, 14)",MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI...,121,MSQTAMSETYDFLF
1,A0A023HJ61,IPR005225,Small GTP-binding domain,"(15, 97)",MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI...,121,KFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTIGVEFGSKIINVGGK...
2,A0A023HJ61,out_of_domain,out_of_domain,"(98, 121)",MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI...,121,CPNASEPEHCDHPLWKQEGPGCRS
3,A0A023I7F4,IPR005797,"Cytochrome b/b6, N-terminal domain","(1, 209)",MTPMRKINPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQIT...,380,MTPMRKINPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQIT...
4,A0A023I7F4,IPR005798,"Cytochrome b/b6, C-terminal","(210, 380)",MTPMRKINPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQIT...,380,GITSHSDKITFHPYYTIKDALGLLLFLLSLMTLTLFSPDLLGDPDN...
...,...,...,...,...,...,...,...
585689,X6RLX0,IPR019018,Rab-binding domain FIP-RBD,"(1050, 1112)",MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...,1120,TPPASYNLDDDQAAWENELQKMTRGQLQDELEKGERDNAELQEFAN...
585690,X6RLX0,out_of_domain,out_of_domain,"(1113, 1120)",MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...,1120,VNALEESS
585691,X6RLY7,out_of_domain,out_of_domain,"(1, 2)",MKLEFLQRKFWAATRQCSTVDGPCTQSCEDSDLDCFVIDNNGFILI...,282,MK
585692,X6RLY7,IPR013680,"Voltage-dependent calcium channel, alpha-2/del...","(3, 255)",MKLEFLQRKFWAATRQCSTVDGPCTQSCEDSDLDCFVIDNNGFILI...,282,LEFLQRKFWAATRQCSTVDGPCTQSCEDSDLDCFVIDNNGFILISK...


In [166]:
# Turn the per-protein lists into one row per segment (domain or gap).
df_ted_domains_grouped_interp = (
    df_ted_domains_grouped_interp
    .explode(['ted_id', 'consensus_level', 'plddt', 'cath_label', 'ted_domain_boundaries'])
    .reset_index(drop=True)
)

# Cut each segment out of the full sequence. Boundaries are 1-based and inclusive,
# hence the `start - 1` when slicing.
df_ted_domains_grouped_interp['sequence_sliced'] = pd.Series(
    [
        full_sequence[start - 1:end]
        for full_sequence, (start, end) in zip(
            df_ted_domains_grouped_interp['sequence'],
            df_ted_domains_grouped_interp['ted_domain_boundaries'],
        )
    ],
    index=df_ted_domains_grouped_interp.index,
    dtype=object,
)
df_ted_domains_grouped_interp

Unnamed: 0,uniprot_id,ted_id,consensus_level,plddt,cath_label,ted_domain_boundaries,sequence,sequence_len,sequence_sliced
0,A0A023HJ61,out_of_domain,out_of_domain,out_of_domain,out_of_domain,"(1, 17)",MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI...,121,MSQTAMSETYDFLFKFL
1,A0A023HJ61,AF-A0A023HJ61-F1-model_v4_TED01,high,75.9913,3.40.50.300,"(18, 92)",MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI...,121,VIGNAGTGKSCLLHQFIEKKFKDDSNHTIGVEFGSKIINVGGKYVK...
2,A0A023HJ61,out_of_domain,out_of_domain,out_of_domain,out_of_domain,"(93, 121)",MSQTAMSETYDFLFKFLVIGNAGTGKSCLLHQFIEKKFKDDSNHTI...,121,YDITRCPNASEPEHCDHPLWKQEGPGCRS
3,A0A023I7F4,out_of_domain,out_of_domain,out_of_domain,out_of_domain,"(1, 27)",MTPMRKINPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQIT...,380,MTPMRKINPLMKLINHSFIDLPTPSNI
4,A0A023I7F4,AF-A0A023I7F4-F1-model_v4_TED01,medium,98.0415,1.20.810.10,"(28, 102)",MTPMRKINPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQIT...,380,SAWWNFGSLLGACLILQITTGLFLAMQYSPDAWTAFSSIAHITRDV...
...,...,...,...,...,...,...,...,...,...
651173,X6RM59,out_of_domain,out_of_domain,out_of_domain,out_of_domain,"(1, 42)",MDRAAVARVGAVASASVCALVAGVVLAQYIFTLKRKTGRKTKIIEM...,331,MDRAAVARVGAVASASVCALVAGVVLAQYIFTLKRKTGRKTK
651174,X6RM59,AF-X6RM59-F1-model_v4_TED01,high,96.1562,3.40.50.1000,"(43, 94)",MDRAAVARVGAVASASVCALVAGVVLAQYIFTLKRKTGRKTKIIEM...,331,IIEMMPEFQKSSVRIKNPTRVEEIICGLIKGGAAKLQIITDFDMTL...
651175,X6RM59,out_of_domain,out_of_domain,out_of_domain,out_of_domain,"(95, 175)",MDRAAVARVGAVASASVCALVAGVVLAQYIFTLKRKTGRKTKIIEM...,331,GKRCPTCHNIIDNCKLVTDECRKKLLQLKEKYYAIEVDPVLTVEEK...
651176,X6RM59,AF-X6RM59-F1-model_v4_TED01,high,96.1562,3.40.50.1000,"(176, 329)",MDRAAVARVGAVASASVCALVAGVVLAQYIFTLKRKTGRKTKIIEM...,331,LKEGYENFFDKLQQHSIPVFIFSAGIGDVLEEVIRQAGVYHPNVKV...


In [167]:
# Sets of protein ids that have at least one segment row in each annotation table,
# used for membership tests when choosing an annotation source per protein.
uniprot_ids_interpro_set = set(df_interpro_domains_grouped_interp['uniprot_id'])
uniprot_ids_ted_set = set(df_ted_domains_grouped_interp['uniprot_id'])

### Combine InterPro and TED annotations. Use InterPro over TED if both annotations exist for a protein.

In [None]:
# pandarallel provides DataFrame.parallel_apply; configure it with 20 workers and a
# progress bar before it is used on the UniRef frame.
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=20)

def combine_domain_slices(row):
    """Return the sequence segments for one UniRef protein as a list of records.

    Each record is a dict with keys ``uniprot_id``, ``sequence`` and ``source``.
    InterPro segments are used when the protein has InterPro annotations,
    otherwise TED segments; ``source`` then holds the segment's InterPro or TED
    id. A protein found in neither table yields a single record with its full
    sequence and ``source`` set to ``'out_of_domain'``.

    Reads the module-level lookup sets and segment tables built in earlier cells.
    """
    protein_id = row['uniprot_id']
    full_sequence = row['sequence']

    def _segment_records(segments, id_column):
        # Rows of this protein, reduced to the three output fields and renamed.
        matching = segments.loc[
            segments['uniprot_id'] == protein_id,
            ['uniprot_id', 'sequence_sliced', id_column],
        ]
        renamed = matching.rename(columns={'sequence_sliced': 'sequence', id_column: 'source'})
        return list(renamed.to_dict('records'))

    # InterPro takes precedence over TED.
    if protein_id in uniprot_ids_interpro_set:
        return _segment_records(df_interpro_domains_grouped_interp, 'interpro_id')
    if protein_id in uniprot_ids_ted_set:
        return _segment_records(df_ted_domains_grouped_interp, 'ted_id')

    # No annotation from either source: keep the sequence whole.
    return [{
        'uniprot_id': protein_id,
        'sequence': full_sequence,
        'source': 'out_of_domain',
    }]

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [169]:
# Build the list of segment records for every UniRef protein in parallel; the result
# is a Series holding one list of dicts per protein.
df_uniref_sliced = df_uniref.parallel_apply(combine_domain_slices, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4701), Label(value='0 / 4701'))), …

In [170]:
# Flatten the per-protein record lists into one record per row and build the final
# (uniprot_id, sequence, source) table from them.
df_uniref_sliced = pd.DataFrame(df_uniref_sliced.explode().tolist())
df_uniref_sliced

Unnamed: 0,uniprot_id,sequence,source
0,H7C0U7,MTTQA,out_of_domain
1,H7C0U7,PTFTQPLQSVVVLEGSTATFEAHISGFPVPEVSWFRDGQVISTSTL...,IPR013098
2,H7C0U7,KAETAP,out_of_domain
3,H7C0U7,PNFVQRLQSMTVRQGSQVRLQVRVTGIPTPVVKFYRDGAEIQSSLD...,IPR013098
4,H7C0U7,QGEEEVPAKKTKTIVSTAQISESRQTRIEKKIEAHFDARSIATVEM...,out_of_domain
...,...,...,...
323458,A0A3B3ITF9,MTLEEFSAGEQKTERIHIHLNGRILP,out_of_domain
323459,A0A075B6W7,XNAGNNRKLIWGLGTSLAVNP,out_of_domain
323460,C1KEM4,NVKSEGSGQRGGSMAVLVWLHM,out_of_domain
323461,A0A075B6U9,XNTGGTIDKLTFGKGTHVFIIS,out_of_domain


In [171]:
# Show the name of the table the sliced dataset is written to in the next cell.
print(f"uniref{uniref_id}_domain_sliced_plddt70")

uniref90_domain_sliced_plddt70


In [173]:
# Persist the sliced dataset, replacing any existing table of the same name.
# try/finally guarantees the connection is closed even if the write fails.
db_file = "/cta/share/users/uniprot/human/human.db"
conn = sqlite3.connect(db_file)
try:
    df_uniref_sliced.to_sql(f"uniref{uniref_id}_domain_sliced_plddt70", conn, if_exists="replace", index=False)
finally:
    conn.close()