In [1]:
import sqlite3
import pandas as pd
from tqdm import tqdm

## Handle Domains

In [2]:
# Connect to DB
db_file = "/cta/share/users/uniprot/human/human.db"
# Open the database read-only through a URI: a missing or mistyped path then
# fails right here instead of silently producing an empty database, and the
# SELECT-only cells below cannot modify the file by accident.
conn = sqlite3.connect(f"file:{db_file}?mode=ro", uri=True)

In [3]:
# Load the `Entry` column of the `proteins` table, exposed as `uniprot_id`.
# (Plain string literal: the query has no placeholders, so no f-string is needed.)
df_proteins = pd.read_sql("SELECT Entry as uniprot_id FROM proteins", conn)
df_proteins

Unnamed: 0,uniprot_id
0,A0A024R1X5
1,A0A024R274
2,A0A024R324
3,A0A024R6A3
4,A0A024R7I7
...,...
205099,X6RLT1
205100,X6RLU5
205101,X6RLV5
205102,X6RLY7


In [6]:
# Load the rows of `interpro_entries_v2` whose type is 'domain': one row per
# (protein, InterPro entry, start/end interval).
# The query text is unchanged; it is only wrapped across two adjacent literals.
df_interpro_domain = pd.read_sql(
    "SELECT uniprot_id, interpro_id, description, start_index, end_index "
    "FROM interpro_entries_v2 WHERE type='domain'",
    conn,
)
df_interpro_domain

Unnamed: 0,uniprot_id,interpro_id,description,start_index,end_index
0,B4DNH0,IPR002126,Cadherin-like,53,203
1,B4DNH0,IPR002126,Cadherin-like,205,329
2,B4DNH0,IPR002126,Cadherin-like,319,429
3,B4DNH0,IPR014868,Cadherin prodomain,27,116
4,P49257,IPR005052,Legume-like lectin,44,268
...,...,...,...,...,...
491566,A0A0A7C699,IPR001039,"MHC class I alpha chain, alpha1 alpha2 domains",58,87
491567,A0A0A7C699,IPR001039,"MHC class I alpha chain, alpha1 alpha2 domains",90,106
491568,A0A0A7C699,IPR001039,"MHC class I alpha chain, alpha1 alpha2 domains",111,128
491569,A0A0A7C699,IPR001039,"MHC class I alpha chain, alpha1 alpha2 domains",156,174


In [9]:
# Load every column of `ted_entries_summary`.
df_ted = pd.read_sql("SELECT * FROM ted_entries_summary", conn)
# Optional quality filters, currently disabled (all rows are kept):
# df_ted = df_ted[df_ted['consensus_level'] == 'high']
# df_ted = df_ted[df_ted['plddt'] >= 80]
df_ted

Unnamed: 0,uniprot_id,ted_id,consensus_level,plddt,cath_label,start_index,end_index
0,A0A024R1X5,AF-A0A024R1X5-F1-model_v4_TED01,high,83.2282,1.10.418.40,278,448
1,A0A024R274,AF-A0A024R274-F1-model_v4_TED01,high,93.7552,3.90.520.10,15,134
2,A0A024R274,AF-A0A024R274-F1-model_v4_TED02,high,95.0662,2.60.200.10,288,296
3,A0A024R274,AF-A0A024R274-F1-model_v4_TED02,high,95.0662,2.60.200.10,315,442
4,A0A024R274,AF-A0A024R274-F1-model_v4_TED02,high,95.0662,2.60.200.10,494,539
...,...,...,...,...,...,...,...
288512,X6RLL4,AF-X6RLL4-F1-model_v4_TED02,medium,88.1399,-,149,296
288513,X6RLN4,AF-X6RLN4-F1-model_v4_TED01,high,92.1171,-,3,54
288514,X6RLR1,AF-X6RLR1-F1-model_v4_TED01,medium,81.0586,1.20.5,118,168
288515,X6RLY7,AF-X6RLY7-F1-model_v4_TED01,high,92.0818,3.30.450,33,89


In [4]:
# Every query above has been read into a DataFrame, so release the SQLite handle.
conn.close()

In [10]:
# Function to find optimal domain combination for each protein
def find_optimal_domains(df):
    """Select, per protein, the non-overlapping domains that cover the most residues.

    For every ``uniprot_id`` the rows are sorted by ``(start_index, end_index)``
    and a dynamic programme picks the chain of intervals with the largest total
    length, where an interval's length is ``end_index - start_index + 1``.
    Two intervals may both be kept only if the earlier one ends strictly before
    the later one starts, i.e. the indices are treated as inclusive.
    When several chains reach the same total, the first one met in sorted order
    is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``uniprot_id``, ``start_index`` and ``end_index``.
        If a ``domain_index`` column is present its values identify the rows;
        otherwise the frame's own index labels are used.

    Returns
    -------
    pandas.DataFrame
        One row per protein with columns ``uniprot_id`` and
        ``selected_domains`` (list of row identifiers, ordered by increasing
        start position). An empty input gives an empty frame that still has
        both columns.
    """
    result = []

    # Callers historically add ``domain_index`` themselves; fall back to the
    # index labels so the function also works on an unprepared frame.
    has_domain_index = 'domain_index' in df.columns

    # Group by uniprot_id
    for uniprot_id, group in tqdm(df.groupby('uniprot_id')):
        # Sort by start_index, and then by end_index
        group = group.sort_values(by=['start_index', 'end_index'])

        # Pull the columns out as plain lists once: the loops below are
        # quadratic in the number of domains, and per-element pandas lookups
        # there dominated the runtime.
        starts = group['start_index'].tolist()
        ends = group['end_index'].tolist()
        domain_ids = (group['domain_index'] if has_domain_index else group.index).tolist()
        lengths = [end - start + 1 for start, end in zip(starts, ends)]

        # dp[i]: best total length of a chain that ends with interval i.
        # prev[i]: predecessor of i in that chain, -1 when i stands alone.
        n = len(starts)
        dp = list(lengths)
        prev = [-1] * n

        for i in range(n):
            start_i = starts[i]
            length_i = lengths[i]
            for j in range(i):
                if ends[j] < start_i:  # Non-overlapping condition
                    candidate = dp[j] + length_i
                    # Strict '>' keeps the earliest predecessor on ties.
                    if candidate > dp[i]:
                        dp[i] = candidate
                        prev[i] = j

        # Reconstruct the optimal solution by walking predecessors backwards
        # from the first interval that reaches the best total.
        position = dp.index(max(dp))
        selected_domains = []
        while position != -1:
            selected_domains.append(domain_ids[position])
            position = prev[position]

        selected_domains.reverse()

        # Append the result for the current protein
        result.append({'uniprot_id': uniprot_id, 'selected_domains': selected_domains})

    # Explicit columns keep the schema stable even when there are no groups.
    return pd.DataFrame(result, columns=['uniprot_id', 'selected_domains'])

# Example usage on a small hand-made DataFrame (left commented out for reference)
# data = {
#     'uniprot_id': ["A0A292FP66", "A0A292FP66", "A0A292FP66", "A0A292FP66", "A0A292FP66", "A0A292FP66", "A0A292FP66",
#                    "A0A7I2V505", "A0A7I2V505", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0", "A0A6B7HHU0"],
#     'start_index': [58, 90, 111, 156, 194, 184, 1, 21, 296, 83, 115, 136, 181, 218, 209, 337, 25],
#     'end_index': [87, 106, 128, 174, 268, 272, 178, 266, 454, 112, 131, 153, 199, 293, 220, 364, 203]
# }
# df = pd.DataFrame(data)

# df.groupby("uniprot_id").apply(lambda x: x.sort_values(by=["start_index", 'end_index']))

# Add a column for domain indices
# df['domain_index'] = df.index
# optimal_domains_df = find_optimal_domains(df)
# optimal_domains_df

In [11]:
# find_optimal_domains reports its selection as values of the `domain_index`
# column, so store each row's index label there first.
# NOTE: this adds the column to df_interpro_domain in place.
df_interpro_domain['domain_index'] = df_interpro_domain.index
# One row per protein: uniprot_id plus the list of selected domain_index values.
df_interpro_optimal_domains = find_optimal_domains(df_interpro_domain)
df_interpro_optimal_domains

100%|██████████| 144152/144152 [02:04<00:00, 1161.12it/s]


Unnamed: 0,uniprot_id,selected_domains
0,A0A023HJ61,[266181]
1,A0A023I7F4,"[33061, 33062]"
2,A0A023I7H2,"[272139, 272140, 272141]"
3,A0A023I7J4,"[62472, 62473]"
4,A0A023I7N8,"[323608, 323609, 323610]"
...,...,...
144147,X6RLJ0,[382494]
144148,X6RLP6,[142406]
144149,X6RLU5,[130480]
144150,X6RLX0,[53610]


In [20]:
# Turn the per-protein lists of selected labels into the actual InterPro rows:
# flatten the lists, look the labels up in df_interpro_domain, and drop the
# helper column again.
# NOTE: this rebinds df_interpro_optimal_domains to a frame without a
# 'selected_domains' column, so the cell cannot be re-run on its own;
# re-run the cell that calls find_optimal_domains first.
df_interpro_optimal_domains = (
    df_interpro_domain
    .loc[list(df_interpro_optimal_domains['selected_domains'].explode())]
    .drop(columns=['domain_index'])
)
df_interpro_optimal_domains

Unnamed: 0,uniprot_id,interpro_id,description,start_index,end_index
266181,A0A023HJ61,IPR005225,Small GTP-binding domain,15,97
33061,A0A023I7F4,IPR005797,"Cytochrome b/b6, N-terminal domain",1,209
33062,A0A023I7F4,IPR005798,"Cytochrome b/b6, C-terminal",210,380
272139,A0A023I7H2,IPR001516,"NADH-Ubiquinone oxidoreductase (complex I), ch...",68,118
272140,A0A023I7H2,IPR001750,"NADH:quinone oxidoreductase/Mrp antiporter, tr...",134,418
...,...,...,...,...,...
382494,X6RLJ0,IPR001073,C1q domain,108,220
142406,X6RLP6,IPR000504,RNA recognition motif domain,27,102
130480,X6RLU5,IPR013680,"Voltage-dependent calcium channel, alpha-2/del...",111,189
53610,X6RLX0,IPR019018,Rab-binding domain FIP-RBD,1050,1112


In [12]:
# find_optimal_domains reports its selection as values of the `domain_index`
# column, so store each row's index label there first.
# NOTE: this adds the column to df_ted in place.
df_ted['domain_index'] = df_ted.index
# One row per protein: uniprot_id plus the list of selected domain_index values.
df_ted_optimal_domains = find_optimal_domains(df_ted)
df_ted_optimal_domains

100%|██████████| 158744/158744 [00:57<00:00, 2764.24it/s]


Unnamed: 0,uniprot_id,selected_domains
0,A0A023HJ61,[62121]
1,A0A023I7F4,"[62122, 62123, 62124, 62125]"
2,A0A023I7H2,"[62126, 62127, 62128]"
3,A0A023I7H5,[62129]
4,A0A023I7J4,"[62130, 62131]"
...,...,...
158739,X6RLR1,[288514]
158740,X6RLX0,"[140005, 140006]"
158741,X6RLY7,"[288515, 288516]"
158742,X6RM00,"[140007, 140008]"


In [22]:
# Turn the per-protein lists of selected labels into the actual TED rows:
# flatten the lists, look the labels up in df_ted, and drop the helper column.
# NOTE: this rebinds df_ted_optimal_domains to a frame without a
# 'selected_domains' column, so the cell cannot be re-run on its own;
# re-run the cell that calls find_optimal_domains first.
df_ted_optimal_domains = (
    df_ted
    .loc[list(df_ted_optimal_domains['selected_domains'].explode())]
    .drop(columns=['domain_index'])
)
df_ted_optimal_domains

Unnamed: 0,uniprot_id,ted_id,consensus_level,plddt,cath_label,start_index,end_index
62121,A0A023HJ61,AF-A0A023HJ61-F1-model_v4_TED01,high,75.9913,3.40.50.300,18,92
62122,A0A023I7F4,AF-A0A023I7F4-F1-model_v4_TED01,medium,98.0415,1.20.810.10,28,102
62123,A0A023I7F4,AF-A0A023I7F4-F1-model_v4_TED01,medium,98.0415,1.20.810.10,111,203
62124,A0A023I7F4,AF-A0A023I7F4-F1-model_v4_TED01,medium,98.0415,1.20.810.10,220,269
62125,A0A023I7F4,AF-A0A023I7F4-F1-model_v4_TED02,medium,97.8591,1.10.287,270,380
...,...,...,...,...,...,...,...
288516,X6RLY7,AF-X6RLY7-F1-model_v4_TED01,high,92.0818,3.30.450,155,230
140007,X6RM00,AF-X6RM00-F1-model_v4_TED01,medium,75.4984,1.10.287.210,146,188
140008,X6RM00,AF-X6RM00-F1-model_v4_TED02,medium,82.0507,-,814,950
62119,X6RM59,AF-X6RM59-F1-model_v4_TED01,high,96.1562,3.40.50.1000,43,94


In [23]:
def check_overlap(df):
    """Report, for each protein, whether its domain intervals are overlap-free.

    Within every ``uniprot_id`` group the rows are sorted by
    ``(start_index, end_index)``; the group passes when each interval starts
    strictly after the previous interval's ``end_index``. The first interval is
    compared against a sentinel of ``-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``uniprot_id``, ``start_index`` and ``end_index``.

    Returns
    -------
    list of bool
        One entry per group, in the order ``df.groupby('uniprot_id')`` yields
        them. ``True`` means NO overlap was found; ``False`` means at least one
        interval does not start after its predecessor's end.
    """
    is_non_overlapping = []
    for _, group in tqdm(df.groupby('uniprot_id')):
        group = group.sort_values(by=['start_index', 'end_index'])

        # Plain lists instead of DataFrame.iterrows(): same comparisons
        # without building a Series object for every row.
        starts = group['start_index'].tolist()
        ends = group['end_index'].tolist()

        # End of the preceding interval for each row; -1 stands in for
        # "nothing before the first interval".
        previous_ends = [-1] + ends[:-1]

        # all() stops at the first violation, like the original early break.
        is_non_overlapping.append(
            all(start > previous_end for start, previous_end in zip(starts, previous_ends))
        )
    return is_non_overlapping

In [None]:
# check_overlap yields True for every protein whose selected InterPro domains
# do not overlap; the cell evaluates to True only if that holds for all of them.
checker = check_overlap(df_interpro_optimal_domains)
all(checker)

100%|██████████| 144152/144152 [00:38<00:00, 3774.71it/s]


True

In [24]:
# check_overlap yields True for every protein whose selected TED domains
# do not overlap; the cell evaluates to True only if that holds for all of them.
checker = check_overlap(df_ted_optimal_domains)
all(checker)

100%|██████████| 158744/158744 [00:44<00:00, 3591.76it/s]


True