In [1]:
from pathlib import Path

import pandas as pd

from Hb_toolkit import *

Convert the detected hydrogen-bond txt files to DataFrames.
 
Hydrogen bonds were detected in ChimeraX.

In [2]:
# Raw hydrogen-bond reports (detected in ChimeraX), one text file per protein.
# Plain string literals: there is nothing to interpolate, so no f-prefix is needed.
hb_s1a = Path("hb_s1a.txt").read_text()
hb_s2b = Path("hb_s2b.txt").read_text()
hb_spca1 = Path("hb_spca1a.txt").read_text()

# Turn each report into a DataFrame.
# input_to_df is not defined in this notebook — presumably it comes from Hb_toolkit.
df_s1a = input_to_df(hb_s1a)
df_s2b = input_to_df(hb_s2b)
df_spca1 = input_to_df(hb_spca1)


Add entry names to each dataframe

In [3]:
# Label every row with the protein it belongs to, so the rows stay
# distinguishable after the three tables are combined.
for frame, label in (
    (df_s1a, "SERCA1a"),
    (df_s2b, "SERCA2b"),
    (df_spca1, "SPCA1a"),
):
    frame["entry"] = label

Check the first 10 lines

In [4]:
# Preview the first ten rows of the SPCA1a table.
df_spca1.head(n=10)

Unnamed: 0,donor_chain,donor_residue_ID,donor_seq,donor_atom,acceptor_chain,acceptor_residue_ID,acceptor_seq,acceptor_atom,distance,entry
0,A,SER,23,OG,A,GLU,240,OE2,3.471,SPCA1a
1,A,SER,27,N,A,SER,23,O,3.046,SPCA1a
2,A,SER,27,OG,A,GLN,202,OE1,3.349,SPCA1a
3,A,GLU,28,N,A,LYS,24,O,2.92,SPCA1a
4,A,LEU,29,N,A,LYS,25,O,3.177,SPCA1a
5,A,GLU,33,N,A,GLU,33,OE1,2.753,SPCA1a
6,A,VAL,34,N,A,PRO,30,O,3.209,SPCA1a
7,A,ALA,35,N,A,VAL,31,O,2.902,SPCA1a
8,A,SER,36,N,A,SER,32,O,2.928,SPCA1a
9,A,SER,36,OG,A,GLU,33,O,2.646,SPCA1a


Check the data type of each column

In [5]:
# Show the dtype pandas assigned to every column of the SPCA1a table.
df_spca1.dtypes

donor_chain             object
donor_residue_ID        object
donor_seq                int64
donor_atom              object
acceptor_chain          object
acceptor_residue_ID     object
acceptor_seq             int64
acceptor_atom           object
distance               float64
entry                   object
dtype: object

Read domain definition from pre-defined .csv files

In [6]:
# Load the pre-defined domain definition table of each protein,
# in the order SERCA1a, SERCA2b, SPCA1.
df_domain_s1a, df_domain_s2b, df_domain_spca1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Domain_definition/s1a_each_domain_exhaustive.csv",
        "../Domain_definition/s2b_each_domain_exhaustive.csv",
        "../Domain_definition/spca1_each_domain_exhaustive.csv",
    )
)

Merge domain info to dataframes

Merge

In [7]:
# Pair every hydrogen-bond table with the domain table of the same protein and
# merge them. merge_domains is not defined in this notebook (presumably it comes
# from Hb_toolkit); later cells rely on the result having "donor_domain" and
# "acceptor_domain" columns.
df_merged_s1a, df_merged_s2b, df_merged_spca1 = (
    merge_domains(hbonds, domains)
    for hbonds, domains in (
        (df_s1a, df_domain_s1a),
        (df_s2b, df_domain_s2b),
        (df_spca1, df_domain_spca1),
    )
)

Concatenate dfs to a single dataframe

In [8]:
# Stack the three per-protein tables into one.
# ignore_index=True gives the combined table a fresh 0..n-1 index; without it
# every input keeps its own row labels, so the same label can occur up to three
# times and label-based row selection becomes ambiguous.
df_concat = pd.concat(
    [df_merged_s1a, df_merged_s2b, df_merged_spca1],
    ignore_index=True,
)

Save the dataframe to .csv

In [9]:
# to_csv does not create missing folders, so make sure "output/" exists first;
# this keeps a fresh checkout runnable with Restart & Run All.
Path("output").mkdir(parents=True, exist_ok=True)
df_concat.to_csv("output/hydrogen_bonds_summary_E1ATP.csv", index=False)

We are going to focus on hydrogen bonds involving TMs, so let's filter the dataframe to keep rows whose donor or acceptor residue is in a TM.

In [10]:
# Domain labels treated as transmembrane (TM) regions: the ten helices plus the
# loops listed between them.
# NOTE(review): no "TM2-3_loop" or "TM4-5_loop" entry is listed — presumably
# intentional; confirm against the domain definition files.
list_TMs = [
    "TM1", "TM1-2_loop",
    "TM2",
    "TM3", "TM3-4_loop",
    "TM4",
    "TM5", "TM5-6_loop",
    "TM6", "TM6-7_loop",
    "TM7", "TM7-8_loop",
    "TM8", "TM8-9_loop",
    "TM9", "TM9-10_loop",
    "TM10",
]

# Keep a hydrogen bond when its donor residue, its acceptor residue, or both
# belong to one of the TM regions.
donor_in_TMs = df_concat["donor_domain"].isin(list_TMs)
acceptor_in_TMs = df_concat["acceptor_domain"].isin(list_TMs)
condition_1 = donor_in_TMs | acceptor_in_TMs

# reset_index(drop=True) renumbers the kept rows from 0 and discards the old labels.
df_concat_TMs_0 = df_concat.loc[condition_1].reset_index(drop=True)

Additionally, because we're focusing on interdomain hydrogen bonds, we'd better remove hydrogen bonds formed between main-chain N and O atoms.

In [11]:
# A row is dropped only when the donor atom is "N" AND the acceptor atom is "O";
# every other donor/acceptor atom combination is kept.
is_backbone_N_to_O = (df_concat_TMs_0["donor_atom"] == "N") & (df_concat_TMs_0["acceptor_atom"] == "O")
condition_2 = ~is_backbone_N_to_O

# Renumber the surviving rows from 0, discarding the old labels.
df_concat_TMs = df_concat_TMs_0.loc[condition_2].reset_index(drop=True)

Detect interdomain hydrogen bonds

In [12]:
# A hydrogen bond is flagged as interdomain when its donor and acceptor residues
# carry different domain labels.
donor_domain = df_concat_TMs["donor_domain"]
acceptor_domain = df_concat_TMs["acceptor_domain"]
df_concat_TMs["interdomain"] = donor_domain != acceptor_domain

In [13]:
# Save the TM-filtered hydrogen bonds, including the interdomain flag, without the row index.
df_concat_TMs.to_csv(path_or_buf="output/interdomain_E1ATP.csv", index=False)

Count the hydrogen bonds grouped by ["entry", "acceptor_domain", "interdomain"], and likewise by ["entry", "donor_domain", "interdomain"].

In [14]:
# Number of hydrogen bonds per (entry, acceptor domain, interdomain flag).
df_acceptor = (
    df_concat_TMs
    .groupby(["entry", "acceptor_domain", "interdomain"])
    .size()
    .reset_index(name="acceptor_count")  # name the size column directly
)

# Same tally from the donor side.
df_donor = (
    df_concat_TMs
    .groupby(["entry", "donor_domain", "interdomain"])
    .size()
    .reset_index(name="donor_count")
)

# Give both tables an identically named domain column to merge on.
df_acceptor["merge_domain"] = df_acceptor["acceptor_domain"]
df_donor["merge_domain"] = df_donor["donor_domain"]

In [15]:
# Preview the first ten rows of the acceptor-side counts.
df_acceptor.head(n=10)

Unnamed: 0,entry,acceptor_domain,interdomain,acceptor_count,merge_domain
0,SERCA1a,P_domain,True,10,P_domain
1,SERCA1a,TM1,False,2,TM1
2,SERCA1a,TM1,True,1,TM1
3,SERCA1a,TM1-2_loop,True,1,TM1-2_loop
4,SERCA1a,TM10,False,3,TM10
5,SERCA1a,TM10,True,5,TM10
6,SERCA1a,TM2,False,1,TM2
7,SERCA1a,TM2,True,2,TM2
8,SERCA1a,TM3,False,4,TM3
9,SERCA1a,TM3-4_loop,False,1,TM3-4_loop


Merge df_acceptor with df_donor

In [16]:
# Outer join, so a (entry, domain, interdomain) combination that occurs on only
# one side (acceptor-only or donor-only) is still kept.
df_merged_count = pd.merge(
    df_acceptor,
    df_donor[["entry", "interdomain", "merge_domain", "donor_count"]],
    how="outer",
    on=["entry", "merge_domain", "interdomain"],
)

# A count missing after the outer join means "no bonds on that side": fill it
# with 0. Only the two count columns are filled (a blanket fillna(0) would also
# write 0 into the text column "acceptor_domain"), and they are cast back to
# integers because the NaNs introduced by the join turn them into floats.
count_columns = ["acceptor_count", "donor_count"]
df_merged_count[count_columns] = (
    df_merged_count[count_columns].fillna(0).astype("int64")
)

Create a column holding the sum acceptor_count + donor_count

In [17]:
# Total number of times a domain takes part in a hydrogen bond, as acceptor or donor.
# NOTE(review): for rows with interdomain == False the donor and acceptor domain
# are the same, so each such bond is counted once in acceptor_count and once in
# donor_count, i.e. twice in total_count — confirm this is intended.
acceptor_count = df_merged_count["acceptor_count"]
donor_count = df_merged_count["donor_count"]
df_merged_count["total_count"] = acceptor_count + donor_count

Clean the dataframe

In [18]:
# Keep only the columns needed for the summary and expose the merge key under
# the plainer name "domain".
df_cleaned = (
    df_merged_count
    .loc[:, ["entry", "merge_domain", "interdomain", "total_count"]]
    .rename(columns={"merge_domain": "domain"})
)

Because the cytosolic domains are not needed here, remove them from the dataframe.

In [19]:
# "ANP" stands for the A-, N- and P-domains. Only rows whose domain is listed in
# list_TMs are kept, so every other domain label is dropped.
is_TM_domain = df_cleaned["domain"].isin(list_TMs)
df_cleaned_remove_ANP = df_cleaned.loc[is_TM_domain]

Save the result

In [20]:
# Write the per-domain totals for the TM regions to disk, without the row index.
df_cleaned_remove_ANP.to_csv(path_or_buf="output/Interactions_of_TMs_E1ATP.csv", index=False)