In [1]:
# imports
import numpy as np
import pandas as pd


### Data Processing on WT_pKa

In [2]:
# Load the experimental pKa table; the relative path means the file must sit in the working directory.
WT_pka = pd.read_csv('WT_pka.csv')

In [3]:
# Inspect column names, dtypes and non-null counts to see which columns need cleaning.
WT_pka.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1345 entries, 0 to 1344
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PDB ID             1345 non-null   object 
 1   Res Name           1345 non-null   object 
 2   Chain              1345 non-null   object 
 3   Res ID             1344 non-null   float64
 4   Expt. pKa          1345 non-null   object 
 5   Expt. Uncertainty  1026 non-null   object 
 6   %SASA              1345 non-null   float64
 7   Expt. method       1343 non-null   object 
 8   0.015 M            972 non-null    object 
 9   Expt. pH           1303 non-null   object 
 10  Expt. temp         1343 non-null   object 
 11  Reference          1345 non-null   object 
 12  Unnamed: 12        0 non-null      float64
 13  Unnamed: 13        0 non-null      float64
 14  Unnamed: 14        0 non-null      float64
 15  Unnamed: 15        8 non-null      object 
dtypes: float64(5), object(11)

In [4]:
# Preview the first rows of the raw table.
WT_pka.head()

Unnamed: 0,PDB ID,Res Name,Chain,Res ID,Expt. pKa,Expt. Uncertainty,%SASA,Expt. method,0.015 M,Expt. pH,Expt. temp,Reference,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,1A2P,ASP,C,8.0,3.1,0.1,96.6,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018,,,,
1,1A2P,ASP,C,12.0,3.8,0.1,73.5,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018,,,,
2,1A2P,HIS,C,18.0,7.75,0.02,45.5,1H NMR,0.015 M,5.6-8.7,298 K,https://pubs.acs.org/doi/abs/10.1021/bi00241a021,,,,
3,1A2P,ASP,C,22.0,3.3,0.1,123.8,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018,,,,
4,1A2P,GLU,C,29.0,3.75,0.05,66.1,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018,,,,


In [5]:
# Remove the trailing 'Unnamed: N' columns, which are artifacts of the file layout.
# Three are completely empty and the fourth has only 8 non-null entries.
# Dropping by name (not by position) keeps this cell safe to re-run:
# a second run finds no 'Unnamed:' columns and removes nothing.
unnamed_cols = [col for col in WT_pka.columns if col.startswith('Unnamed:')]
WT_pka = WT_pka.drop(columns=unnamed_cols)
WT_pka.head()

Unnamed: 0,PDB ID,Res Name,Chain,Res ID,Expt. pKa,Expt. Uncertainty,%SASA,Expt. method,0.015 M,Expt. pH,Expt. temp,Reference
0,1A2P,ASP,C,8.0,3.1,0.1,96.6,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018
1,1A2P,ASP,C,12.0,3.8,0.1,73.5,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018
2,1A2P,HIS,C,18.0,7.75,0.02,45.5,1H NMR,0.015 M,5.6-8.7,298 K,https://pubs.acs.org/doi/abs/10.1021/bi00241a021
3,1A2P,ASP,C,22.0,3.3,0.1,123.8,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018
4,1A2P,GLU,C,29.0,3.75,0.05,66.1,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018


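Next, drop the experimental-metadata columns (`Expt. Uncertainty` through `Reference`) that are not needed for this analysis, keeping only the residue identifiers and `Expt. pKa`.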

In [6]:
# Keep only the residue identifiers and the measured pKa.
# Selecting by name (instead of dropping the last 7 columns by position)
# gives the same result on the first run and cannot remove further columns on a re-run.
# .copy() makes WT_pka an independent frame so later assignments do not warn.
kept_cols = ['PDB ID', 'Res Name', 'Chain', 'Res ID', 'Expt. pKa']
WT_pka = WT_pka[kept_cols].copy()
WT_pka.head()

Unnamed: 0,PDB ID,Res Name,Chain,Res ID,Expt. pKa
0,1A2P,ASP,C,8.0,3.1
1,1A2P,ASP,C,12.0,3.8
2,1A2P,HIS,C,18.0,7.75
3,1A2P,ASP,C,22.0,3.3
4,1A2P,GLU,C,29.0,3.75


In [7]:
# List every row that has at least one missing value.
is_NaN = WT_pka.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = WT_pka[row_has_NaN]
print(rows_with_NaN)

# The only incomplete row (1BF4, C-term) is missing its Res ID; its Expt. pKa is present.
# Without a residue number it cannot be cast to int or matched on 'Res ID', so drop it.
WT_pka = WT_pka.dropna()
WT_pka.isna().sum()

   PDB ID Res Name Chain  Res ID Expt. pKa
89   1BF4   C-term     A     NaN       3.4


PDB ID       0
Res Name     0
Chain        0
Res ID       0
Expt. pKa    0
dtype: int64

In [8]:
# Residue numbers were read as floats because of the missing value; store them as integers.
WT_pka = WT_pka.astype({'Res ID': int})
WT_pka.head()

Unnamed: 0,PDB ID,Res Name,Chain,Res ID,Expt. pKa
0,1A2P,ASP,C,8,3.1
1,1A2P,ASP,C,12,3.8
2,1A2P,HIS,C,18,7.75
3,1A2P,ASP,C,22,3.3
4,1A2P,GLU,C,29,3.75


Process irregular values in `Expt. pKa`: entries marked with `>`, `<` or `~`, and entries that hold two comma-separated values.

In [9]:
# 'Greater/Smaller' records how 'Expt. pKa' was reported:
#   1 -> value carried a '>' marker, -1 -> value carried a '<' marker, 0 -> no marker.
# The column is created only if absent and flags are only ever set where a marker is
# found, so re-running this cell after the markers were stripped does not reset them.
expt_text = WT_pka['Expt. pKa']
has_greater = expt_text.str.contains('>', regex=False)
has_smaller = expt_text.str.contains('<', regex=False)

if 'Greater/Smaller' not in WT_pka.columns:
    WT_pka['Greater/Smaller'] = 0
WT_pka.loc[has_greater, 'Greater/Smaller'] = 1
WT_pka.loc[has_smaller, 'Greater/Smaller'] = -1

# Strip the markers so the column can later be parsed as numbers.
# regex=False is explicit because the default of str.replace differs between pandas versions.
# NOTE(review): '~' presumably marks an approximate value; it is removed without a flag.
for marker in ('>', '<', '~'):
    expt_text = expt_text.str.replace(marker, '', regex=False)
WT_pka['Expt. pKa'] = expt_text

In [10]:
# Two rows hold two comma-separated pKa values. Keep the first value in 'Expt. pKa'
# and store the second in a new column, '2nd pKa' (0 where there is no second value).
print(WT_pka[WT_pka['Expt. pKa'].str.contains(",")])

# n=1 plus reindex guarantees exactly two columns, even if no row contains a comma.
pka_parts = WT_pka['Expt. pKa'].str.split(',', n=1, expand=True).reindex(columns=[0, 1])
WT_pka['Expt. pKa'] = pka_parts[0].astype(float)

# Rows without a second value come out of the split as missing (None/NaN),
# not as the string 'None', so they are filled after the float conversion.
WT_pka['2nd pKa'] = pka_parts[1].astype(float).fillna(0)

WT_pka.info()

    PDB ID Res Name Chain  Res ID  Expt. pKa  Greater/Smaller
998   1STN      ASP     A      19  2.21,6.54                0
999   1STN      ASP     A      21  3.01,6.54                0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1344 entries, 0 to 1344
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PDB ID           1344 non-null   object 
 1   Res Name         1344 non-null   object 
 2   Chain            1344 non-null   object 
 3   Res ID           1344 non-null   int64  
 4   Expt. pKa        1344 non-null   float64
 5   Greater/Smaller  1344 non-null   int64  
 6   2nd pKa          1344 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 84.0+ KB


In [11]:
# Preview the cleaned table, now including the 'Greater/Smaller' and '2nd pKa' columns.
WT_pka.head()

Unnamed: 0,PDB ID,Res Name,Chain,Res ID,Expt. pKa,Greater/Smaller,2nd pKa
0,1A2P,ASP,C,8,3.1,0,0.0
1,1A2P,ASP,C,12,3.8,0,0.0
2,1A2P,HIS,C,18,7.75,0,0.0
3,1A2P,ASP,C,22,3.3,0,0.0
4,1A2P,GLU,C,29,3.75,0,0.0


In [12]:
# Summarize the range of residue numbers in the cleaned table.
WT_pka['Res ID'].describe()

count    1344.000000
mean       54.928571
std        43.957140
min         1.000000
25%        21.000000
50%        46.000000
75%        81.000000
max       403.000000
Name: Res ID, dtype: float64

<hr style="border:1px solid gray">

### Data processing on individual proteins

In [13]:
# Each protein folder has a pKa.csv that needs rearranging; 2ovo is used as the worked example.
# Load it as-is first and inspect how pandas parsed it.
df_2ovo = pd.read_csv('sample_data/2ovo/pKa.csv')
df_2ovo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 1 columns):
 #   Column                                                                                               Non-Null Count  Dtype 
---  ------                                                                                               --------------  ----- 
 0    ResName           pKa    polar(charged) polar(neutral) de-solvation(charged) de-solvation(neutral)  11 non-null     object
dtypes: object(1)
memory usage: 216.0+ bytes


In [14]:
# The file is whitespace-delimited, so read_csv put every field into a single column
# whose name is the whole header line. Split the header into column names and each
# row into matching fields.
packed_col = df_2ovo.columns[0]
fields = df_2ovo[packed_col].str.split(expand=True)
fields.columns = packed_col.split()

# The first field packs residue name, residue number and chain together (e.g. ASP7A).
# "(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)" is a zero-width pattern that splits at every
# boundary between a non-digit and a digit.
label_col = fields.columns[0]
label_parts = fields[label_col].str.split("(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)", expand=True)
label_parts.columns = ['Res Name', 'Res ID', 'Chain']

# Convert the value columns to float (the multi-protein loop does the same), and build
# a new frame instead of dropping columns by position in place, so a re-run fails
# loudly rather than silently removing the wrong columns.
df_2ovo = fields.drop(columns=label_col).astype(float)
df_2ovo[['Res Name', 'Res ID', 'Chain']] = label_parts
df_2ovo['Res ID'] = df_2ovo['Res ID'].astype(int)
df_2ovo

Unnamed: 0,pKa,polar(charged),polar(neutral),de-solvation(charged),de-solvation(neutral),Res Name,Res ID,Chain
0,3.25,-0.5642,-0.1424,0.1096,0.0545,ASP,7,A
1,3.84,-0.0492,0.0119,0.0149,0.0008,GLU,10,A
2,11.03,-0.2345,-0.0355,0.0131,0.0066,LYS,13,A
3,3.16,-0.5523,-0.1113,0.0543,0.0053,GLU,19,A
4,12.19,0.0881,0.0128,-0.0014,0.006,ARG,21,A
5,3.48,-0.5006,-0.2668,0.2027,0.0701,ASP,27,A
6,10.92,0.0915,0.0121,0.0267,0.0057,LYS,29,A
7,10.83,0.2115,0.0098,0.07,0.0162,LYS,34,A
8,3.99,0.0302,-0.0007,0.0168,0.0013,GLU,43,A
9,6.71,-0.6757,-0.3554,0.1786,0.0907,HIS,52,A


<hr style="border:1px solid gray">

In [15]:
# Proteins whose pKa tables are stored at sample_data/<id>/pKa.csv
sample_proteins = ['1bf4','1bpi','1igd','1pga','1pgb','2ci2','2ovo','2qmt','3ebx','4pti']


def load_computed_pka(pdb_id, data_dir='sample_data'):
    """Read one protein's whitespace-delimited pKa table into a tidy DataFrame.

    Parameters
    ----------
    pdb_id : str
        Folder name of the protein under ``data_dir`` (e.g. '2ovo').
    data_dir : str
        Folder that holds one sub-folder per protein, each containing 'pKa.csv'.

    Returns
    -------
    pandas.DataFrame
        Columns 'PDB ID' (upper-cased ``pdb_id``), 'Res Name', 'Res ID' (int),
        followed by the file's value columns converted to float.

    Raises
    ------
    ValueError
        If a residue label does not split into exactly name / number / chain.
    """
    raw = pd.read_csv(data_dir + '/' + pdb_id + '/pKa.csv')

    # read_csv splits on commas, so every field lands in one column whose name is
    # the whole header line; split both the header and the rows on whitespace.
    packed_col = raw.columns[0]
    header = packed_col.split()
    fields = raw[packed_col].str.split(expand=True)
    fields.columns = header
    label_col, value_cols = header[0], header[1:]

    # The first field packs residue name, number and chain (e.g. ASP7A); the
    # zero-width pattern splits at every boundary between a non-digit and a digit.
    label_parts = fields[label_col].str.split("(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)", expand=True)
    if label_parts.shape[1] != 3:
        raise ValueError('unexpected residue label format in ' + pdb_id + '/pKa.csv')

    # Convert by column name: assigning the result back through .iloc[:, 0:5]
    # can leave the columns as object dtype on newer pandas versions.
    table = fields[value_cols].astype(float)
    table.insert(0, 'PDB ID', pdb_id.upper())
    table.insert(1, 'Res Name', label_parts[0])
    table.insert(2, 'Res ID', label_parts[1].astype(int))
    return table


# Concatenate once over all proteins instead of growing df_exp inside a loop.
# ignore_index gives every row a unique label; the chain column of the files is not
# kept, as before (the experimental table supplies 'Chain' after the merge).
df_exp = pd.concat([load_computed_pka(p) for p in sample_proteins], ignore_index=True)
df_exp.head()

Unnamed: 0,PDB ID,Res Name,Res ID,pKa,polar(charged),polar(neutral),de-solvation(charged),de-solvation(neutral)
0,1BF4,DC,102,4.45,-1.0813,-0.769,1.6466,1.2841
1,1BF4,DC,106,5.15,-0.532,-0.5987,0.8731,1.2789
2,1BF4,DC,108,4.24,-0.8316,-0.2539,1.2195,0.3493
3,1BF4,DC,110,4.42,-1.3488,-0.9988,2.0543,1.6202
4,1BF4,DA,112,4.73,-0.3272,-0.2007,1.4159,1.947


In [16]:
# Exploratory join on PDB ID and residue number only; the residue name need not match.
# Per the original note this gives 103 rows, including these entries at residue 1
# where ARG is paired with an N-term record:
#   12  1BPI  ARG 1  N-term  A  8.10
#   94  4PTI  ARG 1  N-term
df1 = df_exp.merge(WT_pka, how='inner', on=['PDB ID', 'Res ID'])

In [17]:
# Join the sample_data tables with the experimental table; a row is kept only when
# PDB ID, residue number and residue name all agree.
df2 = df_exp.merge(WT_pka, how='inner', on=['PDB ID', 'Res ID', 'Res Name'])

In [18]:
# Display the merged table (pandas truncates the middle rows).
df2

Unnamed: 0,PDB ID,Res Name,Res ID,pKa,polar(charged),polar(neutral),de-solvation(charged),de-solvation(neutral),Chain,Expt. pKa,Greater/Smaller,2nd pKa
0,1BF4,LYS,7,11.58,-0.1352,-0.0128,0.0241,0.0096,A,11.5,1,0.0
1,1BF4,GLU,11,3.91,0.1404,-0.0433,0.0059,0.0028,A,3.5,0,0.0
2,1BF4,GLU,12,3.05,0.1592,0.0492,0.1336,0.0294,A,3.5,0,0.0
3,1BF4,LYS,13,11.26,-0.3688,-0.0428,0.0585,0.0085,A,10.0,1,0.0
4,1BF4,ASP,16,3.84,-0.0913,-0.0776,0.0991,0.0283,A,2.9,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
96,4PTI,LYS,26,10.77,-0.0404,-0.0036,0.0093,0.0045,A,10.6,0,0.0
97,4PTI,LYS,41,10.80,0.0037,0.0190,0.0298,0.0066,A,10.8,0,0.0
98,4PTI,LYS,46,10.53,0.0131,0.0013,-0.0035,0.0061,A,10.6,0,0.0
99,4PTI,GLU,49,3.72,-0.5678,-0.2029,0.0337,0.0086,A,3.8,0,0.0


In [19]:
# Check whether any merged row has a second pKa value; 0 is the fill value for "none".
df2['2nd pKa'].describe()

count    101.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: 2nd pKa, dtype: float64

In [20]:
# '2nd pKa' is 0 for every merged row, so it carries no information here.
# Drop it by name: dropping "the last column" by position in place would remove a
# different column each time this cell is re-run.
df2 = df2.drop(columns='2nd pKa', errors='ignore')

In [21]:
# Display the table again to confirm that '2nd pKa' is gone.
df2

Unnamed: 0,PDB ID,Res Name,Res ID,pKa,polar(charged),polar(neutral),de-solvation(charged),de-solvation(neutral),Chain,Expt. pKa,Greater/Smaller
0,1BF4,LYS,7,11.58,-0.1352,-0.0128,0.0241,0.0096,A,11.5,1
1,1BF4,GLU,11,3.91,0.1404,-0.0433,0.0059,0.0028,A,3.5,0
2,1BF4,GLU,12,3.05,0.1592,0.0492,0.1336,0.0294,A,3.5,0
3,1BF4,LYS,13,11.26,-0.3688,-0.0428,0.0585,0.0085,A,10.0,1
4,1BF4,ASP,16,3.84,-0.0913,-0.0776,0.0991,0.0283,A,2.9,0
...,...,...,...,...,...,...,...,...,...,...,...
96,4PTI,LYS,26,10.77,-0.0404,-0.0036,0.0093,0.0045,A,10.6,0
97,4PTI,LYS,41,10.80,0.0037,0.0190,0.0298,0.0066,A,10.8,0
98,4PTI,LYS,46,10.53,0.0131,0.0013,-0.0035,0.0061,A,10.6,0
99,4PTI,GLU,49,3.72,-0.5678,-0.2029,0.0337,0.0086,A,3.8,0


In [24]:
# Signed difference: pKa from the sample_data tables minus the experimental value.
df2['Diff'] = df2['pKa'].sub(df2['Expt. pKa'])
df2

Unnamed: 0,PDB ID,Res Name,Res ID,pKa,polar(charged),polar(neutral),de-solvation(charged),de-solvation(neutral),Chain,Expt. pKa,Greater/Smaller,Diff
0,1BF4,LYS,7,11.58,-0.1352,-0.0128,0.0241,0.0096,A,11.5,1,0.08
1,1BF4,GLU,11,3.91,0.1404,-0.0433,0.0059,0.0028,A,3.5,0,0.41
2,1BF4,GLU,12,3.05,0.1592,0.0492,0.1336,0.0294,A,3.5,0,-0.45
3,1BF4,LYS,13,11.26,-0.3688,-0.0428,0.0585,0.0085,A,10.0,1,1.26
4,1BF4,ASP,16,3.84,-0.0913,-0.0776,0.0991,0.0283,A,2.9,0,0.94
...,...,...,...,...,...,...,...,...,...,...,...,...
96,4PTI,LYS,26,10.77,-0.0404,-0.0036,0.0093,0.0045,A,10.6,0,0.17
97,4PTI,LYS,41,10.80,0.0037,0.0190,0.0298,0.0066,A,10.8,0,0.00
98,4PTI,LYS,46,10.53,0.0131,0.0013,-0.0035,0.0061,A,10.6,0,-0.07
99,4PTI,GLU,49,3.72,-0.5678,-0.2029,0.0337,0.0086,A,3.8,0,-0.08


In [37]:
# 'Target' labels the sign of the error relative to the experiment:
#    1 -> pKa is above the experimental value
#   -1 -> pKa is below the experimental value
#    0 -> equal, or consistent with a reported bound
# 'Greater/Smaller' marks experimental values reported as bounds (1 for '>', -1 for '<').
# A difference that points the same way as the bound agrees with the experiment, so it
# is labelled 0. Both bound directions are treated symmetrically; previously only the
# '>' case was excluded, so a '<' bound with Diff < 0 was labelled -1 and a '<' bound
# with Diff > 0 was labelled 0.
diff = df2['Diff']
bound = df2['Greater/Smaller']
above = (diff > 0) & (bound != 1)
below = (diff < 0) & (bound != -1)
df2['Target'] = np.select([above, below], [1, -1], default=0)
df2

Unnamed: 0,PDB ID,Res Name,Res ID,pKa,polar(charged),polar(neutral),de-solvation(charged),de-solvation(neutral),Chain,Expt. pKa,Greater/Smaller,Diff,Target
0,1BF4,LYS,7,11.58,-0.1352,-0.0128,0.0241,0.0096,A,11.5,1,0.08,0
1,1BF4,GLU,11,3.91,0.1404,-0.0433,0.0059,0.0028,A,3.5,0,0.41,1
2,1BF4,GLU,12,3.05,0.1592,0.0492,0.1336,0.0294,A,3.5,0,-0.45,-1
3,1BF4,LYS,13,11.26,-0.3688,-0.0428,0.0585,0.0085,A,10.0,1,1.26,0
4,1BF4,ASP,16,3.84,-0.0913,-0.0776,0.0991,0.0283,A,2.9,0,0.94,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,4PTI,LYS,26,10.77,-0.0404,-0.0036,0.0093,0.0045,A,10.6,0,0.17,1
97,4PTI,LYS,41,10.80,0.0037,0.0190,0.0298,0.0066,A,10.8,0,0.00,0
98,4PTI,LYS,46,10.53,0.0131,0.0013,-0.0035,0.0061,A,10.6,0,-0.07,-1
99,4PTI,GLU,49,3.72,-0.5678,-0.2029,0.0337,0.0086,A,3.8,0,-0.08,-1
