In [1]:
# imports
import numpy as np
import pandas as pd
import plotly as py
import plotly.graph_objs as go

### Data Processing on WT_pKa

In [2]:
# Load WT_pka.csv (the table cleaned in this section) into a DataFrame.
WT_pka = pd.read_csv('WT_pka.csv')

In [3]:
WT_pka.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1345 entries, 0 to 1344
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   PDB ID             1345 non-null   object 
 1   Res Name           1345 non-null   object 
 2   Chain              1345 non-null   object 
 3   Res ID             1344 non-null   float64
 4   Expt. pKa          1345 non-null   object 
 5   Expt. Uncertainty  1026 non-null   object 
 6   %SASA              1345 non-null   float64
 7   Expt. method       1343 non-null   object 
 8   0.015 M            972 non-null    object 
 9   Expt. pH           1303 non-null   object 
 10  Expt. temp         1343 non-null   object 
 11  Reference          1345 non-null   object 
 12  Unnamed: 12        0 non-null      float64
 13  Unnamed: 13        0 non-null      float64
 14  Unnamed: 14        0 non-null      float64
 15  Unnamed: 15        8 non-null      object 
dtypes: float64(5), object(11)

In [4]:
WT_pka.head()

Unnamed: 0,PDB ID,Res Name,Chain,Res ID,Expt. pKa,Expt. Uncertainty,%SASA,Expt. method,0.015 M,Expt. pH,Expt. temp,Reference,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,1A2P,ASP,C,8.0,3.1,0.1,96.6,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018,,,,
1,1A2P,ASP,C,12.0,3.8,0.1,73.5,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018,,,,
2,1A2P,HIS,C,18.0,7.75,0.02,45.5,1H NMR,0.015 M,5.6-8.7,298 K,https://pubs.acs.org/doi/abs/10.1021/bi00241a021,,,,
3,1A2P,ASP,C,22.0,3.3,0.1,123.8,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018,,,,
4,1A2P,GLU,C,29.0,3.75,0.05,66.1,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018,,,,


In [5]:
# Drop the trailing "Unnamed: N" columns reported by info() above.
# They are selected by name instead of by position, so re-running this cell
# can never remove real data columns.
# Note: 'Unnamed: 15' is not completely empty (8 non-null values); those
# values are discarded together with the column.
unnamed_cols = [col for col in WT_pka.columns if str(col).startswith('Unnamed:')]
WT_pka = WT_pka.drop(columns=unnamed_cols)
WT_pka.head()

Unnamed: 0,PDB ID,Res Name,Chain,Res ID,Expt. pKa,Expt. Uncertainty,%SASA,Expt. method,0.015 M,Expt. pH,Expt. temp,Reference
0,1A2P,ASP,C,8.0,3.1,0.1,96.6,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018
1,1A2P,ASP,C,12.0,3.8,0.1,73.5,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018
2,1A2P,HIS,C,18.0,7.75,0.02,45.5,1H NMR,0.015 M,5.6-8.7,298 K,https://pubs.acs.org/doi/abs/10.1021/bi00241a021
3,1A2P,ASP,C,22.0,3.3,0.1,123.8,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018
4,1A2P,GLU,C,29.0,3.75,0.05,66.1,2D 1H NMR,0.015 M,0.2-6.3,303 K,https://pubs.acs.org/doi/abs/10.1021/bi00029a018


Next, we drop the remaining columns that are not needed for this analysis.

In [6]:
# Keep only the residue identifiers and the pKa value. Selecting the columns
# by name (rather than dropping the last seven by position) gives the same
# result on the first run and makes the cell safe to re-run.
kept_columns = ['PDB ID', 'Res Name', 'Chain', 'Res ID', 'Expt. pKa']
WT_pka = WT_pka[kept_columns].copy()
WT_pka.head()

Unnamed: 0,PDB ID,Res Name,Chain,Res ID,Expt. pKa
0,1A2P,ASP,C,8.0,3.1
1,1A2P,ASP,C,12.0,3.8
2,1A2P,HIS,C,18.0,7.75
3,1A2P,ASP,C,22.0,3.3
4,1A2P,GLU,C,29.0,3.75


In [7]:
# List every row that has a missing value in any column.
is_NaN = WT_pka.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = WT_pka[row_has_NaN]
print(rows_with_NaN)

# The row printed by this cell still has an 'Expt. pKa'; what it lacks is a
# 'Res ID' (it is a C-term entry). Rows with any missing value are dropped so
# that 'Res ID' can be stored as an integer column afterwards.
WT_pka = WT_pka.dropna()
WT_pka.isna().sum()

   PDB ID Res Name Chain  Res ID Expt. pKa
89   1BF4   C-term     A     NaN       3.4


PDB ID       0
Res Name     0
Chain        0
Res ID       0
Expt. pKa    0
dtype: int64

In [8]:
# 'Res ID' was loaded as float64 (see info() above); now that no value is
# missing it can be stored as an integer.
WT_pka = WT_pka.astype({'Res ID': int})
WT_pka.head()

Unnamed: 0,PDB ID,Res Name,Chain,Res ID,Expt. pKa
0,1A2P,ASP,C,8,3.1
1,1A2P,ASP,C,12,3.8
2,1A2P,HIS,C,18,7.75
3,1A2P,ASP,C,22,3.3
4,1A2P,GLU,C,29,3.75


Process irregular values in Expt. pKa

In [9]:
# 'Greater/Smaller' records how each reported pKa was qualified:
#    1 -> given as '>value', -1 -> given as '<value', 0 -> no '>' or '<'.
# The column is only created when it does not exist yet: once the markers have
# been stripped below, resetting it to 0 on a re-run would silently erase the
# flags with no way to recompute them.
if 'Greater/Smaller' not in WT_pka.columns:
    WT_pka['Greater/Smaller'] = 0

pka_text = WT_pka['Expt. pKa']
WT_pka.loc[pka_text.str.contains(">"), 'Greater/Smaller'] = 1
WT_pka.loc[pka_text.str.contains("<"), 'Greater/Smaller'] = -1

# Remove the '>', '<' and '~' qualifiers in one pass so only the number text
# is left for the float conversion that follows.
WT_pka['Expt. pKa'] = pka_text.str.replace('[<>~]', '', regex=True)

In [10]:
# Some rows (two in this dataset) report two pKa values separated by a comma.
# The first value stays in 'Expt. pKa'; the second goes to a new '2nd pKa' column.
print(WT_pka[WT_pka['Expt. pKa'].str.contains(",")])

# n=1 splits at the first comma only, and reindex guarantees that a second
# column exists even when no entry contains a comma.
pka_parts = WT_pka['Expt. pKa'].str.split(',', n=1, expand=True).reindex(columns=[0, 1])
WT_pka['Expt. pKa'] = pka_parts[0].astype(float)

# Rows without a second value have a missing entry here; it becomes NaN on
# conversion and is then replaced by the placeholder 0.
WT_pka['2nd pKa'] = pka_parts[1].astype(float).fillna(0)

WT_pka.info()

    PDB ID Res Name Chain  Res ID  Expt. pKa  Greater/Smaller
998   1STN      ASP     A      19  2.21,6.54                0
999   1STN      ASP     A      21  3.01,6.54                0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1344 entries, 0 to 1344
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PDB ID           1344 non-null   object 
 1   Res Name         1344 non-null   object 
 2   Chain            1344 non-null   object 
 3   Res ID           1344 non-null   int64  
 4   Expt. pKa        1344 non-null   float64
 5   Greater/Smaller  1344 non-null   int64  
 6   2nd pKa          1344 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 84.0+ KB


In [11]:
WT_pka.head()

Unnamed: 0,PDB ID,Res Name,Chain,Res ID,Expt. pKa,Greater/Smaller,2nd pKa
0,1A2P,ASP,C,8,3.1,0,0.0
1,1A2P,ASP,C,12,3.8,0,0.0
2,1A2P,HIS,C,18,7.75,0,0.0
3,1A2P,ASP,C,22,3.3,0,0.0
4,1A2P,GLU,C,29,3.75,0,0.0


<hr style="border:1px solid gray"> </hr>

### Data processing on individual proteins (pKa.csv and output.pqr)

#### First create a dataframe for theoretical pka values for future use

In [12]:
# Theoretical pKa value for each residue type, keyed by residue name.
theo_val = {'ARG': 12.0, 'ASP': 4.0, 'CYS': 9.5, 'GLU': 4.4, 'HIS': 6.3,
            'LYS': 10.4, 'TYR': 9.6}

# Build the lookup table straight from the dict. Going through a mixed
# str/float numpy array would coerce every pKa to a string, so the 'pKa'
# column could not be used in arithmetic.
df_theo_val = pd.DataFrame(list(theo_val.items()), columns=['Res Name', 'pKa'])
df_theo_val

Unnamed: 0,Res Name,pKa
0,ARG,12.0
1,ASP,4.0
2,CYS,9.5
3,GLU,4.4
4,HIS,6.3
5,LYS,10.4
6,TYR,9.6


<hr style="border:1px solid gray"> </hr>

### We use 2ovo as an example

#### Read 2ovo pka file 

In [13]:
# Load pKa.csv for the example protein 2ovo and check how pandas parsed it.
pka_csv_2ovo = 'sample_data/2ovo/pKa.csv'
df_2ovo = pd.read_csv(pka_csv_2ovo)
df_2ovo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 1 columns):
 #   Column                                                                                               Non-Null Count  Dtype 
---  ------                                                                                               --------------  ----- 
 0    ResName           pKa    polar(charged) polar(neutral) de-solvation(charged) de-solvation(neutral)  11 non-null     object
dtypes: object(1)
memory usage: 216.0+ bytes


In [14]:
# The file was read as one single column (header and values are separated by
# whitespace, not commas), so split both the header and the rows on whitespace.
raw_column = df_2ovo.columns[0]
fields = df_2ovo[raw_column].str.split(expand=True)
fields.columns = raw_column.split()

# Split the combined 'ResName' field into residue name, residue ID and chain.
# "(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)" split digits and chars
residue_parts = fields.iloc[:, 0].str.split("(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)", expand=True)

# Keep the three columns used downstream, selected by name, with numeric
# dtypes (the split above yields text only).
df_2ovo = pd.DataFrame({
    'Res Name': residue_parts[0],
    'Res ID': residue_parts[1].astype(int),
    'pKa': fields['pKa'].astype(float),
})

df_2ovo.head()

Unnamed: 0,Res Name,Res ID,pKa
0,ASP,7,3.25
1,GLU,10,3.84
2,LYS,13,11.03
3,GLU,19,3.16
4,ARG,21,12.19


In [15]:
# Attach the theoretical pKa of each residue type. The pKa read from pKa.csv
# is renamed to 'Expt. pKa' first so it does not clash with the 'pKa' column
# of df_theo_val. The inner join keeps only residue types listed there.
df_2ovo = (
    df_2ovo
    .rename(columns={"pKa": "Expt. pKa"})
    .merge(df_theo_val, on=['Res Name'], how='inner')
)
df_2ovo

Unnamed: 0,Res Name,Res ID,Expt. pKa,pKa
0,ASP,7,3.25,4.0
1,ASP,27,3.48,4.0
2,GLU,10,3.84,4.4
3,GLU,19,3.16,4.4
4,GLU,43,3.99,4.4
5,LYS,13,11.03,10.4
6,LYS,29,10.92,10.4
7,LYS,34,10.83,10.4
8,LYS,55,10.71,10.4
9,ARG,21,12.19,12.0


#### Read 2ovo pqr file 

In [16]:
column_names = ['Res ID', 'x', 'y', 'z', 'Charge', 'Radius']
target_IDs = list(df_2ovo['Res ID'].unique().astype(int))
print(target_IDs)
target_ID_set = set(target_IDs)

# Collect the matching atom records first and build the DataFrame once;
# appending with .loc row by row copies the frame on every insertion.
atom_rows = []
with open('sample_data/2ovo/output.pqr', 'r') as pqr_file:
    for line in pqr_file:
        fields = line.strip().split()
        # Only lines with 11 whitespace-separated fields are read as atom
        # records (residue ID at index 5, then x, y, z, charge, radius).
        # Other lines, such as a short trailing line, are skipped, so the
        # last line no longer has to be cut off blindly.
        if len(fields) == 11 and int(fields[5]) in target_ID_set:
            atom_rows.append(fields[5:])

df_2ovo_pqr = pd.DataFrame(atom_rows, columns=column_names)
df_2ovo_pqr['Res ID'] = df_2ovo_pqr['Res ID'].astype(int)
df_2ovo_pqr[['x', 'y', 'z', 'Charge', 'Radius']] = df_2ovo_pqr[['x', 'y', 'z', 'Charge', 'Radius']].astype(float)
df_2ovo_pqr.head()

[7, 27, 10, 19, 43, 13, 29, 34, 55, 21, 52]


Unnamed: 0,Res ID,x,y,z,Charge,Radius
0,7,0.211,7.348,5.302,-0.5163,1.824
1,7,1.351,7.155,4.347,0.2936,0.6
2,7,-1.146,7.262,4.785,0.0381,1.908
3,7,-1.738,8.069,5.169,0.088,1.387
4,7,-1.808,5.945,5.212,0.5366,1.908


In [17]:
# Join the per-residue pKa table with the per-atom PQR records on 'Res ID';
# each residue row is repeated once for every atom of that residue.
df_2ovo = df_2ovo.merge(df_2ovo_pqr, on=['Res ID'], how='inner')
df_2ovo.head()

Unnamed: 0,Res Name,Res ID,Expt. pKa,pKa,x,y,z,Charge,Radius
0,ASP,7,3.25,4.0,0.211,7.348,5.302,-0.5163,1.824
1,ASP,7,3.25,4.0,1.351,7.155,4.347,0.2936,0.6
2,ASP,7,3.25,4.0,-1.146,7.262,4.785,0.0381,1.908
3,ASP,7,3.25,4.0,-1.738,8.069,5.169,0.088,1.387
4,ASP,7,3.25,4.0,-1.808,5.945,5.212,0.5366,1.908


In [18]:

fig = go.Figure()

res_IDs = list(df_2ovo['Res ID'].unique())
data = []

# One 3D scatter trace per residue, so each residue gets its own legend entry.
for ID in res_IDs:
    # Select this residue's atoms once and reuse them for coordinates and
    # hover text. Building the hover text from the whole frame would label
    # the points with the coordinates of unrelated rows.
    residue_atoms = df_2ovo.loc[df_2ovo['Res ID'] == ID]
    res_name = residue_atoms['Res Name'].iloc[0]
    trace = go.Scatter3d(
        x=residue_atoms['x'],
        y=residue_atoms['y'],
        z=residue_atoms['z'],

        mode='markers',
        marker=dict(
            size=3,
            colorscale='Viridis',
        ),
        name= res_name+' '+str(ID),

        # hover text for each point of this trace
        text= [f"x: {a}<br>y: {b}<br>z: {c}"
               for a,b,c in zip(residue_atoms['x'], residue_atoms['y'], residue_atoms['z'])],
        # show only the custom text on hover
        hoverinfo='text'
    )
    fig.add_trace(trace)
    data.append(trace)

layout = dict(title = 'TEST',)

# Write the figure to a standalone HTML file.
F = dict(data=data, layout=layout)
py.offline.plot(F, filename = 'Test.html')


'Test.html'

<hr style="border:1px solid gray"> </hr>

### For any PDBID

In [29]:
def read_csv(PDBID):
    """Load the pKa table of one protein and attach theoretical pKa values.

    Reads ``sample_data/<pdbid lower-case>/pKa.csv``, whose header and rows
    are whitespace-separated and therefore arrive as a single column.

    Parameters
    ----------
    PDBID : str
        PDB identifier; it is lower-cased to build the directory name.

    Returns
    -------
    pandas.DataFrame
        Columns 'Res Name', 'Res ID' (int), 'Expt. pKa' (float, the file's
        'pKa' column) and 'pKa' (from the module-level ``df_theo_val``).
        Only residue types present in ``df_theo_val`` are kept (inner join).
    """
    raw = pd.read_csv('sample_data/' + PDBID.lower() + '/pKa.csv')

    # Everything was read into one column: split header and rows on whitespace.
    raw_column = raw.columns[0]
    fields = raw[raw_column].str.split(expand=True)
    fields.columns = raw_column.split()

    # Split the combined 'ResName' field into residue name, residue ID and chain.
    # "(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)" split digits and chars
    residue_parts = fields.iloc[:, 0].str.split("(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)", expand=True)

    # Select the needed columns by name and give them numeric dtypes
    # (the string split above yields text only).
    df_PDB_csv = pd.DataFrame({
        'Res Name': residue_parts[0],
        'Res ID': residue_parts[1].astype(int),
        'Expt. pKa': fields['pKa'].astype(float),
    })

    # merge with theoretical values
    return pd.merge(df_PDB_csv, df_theo_val, on=['Res Name'], how='inner')

In [42]:
def read_pqr(PDBID, df_PDB_csv):
    """Read the atom records of selected residues from a protein's PQR file.

    Reads ``sample_data/<pdbid lower-case>/output.pqr`` and keeps every line
    that has 11 whitespace-separated fields and whose residue ID (field
    index 5) occurs in ``df_PDB_csv['Res ID']``.

    Parameters
    ----------
    PDBID : str
        PDB identifier; it is lower-cased to build the directory name.
    df_PDB_csv : pandas.DataFrame
        Frame with a 'Res ID' column listing the residues of interest.

    Returns
    -------
    pandas.DataFrame
        One row per matching atom with columns 'Res ID' (int) and
        'x', 'y', 'z', 'Charge', 'Radius' (float).
    """
    column_names = ['Res ID', 'x', 'y', 'z', 'Charge', 'Radius']
    # A set of plain ints gives constant-time membership tests.
    target_IDs = set(df_PDB_csv['Res ID'].astype(int).tolist())

    # Collect the matching records first and build the frame once; appending
    # with .loc row by row copies the frame on every insertion.
    atom_rows = []
    with open('sample_data/' + PDBID.lower() + '/output.pqr', 'r') as pqr_file:
        for line in pqr_file:
            fields = line.strip().split()
            # The field-count check already skips short non-atom lines, so
            # the last line is not dropped blindly (that would lose an atom
            # whenever the file does not end with a trailer line).
            if len(fields) == 11 and int(fields[5]) in target_IDs:
                atom_rows.append(fields[5:])

    df_PDB_pqr = pd.DataFrame(atom_rows, columns=column_names)

    # convert datatype
    df_PDB_pqr['Res ID'] = df_PDB_pqr['Res ID'].astype(int)
    float_columns = ['x', 'y', 'z', 'Charge', 'Radius']
    df_PDB_pqr[float_columns] = df_PDB_pqr[float_columns].astype(float)
    return df_PDB_pqr
    

In [62]:
def plot_PDB(PDBID, df_PDB):
    """Write an interactive 3D scatter plot of a protein's selected atoms.

    One trace is created per distinct 'Res ID', named '<Res Name> <Res ID>',
    and the figure is saved as ``sample_graphs/<PDBID>.html``.

    Parameters
    ----------
    PDBID : str
        PDB identifier; used upper-cased as the title and verbatim in the
        output file name.
    df_PDB : pandas.DataFrame
        Per-atom table with columns 'Res ID', 'Res Name', 'x', 'y' and 'z'.
    """
    import os

    data = []

    for ID in list(df_PDB['Res ID'].unique()):
        # Select this residue's atoms once and reuse them for coordinates and
        # hover text. Building the hover text from the whole frame would
        # label the points with the values of unrelated rows.
        residue_atoms = df_PDB.loc[df_PDB['Res ID'] == ID]
        res_name = residue_atoms['Res Name'].iloc[0]
        trace = go.Scatter3d(
            x=residue_atoms['x'],
            y=residue_atoms['y'],
            z=residue_atoms['z'],

            mode='markers',
            marker=dict(
                size=3,
                colorscale='Viridis',
            ),
            name= res_name+' '+str(ID),
            # hover text for each point of this trace
            text= [f"x: {a}<br>y: {b}<br>z: {c}<br>res: {d} {e} "
                   for a,b,c,d,e in zip(residue_atoms['x'], residue_atoms['y'], residue_atoms['z'],
                                        residue_atoms['Res Name'], residue_atoms['Res ID'])],
            # show only the custom text on hover
            hoverinfo='text'
        )
        data.append(trace)

    layout = dict(title = PDBID.upper(),)

    # Make sure the output directory exists before the HTML file is written.
    os.makedirs('sample_graphs', exist_ok=True)

    F = dict(data=data, layout=layout)
    py.offline.plot(F, filename = 'sample_graphs/' +PDBID + '.html')
    

In [63]:
def analyze_PDB(PDBID):
    """Run the full pipeline for one protein: load, join and plot.

    The pKa table from ``read_csv`` and the atom records from ``read_pqr``
    are inner-joined on 'Res ID' and the result is passed to ``plot_PDB``.
    """
    pka_table = read_csv(PDBID)
    atom_table = read_pqr(PDBID, pka_table)

    # One row per atom, carrying the pKa columns of the residue it belongs to.
    atoms_with_pka = pka_table.merge(atom_table, on=['Res ID'], how='inner')

    plot_PDB(PDBID, atoms_with_pka)

In [None]:
# PDB IDs of the sample proteins; one plot is produced for each of them.
sample_PDB = [
    '1bf4', '1bpi', '1igd', '1pga', '1pgb',
    '2ci2', '2ovo', '2qmt', '3ebx', '4pti',
]
for p in sample_PDB:
    analyze_PDB(p)

<hr style="border:1px solid gray"> </hr>

<hr style="border:1px solid gray"> </hr>

<hr style="border:1px solid gray"> </hr>

<hr style="border:1px solid gray"> </hr>

<hr style="border:1px solid gray"> </hr>