In [2]:
# imports
import numpy as np
import pandas as pd
import plotly as py
import plotly.graph_objs as go

### Data Processing on WT_pKa

In [None]:
WT_pka = pd.read_csv('WT_pka.csv')

In [None]:
WT_pka.info()

In [None]:
WT_pka.head()

In [None]:
# get rid of null columns due to file 
WT_pka.drop(WT_pka.columns[-4:], axis = 1, inplace = True)
WT_pka.head()

We are going to drop more columns that we are now not interested in.

In [None]:
WT_pka.drop(WT_pka.columns[-7:], axis = 1, inplace = True)
WT_pka.head()

In [None]:
is_NaN = WT_pka.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = WT_pka[row_has_NaN]
print(rows_with_NaN)

# This row does not have an experimental value, so we drop it
WT_pka.dropna(inplace = True)
WT_pka.isna().sum()

In [None]:
WT_pka['Res ID'] = WT_pka['Res ID'].astype(int)
WT_pka.head()

Process irregular values in Expt. pKa

In [None]:
# Create a new column 'Greater/Smaller' to keep record of Expt. pKa
WT_pka['Greater/Smaller'] = 0

WT_pka.loc[WT_pka['Expt. pKa'].str.contains(">"), 'Greater/Smaller'] = 1
WT_pka.loc[WT_pka['Expt. pKa'].str.contains("<"), 'Greater/Smaller'] = -1

WT_pka['Expt. pKa'] = WT_pka['Expt. pKa'].str.replace('>', '')
WT_pka['Expt. pKa'] = WT_pka['Expt. pKa'].str.replace('<', '')
WT_pka['Expt. pKa'] = WT_pka['Expt. pKa'].str.replace('~', '')

In [None]:
# There are two rows with two pKa valus, created a new row to store the second value
print(WT_pka[WT_pka['Expt. pKa'].str.contains(",")])
WT_pka['2nd pKa'] = 0.0
WT_pka[['Expt. pKa','2nd pKa']] = WT_pka['Expt. pKa'].str.split(',',expand=True)
WT_pka.loc[WT_pka['2nd pKa'] == 'None', '2nd pKa'] = '0'
WT_pka['Expt. pKa'] = WT_pka['Expt. pKa'].astype(float)

WT_pka['2nd pKa'] = WT_pka['2nd pKa'].astype(float)
WT_pka['2nd pKa'] = WT_pka['2nd pKa'].fillna(0)

WT_pka.info()

In [None]:
WT_pka.head()

<hr style="border:1px solid gray"> </hr>

### Data processing on individual proteins (pKa.csv and output.pqr)

#### First create a dataframe for theoretical pka values for future use

In [3]:
# theoretical value of proteins
theo_val = {'ARG': 12.0, 'ASP': 4.0, 'CYS': 9.5, 'GLU': 4.4, 'HIS': 6.3, 
               'LYS': 10.4, 'TYR': 9.6}

df_theo_val = pd.DataFrame(np.array([['ARG', 12.0], ['ASP', 4.0], ['CYS', 9.5], 
                                    ['GLU', 4.4], ['HIS', 6.3], ['LYS', 10.4], ['TYR', 9.6]]), 
                          columns = ['Res Name', 'pKa'])
df_theo_val

Unnamed: 0,Res Name,pKa
0,ARG,12.0
1,ASP,4.0
2,CYS,9.5
3,GLU,4.4
4,HIS,6.3
5,LYS,10.4
6,TYR,9.6


<hr style="border:1px solid gray"> </hr>

### We use 2ovo as an example

#### Read 2ovo pka file 

In [None]:
# rearrange pKa.csv, we use 2ovo as an example
df_2ovo = pd.read_csv('sample_data/2ovo/pKa.csv')
df_2ovo.info()

In [None]:
# We see that all the columns are now in one column, so we need to split them.
df_2ovo[list(df_2ovo.columns)[0].split()] = df_2ovo.iloc[:,0].str.split(expand=True)
df_2ovo.drop(df_2ovo.columns[0], axis = 1, inplace = True)

# Split the Res ID and Res Name from ResName
# "(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)" split digits and chars
df_2ovo[['Res Name', 'Res ID', 'Chain']] = df_2ovo.iloc[:,0].str.split("(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)", expand=True)
df_2ovo.drop(df_2ovo.columns[0], axis = 1, inplace = True)
df_2ovo['Res ID'] = df_2ovo['Res ID'].astype(int)
df_2ovo = df_2ovo[list(df_2ovo.columns)[-3:-1]+ list(df_2ovo.columns)[0:-3]]
df_2ovo = df_2ovo[list(df_2ovo.columns)[0:3]]

df_2ovo.head()

In [None]:
# Merge with theoretical values
df_2ovo.rename(columns={"pKa": "Expt. pKa"}, inplace=True)
df_2ovo = pd.merge(df_2ovo, df_theo_val, on=['Res Name'], how='inner')
df_2ovo

#### Read 2ovo pqr file 

In [None]:
file = open('sample_data/2ovo/output.pqr', 'r')
lines = file.readlines()
lines = lines[:-1]
file.close()
column_names = ['Res ID', 'x', 'y', 'z', 'Charge', 'Radius']
df_2ovo_pqr = pd.DataFrame(columns=column_names)
target_IDs = list(df_2ovo['Res ID'].unique().astype(int))
print(target_IDs)
i = 0
for line in lines:
    line = line.strip().split()
    if int(line[5]) in target_IDs:
        df_2ovo_pqr.loc[i] = line[5:] 
        i += 1
df_2ovo_pqr['Res ID'] = df_2ovo_pqr['Res ID'].astype(int)
df_2ovo_pqr[['x', 'y', 'z', 'Charge', 'Radius']] = df_2ovo_pqr[['x', 'y', 'z', 'Charge', 'Radius']].astype(float)
df_2ovo_pqr.head()

In [None]:
df_2ovo = pd.merge(df_2ovo, df_2ovo_pqr, on=['Res ID'], how='inner')
df_2ovo.head()

In [None]:

fig = go.Figure()

res_IDs = list(df_2ovo['Res ID'].unique())
data = []

for ID in res_IDs:
    res_name = list(df_2ovo.loc[(df_2ovo['Res ID']) == ID,'Res Name'].unique())[0]
    trace = go.Scatter3d(
        x=df_2ovo.loc[(df_2ovo['Res ID']) == ID,'x'],
        y=df_2ovo.loc[(df_2ovo['Res ID']) == ID,'y'],
        z=df_2ovo.loc[(df_2ovo['Res ID']) == ID,'z'],

        mode='markers',
        marker=dict(
            size=3,
            colorscale='Viridis',   
        ),
        name= res_name+' '+str(ID),

        # list comprehension to add text on hover
        text= [f"x: {a}<br>y: {b}<br>z: {c}" for a,b,c in list(zip(df_2ovo['x'], df_2ovo['y'], df_2ovo['z']))],
        # if you do not want to display x,y,z
        hoverinfo='text'
    )
    fig.add_trace(trace)
    data.append(trace)

layout = dict(title = 'TEST',)

F = dict(data=data, layout=layout)
py.offline.plot(F, filename = 'Test.html')


<hr style="border:1px solid gray"> </hr>

### For any PDBID

In [4]:
def read_csv(PDBID):
    df_PDB_csv = pd.read_csv('sample_data/' + PDBID.lower() + '/pKa.csv')
    
    # We see that all the columns are now in one column, so we need to split them.
    df_PDB_csv[list(df_PDB_csv.columns)[0].split()] = df_PDB_csv.iloc[:,0].str.split(expand=True)
    df_PDB_csv.drop(df_PDB_csv.columns[0], axis = 1, inplace = True)

    # Split the Res ID and Res Name from ResName
    # "(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)" split digits and chars
    df_PDB_csv[['Res Name', 'Res ID', 'Chain']] = df_PDB_csv.iloc[:,0].str.split("(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)", expand=True)
    df_PDB_csv.drop(df_PDB_csv.columns[0], axis = 1, inplace = True)
    df_PDB_csv['Res ID'] = df_PDB_csv['Res ID'].astype(int)
    df_PDB_csv = df_PDB_csv[list(df_PDB_csv.columns)[-3:-1]+ list(df_PDB_csv.columns)[0:-3]]
    df_PDB_csv = df_PDB_csv[list(df_PDB_csv.columns)[0:3]]
    
    # merge with theoretical values
    df_PDB_csv.rename(columns={"pKa": "Expt. pKa"}, inplace=True)
    df_PDB_csv = pd.merge(df_PDB_csv, df_theo_val, on=['Res Name'], how='inner')
    
    df_PDB_csv['Expt. pKa'] = df_PDB_csv['Expt. pKa'].astype(float)
    df_PDB_csv['pKa'] = df_PDB_csv['pKa'].astype(float)
    
    
    return df_PDB_csv

In [5]:
def read_pqr(PDBID, df_PDB_csv = None, flag = False):
    file = open('sample_data/' + PDBID.lower() + '/output.pqr', 'r')
    lines = file.readlines()
    lines = lines[:-1]
    file.close()
    
    column_names = ['Atom Name', 'Res Name', 'Res ID', 'x', 'y', 'z', 'Charge', 'Radius']
    df_PDB_pqr = pd.DataFrame(columns=column_names)
    if flag:
        target_IDs = list(df_PDB_csv['Res ID'].unique().astype(int))

    i = 0
    
    # find corresponding res ID in pqr file
    for line in lines:
        line = line.strip().split()
        if len(line) == 11:
            if flag == False:
                df_PDB_pqr.loc[i] = [line[2]] + [line[3]] + line[5:]
            elif ((flag) & (int(line[5]) in target_IDs)):
                df_PDB_pqr.loc[i] = [line[2]] + [line[3]] + line[5:]
            i += 1
            
    # convert datatype
    df_PDB_pqr['Res ID'] = df_PDB_pqr['Res ID'].astype(int)
    df_PDB_pqr[['x', 'y', 'z', 'Charge', 'Radius']] = df_PDB_pqr[['x', 'y', 'z', 'Charge', 'Radius']].astype(float)
    df_PDB_pqr.head()
    return df_PDB_pqr
    

## Visualization on a protein

In [None]:
def plot_PDB(PDBID, df_PDB):
    
    fig = go.Figure()

    res_IDs = list(df_PDB['Res ID'].unique())
    data = []

    for ID in res_IDs:
        res_name = list(df_PDB.loc[(df_PDB['Res ID']) == ID,'Res Name'].unique())[0]
        trace = go.Scatter3d(
            x=df_PDB.loc[(df_PDB['Res ID']) == ID,'x'],
            y=df_PDB.loc[(df_PDB['Res ID']) == ID,'y'],
            z=df_PDB.loc[(df_PDB['Res ID']) == ID,'z'],

            mode='markers',
            marker=dict(
                size=3,
                colorscale='Viridis',   
            ),
            name = res_name + ' ' + str(ID),
            # list comprehension to add text on hover
            text = [f"x: {a}<br>y: {b}<br>z: {c}<br>res: {d}" 
                   for a,b,c,d in list(zip(df_PDB['x'], df_PDB['y'], df_PDB['z'], [res_name + ' ' + str(ID)]*len(df_PDB['x']))) ],
            # if you do not want to display x,y,z
            hoverinfo='text'
        )
        fig.add_trace(trace)
        data.append(trace)

    layout = dict(title = PDBID.upper(),)

    F = dict(data=data, layout=layout)
    py.offline.plot(F, filename = 'sample_graphs/' +PDBID + '2.html')
    

In [None]:
def analyze_PDB(PDBID):
    df_PDB_csv = read_csv(PDBID)
    df_PDB_pqr = read_pqr(PDBID, df_PDB_csv, flag = True)
    # merge csv and pqr
    df_PDB = pd.merge(df_PDB_csv, df_PDB_pqr, on=['Res ID', 'Res Name'], how='inner')
    plot_PDB(PDBID, df_PDB)

In [None]:
sample_data = ['1bf4', '1bpi', '1igd', '1pga', '1pgb', '2ci2', '2ovo', '2qmt', '3ebx', '4pti']
# for PDBID in sample_data:
#     analyze_PDB(PDBID)
df_PDB_pqr = read_pqr('1bf4')
# plot_PDB('1bf4', df_PDB_pqr)

<hr style="border:1px solid gray"> </hr>

### Preprocess Data for Prediction
- Purpose here is to analyze the same amino acid and see if there's a pattern even amoung different proteins
- We will first use LYS and our sample_data as an experiment. Things we need to do:
    - Calculate Coulomb force on each LYS atom from all the other atoms (since looping in python is terrible, we might use matrix?)
    - We need to extract all rows of LYS from our sample proteins
    - The features that we are interested in are 'Atom name', 'Res Name', 'Res ID', 'x', 'y', 'z', 'Charge', 'Radius'
    - One observation is that for the same atom, its charge and radius are the same. 
        - Need to confirm if it's true
        - We could analyze whether the prediction behaves differently if we replace the numerical value with only the atom if we decide whether it is discrete or continuous

In [6]:
target_AA = 'ARG'
sample_data = ['1bf4', '1bpi', '1igd', '1pga', '1pgb', '2ci2', '2ovo', '2qmt', '3ebx', '4pti']

In [7]:
def extract_with_AA(PDBID, Amino_Acid):
    df_PDB_pqr_all = read_pqr(PDBID)
    df_PDB_csv = read_csv(PDBID)
    df_PDB_pqr = read_pqr(PDBID, df_PDB_csv)
    df_PDB = pd.merge(df_PDB_csv, df_PDB_pqr, on=['Res ID', 'Res Name'], how='inner')
    target_rows = df_PDB.loc[(df_PDB['Res Name'] == Amino_Acid)]
    return target_rows, df_PDB_pqr_all

In [8]:
def calculate_coulomb_force(target_row, df_PDB_pqr_all):
    coulomb_force = 0
    x = target_row['x']
    y = target_row['y']
    z = target_row['z']
    v_target = np.array((x,y,z))
    for index, row in df_PDB_pqr_all.iterrows():
        v = np.array((row['x'], row['y'], row['z']))
        d = v_target - v
        dist = (d @ d)**.5
#         dist = np.sqrt((row['x'] - x)**2 + (row['y'] - y)**2 + (row['z'] - z)**2)
        if dist == 0:
            continue
        coulomb_force = coulomb_force + row['Charge']/dist
    return coulomb_force

In [9]:
def arrange_df(PDBID):
    df_PDB, df_PDB_pqr_all = extract_with_AA(PDBID, target_AA)
    df_PDB['Columb Force'] = 0
    for index, row in df_PDB.iterrows():
        df_PDB.loc[index, 'Columb Force'] = calculate_coulomb_force(row, df_PDB_pqr_all)
    df_PDB['PDBID'] = PDBID.upper()
    return df_PDB

In [18]:
# find the same Amino Acid in all PDB
def concat_DFs(sample_data):
    first = True
    for PDBID in sample_data:
        df_PDB = arrange_df(PDBID)
        # rearrange columns
        df_PDB = df_PDB[[list(df_PDB.columns)[-1]] + [list(df_PDB.columns)[4]] + 
                          list(df_PDB.columns)[0:2] + list(df_PDB.columns)[-7:-1] + 
                          list(df_PDB.columns)[2:4]]
        if first:
            df_AA = pd.concat([df_PDB])
            first = False
        else:
            df_AA = pd.concat([df_AA, df_PDB])
    return df_AA

In [None]:
%%time
df_ASP = concat_DFs(sample_data)

In [None]:
pd.set_option('display.max_rows', None)
# df_ASP.reset_index(drop=True)
df_ASP = df_ASP.reset_index(drop=True)

In [None]:
df_ASP_copy = df_ASP.copy()

In [None]:
df_ASP_copy['Diff'] = 0
df_ASP_copy['Diff'] = df_ASP_copy['Expt. pKa'] - df_ASP_copy['pKa']

In [None]:
pivot = (max(df_ASP_copy['Expt. pKa']) + min(df_ASP_copy['Expt. pKa']))/2
print(pivot)

In [None]:
df_ASP_copy['Target'] = 0
df_ASP_copy.loc[df_ASP_copy['Expt. pKa'] >= pivot, 'Target'] = 1
df_ASP_copy.loc[df_ASP_copy['Expt. pKa'] < pivot, 'Target'] = 0

In [None]:
df_ASP_copy.rename(columns={"Columb Force": "Columb_Force"}, inplace=True)
df_ASP_copy.rename(columns={"Atom Name": "Atom_Name"}, inplace=True)

<hr style="border:1px solid gray"> </hr>

In [22]:
def rearrange_df_aa(df_AA):
    df_AA = df_AA.reset_index(drop=True)
    df_AA['Diff'] = 0
    df_AA['Diff'] = df_AA['Expt. pKa'] - df_AA['pKa']
    
    # find pivot for target
    pivot = (max(df_AA['Expt. pKa']) + min(df_AA['Expt. pKa']))/2
    
    df_AA['Target'] = 0
    df_AA.loc[df_AA['Expt. pKa'] >= pivot, 'Target'] = 1
    df_AA.loc[df_AA['Expt. pKa'] < pivot, 'Target'] = 0
    
    # rename column -- should be done at the beginning of data processing*******
    df_AA.rename(columns={"Columb Force": "Columb_Force"}, inplace=True)
    df_AA.rename(columns={"Atom Name": "Atom_Name"}, inplace=True)
    return df_AA

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def LR_pred(train_x, train_y, test_x, test_y):

    lr_sk = LogisticRegression(solver='liblinear') # all params default

    lr_sk.fit(train_x,train_y)
    yhat = lr_sk.predict(test_x)
    return accuracy_score(test_y, yhat)

In [12]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

def input_fn(features, labels, training=True, batch_size=256):
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle and repeat if you are in training mode.
    if training:
        dataset = dataset.shuffle(1000).repeat()
    
    return dataset.batch(batch_size)

In [15]:
def DNN_pred(train_x, train_y, test_x, test_y):
    my_feature_columns = []
    for key in train_x.keys():
        my_feature_columns.append(tf.feature_column.numeric_column(key=key))
    
    # Build a DNN with 3 hidden layers with 30, 20 and 10 hidden nodes each.
    classifier = tf.estimator.DNNClassifier(
        feature_columns=my_feature_columns,
        # Two hidden layers of 30 and 10 nodes respectively.
        hidden_units=[30, 20, 10],
        # The model must choose between 2 classes.
        n_classes=2)
    
    classifier.train(
        input_fn=lambda: input_fn(train_x, train_y, training=True),
        steps=5000)
    
    eval_result = classifier.evaluate(
        input_fn=lambda: input_fn(test_x, test_y, training=False))
    
    return eval_result['accuracy']

In [56]:
def pred_by_res(target_AA):
    df_AA = concat_DFs(sample_data)
    if df_AA.empty:
        return -1, -1;
    df_AA = rearrange_df_aa(df_AA)

    train, test = train_test_split(df_AA, test_size=0.2)
    
    feature_columns = ['x', 'y', 'z', 'Charge', 'Radius', 'Columb_Force']
    train_x = train[feature_columns]
    train_y = train['Target']

    test_x = test[feature_columns]
    test_y = test['Target']
    
    LR_accuracy = LR_pred(train_x, train_y, test_x, test_y)
    DNN_accuracy = DNN_pred(train_x, train_y, test_x, test_y)
    
    return LR_accuracy, DNN_accuracy

In [59]:
from sklearn.model_selection import train_test_split
from timeit import default_timer as timer

list_AA = ['ARG', 'ASP', 'CYS', 'GLU', 'HIS', 'LYS', 'TYR']
skipped = []
accuracy = []
time = []
for target_AA in list_AA:
    
    start = timer()
    LR_accuracy, DNN_accuracy = pred_by_res(target_AA)
    end = timer()
    if LR_accuracy == -1:
        skipped.append(target_AA)
        continue;
    time.append(end-start)
    accuracy.append([target_AA, LR_accuracy, DNN_accuracy])

print(time)
print(accuracy)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/bg/sjf18vvj1l30vdgzyw_lhnhw0000gn/T/tmp2g_q7ifp', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change all layers to have dtype fl

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-03-06T16:02:04Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/bg/sjf18vvj1l30vdgzyw_lhnhw0000gn/T/tmp2g_q7ifp/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 0.46531s
INFO:tensorflow:Finished evaluation at 2021-03-06-16:02:05
INFO:tensorflow:Saving dict for global step 5000: accuracy = 0.8301887, accuracy_baseline = 0.6037736, auc = 0.9296875, auc_precision_recall = 0.91244763, average_loss = 0.3301239, global_step = 5000, label/mean = 0.3962264, loss = 0.3301239, precision = 0.77272725, prediction/mean = 0.42372614, recall = 0.8095238
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 5000: /var/folders/bg/sjf18vvj1l30vdgzyw_lhnhw0000gn/T/tmp2g_q7ifp/model.ckpt-5000
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/

INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 5000...
INFO:tensorflow:Loss for final step: 0.4750985.
INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-03-06T16:03:40Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/bg/sjf18vvj1l30vdgzyw_lhnhw0000gn/T/tmptp1gvuan/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 0.44607s
INFO:tensorflow:Finished evaluation at 2021-03-06-16:03:40
INFO:tensorflow:Saving dict for global step 5000: accuracy = 0.6966292, accuracy_baseline = 0.6067415

INFO:tensorflow:global_step/sec: 650.754
INFO:tensorflow:loss = 0.4392882, step = 4300 (0.154 sec)
INFO:tensorflow:global_step/sec: 638.114
INFO:tensorflow:loss = 0.46983188, step = 4400 (0.157 sec)
INFO:tensorflow:global_step/sec: 640.676
INFO:tensorflow:loss = 0.45480743, step = 4500 (0.156 sec)
INFO:tensorflow:global_step/sec: 649.575
INFO:tensorflow:loss = 0.45220304, step = 4600 (0.154 sec)
INFO:tensorflow:global_step/sec: 639.5
INFO:tensorflow:loss = 0.4672308, step = 4700 (0.156 sec)
INFO:tensorflow:global_step/sec: 634.341
INFO:tensorflow:loss = 0.46169722, step = 4800 (0.158 sec)
INFO:tensorflow:global_step/sec: 647.962
INFO:tensorflow:loss = 0.442174, step = 4900 (0.154 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 5000...
INFO:tensorflow:Saving checkpoints for 5000 into /var/folders/bg/sjf18vvj1l30vdgzyw_lhnhw0000gn/T/tmph63ry04p/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 5000...
INFO:tensorflow:Loss for fina

INFO:tensorflow:global_step/sec: 658.069
INFO:tensorflow:loss = 0.21860951, step = 3400 (0.152 sec)
INFO:tensorflow:global_step/sec: 659.294
INFO:tensorflow:loss = 0.21364745, step = 3500 (0.152 sec)
INFO:tensorflow:global_step/sec: 670.997
INFO:tensorflow:loss = 0.20881946, step = 3600 (0.149 sec)
INFO:tensorflow:global_step/sec: 667.077
INFO:tensorflow:loss = 0.21586095, step = 3700 (0.150 sec)
INFO:tensorflow:global_step/sec: 661.568
INFO:tensorflow:loss = 0.20220113, step = 3800 (0.151 sec)
INFO:tensorflow:global_step/sec: 669.259
INFO:tensorflow:loss = 0.2044538, step = 3900 (0.149 sec)
INFO:tensorflow:global_step/sec: 657.876
INFO:tensorflow:loss = 0.19834119, step = 4000 (0.152 sec)
INFO:tensorflow:global_step/sec: 643.39
INFO:tensorflow:loss = 0.19982848, step = 4100 (0.155 sec)
INFO:tensorflow:global_step/sec: 662.906
INFO:tensorflow:loss = 0.19524069, step = 4200 (0.151 sec)
INFO:tensorflow:global_step/sec: 667.824
INFO:tensorflow:loss = 0.1908169, step = 4300 (0.150 sec)
INF

INFO:tensorflow:global_step/sec: 627.294
INFO:tensorflow:loss = 0.44271308, step = 2500 (0.159 sec)
INFO:tensorflow:global_step/sec: 624.906
INFO:tensorflow:loss = 0.40674892, step = 2600 (0.160 sec)
INFO:tensorflow:global_step/sec: 628.654
INFO:tensorflow:loss = 0.39554304, step = 2700 (0.159 sec)
INFO:tensorflow:global_step/sec: 629.537
INFO:tensorflow:loss = 0.4168261, step = 2800 (0.159 sec)
INFO:tensorflow:global_step/sec: 624.726
INFO:tensorflow:loss = 0.42264438, step = 2900 (0.160 sec)
INFO:tensorflow:global_step/sec: 631.845
INFO:tensorflow:loss = 0.4176863, step = 3000 (0.158 sec)
INFO:tensorflow:global_step/sec: 630.191
INFO:tensorflow:loss = 0.4214337, step = 3100 (0.159 sec)
INFO:tensorflow:global_step/sec: 618.069
INFO:tensorflow:loss = 0.41235554, step = 3200 (0.162 sec)
INFO:tensorflow:global_step/sec: 620.745
INFO:tensorflow:loss = 0.40885898, step = 3300 (0.161 sec)
INFO:tensorflow:global_step/sec: 612.471
INFO:tensorflow:loss = 0.40746403, step = 3400 (0.163 sec)
INF

In [62]:
print(sum(time)/60)

9.28843747204998


In [72]:

df_accuracy = pd.DataFrame(accuracy, 
                          columns = ['Residue Name', 'Accuracy of LR(%)', 'Accuracy of DNN(%)'])
df_accuracy['Accuracy of LR(%)'] = df_accuracy['Accuracy of LR(%)'].astype(float) * 100
df_accuracy['Accuracy of DNN(%)'] = df_accuracy['Accuracy of DNN(%)'].astype(float) * 100
df_accuracy.round(2)

Unnamed: 0,Residue Name,Accuracy of LR(%),Accuracy of DNN(%)
0,ARG,79.25,83.02
1,ASP,73.03,69.66
2,GLU,59.57,78.72
3,HIS,72.73,72.73
4,LYS,76.95,84.01


In [60]:
import plotly.express as px
import plotly.graph_objs as go

accuracy = np.array(accuracy)

fig = go.Figure()
fig.add_trace(go.Scatter(x=list_AA, y=accuracy[:,1],
                    mode='lines',
                    name='Logistice Regression'))

fig.add_trace(go.Scatter(x=list_AA, y=accuracy[:,2],
                    mode='lines',
                    name='DNN'))

fig.update_xaxes(tickangle=0)
fig.show()


<hr style="border:1px solid gray"> </hr>

#### Logistic regression: predict of Target

In [None]:

train, test = train_test_split(df_ASP_copy, test_size=0.2)

In [None]:
feature_columns = ['x', 'y', 'z', 'Charge', 'Radius', 'Columb_Force']
train_x = train[feature_columns]
train_y = train['Target']

test_x = test[feature_columns]
test_y = test['Target']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_sk = LogisticRegression(solver='liblinear') # all params default

lr_sk.fit(train_x,train_y)
print(np.hstack((lr_sk.intercept_[:,np.newaxis],lr_sk.coef_)))
yhat = lr_sk.predict(test_x)
print('Accuracy of: ', accuracy_score(test_y, yhat))

#### DNN

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

In [None]:
def input_fn(features, labels, training=True, batch_size=256):
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle and repeat if you are in training mode.
    if training:
        dataset = dataset.shuffle(1000).repeat()
    
    return dataset.batch(batch_size)

In [None]:
my_feature_columns = []
for key in train_x.keys():
    my_feature_columns.append(tf.feature_column.numeric_column(key=key))
print(my_feature_columns)

In [None]:
# Build a DNN with 2 hidden layers with 30 and 10 hidden nodes each.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 30 and 10 nodes respectively.
    hidden_units=[30, 20, 10],
    # The model must choose between 2 classes.
    n_classes=2)

In [None]:
classifier.train(
    input_fn=lambda: input_fn(train_x, train_y, training=True),
    steps=5000)

In [None]:
eval_result = classifier.evaluate(
    input_fn=lambda: input_fn(test[feature_columns], test['Target'], training=False))

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

<hr style="border:1px solid gray"> </hr>

<hr style="border:1px solid gray"> </hr>

<hr style="border:1px solid gray"> </hr>

<hr style="border:1px solid gray"> </hr>