In [44]:
import os
import json
import re

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso

We want to build a LASSO (or an elastic net) model with AAindex1 inputs (averaged over the 5 residues) as input and binding free energies for ACT and FEN as output predictions. This should tell us how important each physicochemical property is.

Also we want to measure how accurate the GP is in predicting __training__ data and data it hasn't seen. We can then compare what's what

In [2]:
def load_json_res(pcc, data_dir):
    with open(f"{data_dir}/{pcc}_FEN.JSON") as f:
        rep = json.load(f)
    F_fen = rep["FE"]
    F_fen_err = rep["FE_error"]

    with open(f"{data_dir}/{pcc}_DEC.JSON") as f:
        rep = json.load(f)
    F_dec = rep["FE"]
    F_dec_err = rep["FE_error"]
    return {"PCC": [rep["PCC"]], "F_FEN": [float(F_fen)], "err_FEN": [float(F_fen_err)],
             "F_DEC": [float(F_dec)], "err_DEC": [float(F_dec_err)]}

def load_data(data_dir):
    PCC_list = []
    for folder in os.listdir(data_dir):
        if re.match("[A-Z]{5}_[A-Z]{3}", folder):
            PCC_list.append(folder.split("_")[0])

    PCC_list = set(PCC_list)
    data = []
    for pcc in PCC_list:
        try:
            data.append(pd.DataFrame(load_json_res(pcc, data_dir)))
        except:
            print(f"Skipping {pcc}.")

    data = pd.concat(data)
    data.reset_index(inplace=True, drop=True)
    return data

In [None]:
dataset = load_data("../MD_results")
AAindex1 = pd.read_csv("./AAindex1.csv", index_col=0)

In [4]:
dataset

Unnamed: 0,PCC,F_FEN,err_FEN,F_DEC,err_DEC
0,YSWWW,-7.683302,0.189574,-6.550750,0.491905
1,YSAWW,-9.607431,0.377497,-6.096457,0.145546
2,GAGNG,-6.156420,0.212095,-5.845184,0.182844
3,HWWTV,-6.540684,0.360128,-5.688493,0.107787
4,HHHHH,-7.140286,0.250316,-5.715487,0.230683
...,...,...,...,...,...
145,GGAGP,-6.765136,0.284704,-8.913308,0.868681
146,WVRSP,-5.900970,0.275811,-6.163757,0.172646
147,GHGGF,-8.595857,1.034173,-9.761471,1.094273
148,YADAL,-5.775274,0.379505,-6.256796,0.617810


In [5]:
AAindex1

Unnamed: 0,property,A,R,N,D,C,Q,E,G,H,...,L,K,M,F,P,S,T,W,Y,V
0,alpha-CH chemical shifts,4.35,4.380,4.750,4.760,4.650,4.370,4.290,3.97,4.630,...,4.170,4.360,4.520,4.660,4.44,4.500,4.350,4.700,4.600,3.95
1,Hydrophobicity index,0.61,0.600,0.060,0.460,1.070,0.000,0.470,0.07,0.610,...,1.530,1.150,1.180,2.020,1.95,0.050,0.050,2.650,1.880,1.32
2,Signal sequence helical potential,1.18,0.200,0.230,0.050,1.890,0.720,0.110,0.49,0.310,...,3.230,0.060,2.670,1.960,0.76,0.970,0.840,0.770,0.390,1.08
3,Membrane-buried preference parameters,1.56,0.450,0.270,0.140,1.230,0.510,0.230,0.62,0.290,...,2.930,0.150,2.960,2.030,0.76,0.810,0.910,1.080,0.680,1.14
4,Conformational parameter of inner helix,1.00,0.520,0.350,0.440,0.060,0.440,0.730,0.35,0.600,...,1.000,0.600,1.000,0.600,0.06,0.350,0.440,0.730,0.440,0.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,Average weighted atomic number or degree based...,6.00,5.000,6.600,6.800,9.330,6.500,6.670,3.50,4.700,...,6.000,6.170,8.000,6.000,6.00,7.330,5.400,5.667,6.220,6.00
511,Weighted maximum eigenvalue based on the atomi...,12.00,23.343,27.708,28.634,28.000,27.831,28.731,7.00,24.243,...,25.021,22.739,31.344,26.993,24.00,20.000,23.819,29.778,28.252,24.00
512,Weighted minimum eigenvalue based on the atomi...,0.00,0.000,0.000,0.000,0.000,0.000,0.000,0.00,-1.734,...,0.000,-0.179,0.000,0.000,0.00,0.000,-4.227,0.211,-0.960,0.00
513,Weighted average eigenvalue based on the atomi...,6.00,10.667,10.000,10.400,11.333,10.500,10.667,3.50,10.400,...,9.600,10.167,13.600,12.000,12.00,8.667,9.000,12.750,12.222,9.00


In [6]:
lasso_data = {"PCC": []}
for pp in AAindex1.property:
    lasso_data[pp] = []

In [7]:
for pcc in dataset.PCC:
    lasso_data["PCC"].append(pcc)
    for pp in AAindex1.property:
        pps = []
        for aa in pcc:
            pps.append(AAindex1[AAindex1.property == pp][aa].values[0])
        nancnt = np.count_nonzero(np.isnan(pps))
        if nancnt == 5:
            lasso_data[pp].append(0.0)
        else:
            lasso_data[pp].append(np.nanmean(pps))

In [8]:
lasso_data = pd.DataFrame(lasso_data)
lasso_data.head(10)

Unnamed: 0,PCC,alpha-CH chemical shifts,Hydrophobicity index,Signal sequence helical potential,Membrane-buried preference parameters,Conformational parameter of inner helix,Conformational parameter of beta-structure,Conformational parameter of beta-turn,Average flexibility indices,Residue volume,...,Weighted domination number using the atomic number,Average weighted eccentricity based on the the atomic number,Weighted radius based on the atomic number,Weighted diameter based on the atomic number,Total weighted atomic number of the graph,Average weighted atomic number or degree based on atomic number in the graph,Weighted maximum eigenvalue based on the atomic numbers,Weighted minimum eigenvalue based on the atomic numbers,Weighted average eigenvalue based on the atomic numbers,Weighted second smallest eigenvalue of the weighted Laplacian matrix
0,YSWWW,4.64,1.976,0.734,0.946,0.596,0.738,0.918,0.3684,115.46,...,19.2,24.722,16.4,33.2,56.4,6.1102,27.5172,-0.0654,11.8278,2.7462
1,YSAWW,4.57,1.568,0.816,1.042,0.65,0.738,0.798,0.3788,98.9,...,15.6,20.422,14.0,27.2,45.2,6.1768,23.9616,-0.1076,10.4778,2.3374
2,GAGNG,4.202,0.176,0.576,0.738,0.48,0.654,0.85,0.4904,47.44,...,4.2,6.6,4.6,8.8,13.2014,4.62,12.1416,0.0,5.3,0.6
3,HWWTV,4.466,1.456,0.754,0.9,0.664,0.836,0.762,0.3526,103.8,...,15.0,20.2,13.6,25.8,46.8014,5.4868,26.3236,-1.1078,10.78,3.5386
4,HHHHH,4.63,0.61,0.31,0.29,0.6,0.83,0.75,0.323,91.9,...,15.0,23.1,18.0,31.0,47.0,4.7,24.243,-1.734,10.4,1.605
5,HWWVH,4.522,1.568,0.648,0.776,0.696,0.836,0.762,0.3284,107.94,...,16.8,22.34,15.6,29.2,50.8014,5.3468,26.4084,-0.6092,11.06,2.6596
6,REYHA,4.45,0.834,0.438,0.642,0.658,0.74,0.666,0.4252,90.9,...,14.0,21.8648,15.6,27.8,40.0,5.718,23.3138,-0.5388,9.9912,1.8452
7,HWWTW,4.616,1.722,0.692,0.888,0.646,0.794,0.882,0.3364,113.86,...,18.6,23.6,16.0,30.6,55.6,5.4202,27.4792,-1.0656,11.53,2.7474
8,HWWWA,4.616,1.834,0.76,1.018,0.758,0.782,0.806,0.319,110.14,...,18.6,22.32,15.6,29.0,52.6,5.5402,25.1154,-0.2202,10.93,1.5474
9,FTYWD,4.614,1.412,0.802,0.968,0.53,0.812,0.812,0.3988,101.02,...,15.6,21.466,15.2,26.4,46.6,6.0174,27.4952,-0.9952,11.2744,2.9276


In [49]:
X_fit = []
Y_fit_FEN = []
Y_fit_ACT = []

for pcc in dataset.PCC:
    X_fit.append(lasso_data[lasso_data.PCC == pcc].values[0][1:])
    Y_fit_ACT.append(dataset[dataset.PCC == pcc].F_DEC.values[0])
    Y_fit_FEN.append(dataset[dataset.PCC == pcc].F_FEN.values[0])

X_fit = np.asarray(X_fit)
Y_fit_FEN = np.asarray(Y_fit_FEN).reshape(-1, 1)
Y_fit_ACT = np.asarray(Y_fit_ACT).reshape(-1, 1)

In [50]:
scaler = MinMaxScaler()
X_fit = scaler.fit_transform(X_fit)

# FEN

In [None]:
clf_FEN = Lasso(alpha=0.1)
clf_FEN.fit(X_fit, Y_fit_FEN)

In [61]:
nz_coeffs = clf_FEN.coef_.nonzero()[0]
for i in nz_coeffs:
    print(f"{lasso_data.columns[1:][i]}: {clf_FEN.coef_[i]}")

AA composition of CYT of multi-spanning proteins: 1.231003167495737
Bitterness: -0.4161982955270977
Amphiphilicity index: -0.6639233660773761
Apparent partition energies calculated from Robson-Osguthorpe index: 0.7703057787606897
Hydropathies of amino acid side chains, pi-values in pH 7.0: -0.038770712974890695


negative Amphiphilicity index mean more hydrophobic residues are prefered for better binding (more negative x means more negative y which is the binding free energy; negative good)
Apparent partition energy is the free energy of transferring a protein from aq solution to a non-polar environment. This is also a measure of hydrophobicity.
Hydropathy pi-values are measures of hydrophibicity (positive more hydrophobic). negative correlation again means more hydrophobic, better binding.

# ACT

In [66]:
clf_ACT = Lasso(alpha=0.1)
clf_ACT.fit(X_fit, Y_fit_ACT)

In [67]:
nz_coeffs = clf_ACT.coef_.nonzero()[0]
for i in nz_coeffs:
    print(f"{lasso_data.columns[1:][i]}: {clf_ACT.coef_[i]}")

Principal component II: -0.466389073734237
Slopes tripeptide FDPB PARSE neutral: 0.018783954501949352
