In [1]:
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from evolvepro.src.data import load_dms_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel, Matern
from sklearn.metrics import mean_absolute_error, r2_score
import pca_visualisation

In [42]:
# Load the embeddings and labels for the configured dataset / model pair.
dataset_name = "jones"
model_name = "esm1b_t33_650M_UR50S"
embeddings_path = "output/dms/embeddings"
labels_path = "output/dms/labels"
embeddings_file_type = "csv"
embeddings_type_pt = "average"

embeddings, labels = load_dms_data(
    dataset_name,
    model_name,
    embeddings_path,
    labels_path,
    embeddings_file_type,
    embeddings_type_pt,
)

# Index the labels by variant while keeping 'variant' available as a column.
labels = labels.set_index('variant', drop=False)

# Keep only the first 1000 rows of the embedding table.
embeddings = embeddings.iloc[:1000]
embeddings

Embeddings and labels are aligned


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
A119C,-0.122242,0.179708,0.087274,-0.025090,0.038957,-0.179957,-0.265103,0.132534,-0.164186,-0.095780,...,0.124604,-0.065324,-0.134201,-0.066585,-0.034155,0.229733,0.030430,0.083601,-0.073032,-0.000493
A119D,-0.124071,0.182009,0.091537,-0.030904,0.041917,-0.182394,-0.267248,0.136030,-0.167182,-0.096932,...,0.122950,-0.066489,-0.140128,-0.069989,-0.033192,0.232354,0.032388,0.076987,-0.069964,-0.001166
A119E,-0.128164,0.181032,0.092497,-0.030684,0.043361,-0.183298,-0.269932,0.135941,-0.168104,-0.096879,...,0.122443,-0.064631,-0.139167,-0.071877,-0.031815,0.233808,0.030335,0.078894,-0.069249,-0.001197
A119F,-0.124513,0.176952,0.087901,-0.026231,0.034637,-0.185819,-0.269177,0.130894,-0.166694,-0.095545,...,0.118637,-0.065825,-0.134654,-0.066660,-0.039572,0.226804,0.029643,0.081169,-0.072838,-0.004586
A119G,-0.123095,0.182296,0.087785,-0.027831,0.043948,-0.178023,-0.262697,0.130036,-0.169663,-0.095976,...,0.123887,-0.067387,-0.132443,-0.069620,-0.027285,0.229163,0.031673,0.082154,-0.074800,-0.003709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D410N,-0.128026,0.180692,0.092301,-0.029117,0.046686,-0.183159,-0.269162,0.135114,-0.167410,-0.096108,...,0.126412,-0.066905,-0.136719,-0.069978,-0.032681,0.234282,0.030604,0.083435,-0.072538,0.000146
D410P,-0.126524,0.180602,0.093964,-0.028538,0.048755,-0.181507,-0.274156,0.132717,-0.166532,-0.096384,...,0.125958,-0.069434,-0.134918,-0.069817,-0.030840,0.236309,0.031347,0.084824,-0.074708,-0.000235
D410Q,-0.127596,0.180010,0.093224,-0.029471,0.048839,-0.182952,-0.272885,0.136663,-0.166648,-0.095799,...,0.126935,-0.068283,-0.135574,-0.069417,-0.033540,0.234248,0.031060,0.085120,-0.074245,-0.000479
D410R,-0.126604,0.178888,0.094529,-0.029755,0.048842,-0.181530,-0.273819,0.135348,-0.167357,-0.096234,...,0.127477,-0.071593,-0.135643,-0.071091,-0.034883,0.231742,0.035529,0.085570,-0.071698,-0.000946


In [43]:
# Hold out 20% of the rows for evaluation. Labels are looked up by the
# embeddings' index so features and targets stay row-aligned.
# The split is seeded (same seed as the GP estimator) so that a fresh
# Restart & Run All reproduces the same partition and therefore the same metrics.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels.loc[embeddings.index.tolist(), 'activity'],
    test_size=0.2,
    random_state=42,
)

In [44]:
# Estimate the scaling statistics on the training split only, then apply the
# same fitted transform to both splits so no test information leaks into it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [45]:
# Kernel = (constant * RBF) + white noise. The noise term lives in the kernel,
# so the regressor's own diagonal jitter (`alpha`) is set to zero.
signal_kernel = ConstantKernel(1.0) * RBF(length_scale=1.0)
noise_kernel = WhiteKernel(noise_level=1)
kernel = signal_kernel + noise_kernel

gp_settings = dict(
    kernel=kernel,
    alpha=0.0,
    n_restarts_optimizer=10,  # extra optimizer restarts for the kernel hyperparameters
    random_state=42,
)
gp = GaussianProcessRegressor(**gp_settings)
gp.fit(X_train, y_train)

In [46]:
# Predict on the held-out split; return_std=True also yields the per-sample
# predictive standard deviation alongside the mean prediction.
y_pred, y_std = gp.predict(X=X_test, return_std=True)

In [49]:
# Inspect the held-out activity values (a Series indexed by variant).
y_test

variant
D21K     2.567281
A13E     1.756801
D113H    1.275165
C327A    2.370524
D380W    1.669986
           ...   
A134F    2.966066
A128G    2.616285
D192Q    2.621318
A200W    1.347461
A202Y    2.685137
Name: activity, Length: 200, dtype: float64

In [47]:
# Score the mean predictions against the held-out activities.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"R^2 Score: {r2: .2f}")
print(f"MAE: {mae: .2f}")

R^2 Score:  0.32
MAE:  0.48
