In [73]:
from typing import Optional, Tuple

import pandas as pd
from biopandas.pdb import PandasPdb
from prody import parsePDBHeader

def read_pdb_to_dataframe(
    pdb_path: Optional[str] = None,
    model_index: int = 1,
    parse_header: bool = True,
    ) -> Tuple[pd.DataFrame, Optional[dict]]:
    """
    Read a PDB file and return its atom records together with the parsed header.

    Args:
        pdb_path (str, optional): Path to a local PDB file to read. The parameter
            defaults to None for signature compatibility, but a path is required.
        model_index (int, optional): Index of the model to extract from the PDB file, in case
            it contains multiple models. Defaults to 1.
        parse_header (bool, optional): Whether to parse the PDB header and extract metadata.
            Defaults to True.

    Returns:
        tuple: ``(atoms, header)``. ``atoms`` is a pd.DataFrame holding the "ATOM" records
            followed by the "HETATM" records of the selected model, one row per atom.
            ``header`` is whatever ``prody.parsePDBHeader`` returns for the file, or None
            when ``parse_header`` is False.

    Raises:
        ValueError: If ``pdb_path`` is None, or if the selected model has no "ATOM" records.
    """
    # Fail early with a clear message instead of an obscure error inside the reader.
    if pdb_path is None:
        raise ValueError("pdb_path must be provided")

    atomic_df = PandasPdb().read_pdb(pdb_path)
    header = parsePDBHeader(pdb_path) if parse_header else None

    atomic_df = atomic_df.get_model(model_index)
    if len(atomic_df.df["ATOM"]) == 0:
        raise ValueError(f"No model found for index: {model_index}")

    return pd.concat([atomic_df.df["ATOM"], atomic_df.df["HETATM"]]), header


import numpy as np
import math

def k(x, y, sig=1):
    """Gaussian similarity of two points: exp(-0.5 * ||x - y||^2 / sig^2)."""
    distance = np.linalg.norm(x - y)
    scaled_sq = np.square(distance) / np.square(sig)
    return np.exp(-0.5 * scaled_sq)

def GaussianKernel(X, sig=1):
    """
    Build the Gaussian kernel matrix of the rows of ``X``, scaled by ``1 / n``.

    Entry ``(i, j)`` equals ``exp(-0.5 * ||X[i] - X[j]||^2 / sig^2) / n``, i.e. the
    pairwise kernel ``k(X[i], X[j], sig)`` divided by the number of rows.

    Args:
        X: Array whose first axis indexes the ``n`` points; any trailing axes are
            flattened into one feature vector per point.
        sig: Kernel width. Defaults to 1.

    Returns:
        np.ndarray: The ``(n, n)`` kernel matrix divided by ``n``.
    """
    X = np.asarray(X)
    n = X.shape[0]
    if n == 0:
        # No points: nothing to compare (and reshape(0, -1) would be ambiguous).
        return np.empty((0, 0))

    points = X.reshape(n, -1)
    # All pairwise differences at once; the temporary has shape (n, n, d).
    diffs = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    sq_dists = np.sum(np.square(diffs), axis=-1)
    return np.exp(-0.5 * sq_dists / np.square(sig)) / n

def get_approximation(A, r):
    """
    Return the rank-``r`` truncated SVD reconstruction of ``A``.

    Args:
        A: 2-D array to approximate.
        r: Number of leading singular triplets to keep. Values larger than
            ``min(A.shape)`` simply keep every triplet.

    Returns:
        np.ndarray: Array with the same shape as ``A`` built from the ``r`` largest
            singular values and their singular vectors.
    """
    # The reduced SVD is enough: only the leading singular vectors are used, and it
    # keeps the three factors shape-compatible for non-square A when r > min(A.shape).
    U, S, Vh = np.linalg.svd(A, full_matrices=False)
    # Scaling the columns of U by S avoids materialising a diagonal matrix.
    return (U[:, :r] * S[:r]) @ Vh[:r, :]

def NystromMatrix(K, m, l):
    """
    Assemble a Nystrom-style completion of ``K`` from its first ``m`` rows.

    The top-left ``m x m`` block and the top-right ``m x (N - m)`` block are taken
    from ``K`` unchanged. The bottom-right block is replaced by
    ``B.T @ pinv(A_l) @ B``, where ``A_l`` is the rank-``l`` approximation of the
    top-left block from ``get_approximation``. The bottom-right block of ``K``
    itself is never read.

    Args:
        K: Square matrix to approximate.
        m: Size of the leading block kept verbatim.
        l: Rank used for the approximation of the leading block before inversion.

    Returns:
        np.ndarray: Block matrix ``[[A, B], [B.T, B.T @ pinv(A_l) @ B]]``.
    """
    landmark = K[:m, :m]
    cross = K[:m, m:]

    landmark_pinv = np.linalg.pinv(get_approximation(landmark, l))
    completed = cross.T @ landmark_pinv @ cross
    return np.block([[landmark, cross], [cross.T, completed]])
    
def relative_error(A, A_tilda, norm):
    """Norm of ``A - A_tilda`` divided by the norm of ``A``; ``norm`` is passed to numpy as ``ord``."""
    residual_norm = np.linalg.norm(A - A_tilda, ord=norm)
    reference_norm = np.linalg.norm(A, ord=norm)
    return residual_norm / reference_norm

def procrustes(P1, P2):
    """
    Orthogonal alignment matrix ``R`` computed from the SVD of ``P2.T @ P1``.

    With ``P2.T @ P1 = U @ diag(S) @ Vh`` the result is ``Vh.T @ U.T``; it is
    meant to be applied on the right, as in ``P1 @ R``, to bring ``P1`` onto ``P2``.
    """
    cross_cov = P2.T @ P1
    left, _, right_h = np.linalg.svd(cross_cov, full_matrices=True)
    return right_h.T @ left.T

def rmse(A, B):
    """Root-mean-square of the element-wise difference between ``A`` and ``B``."""
    residual = A - B
    return np.sqrt(np.mean(np.square(residual)))

def sparse(X, A, B, G):
    """
    Recover ``Yr`` from a few rows of ``B`` and print two Gram-matrix reconstruction errors.

    A feasibility problem is solved for ``Yr`` (shape ``(r, n)``) such that, for each
    of the first ``r`` rows ``i`` and every column ``j``,
    ``(X.T @ Yr)[i, j] - (X.T @ Yr)[m-1, j] == B[i, j] - B[m-1, j]``.
    With ``Br = X.T @ Yr`` two approximations of ``G`` are formed,
    ``[[A, Br], [Br.T, Yr.T @ Yr]]`` and ``[[A, Br], [Br.T, Br.T @ pinv(A) @ Br]]``,
    and their relative Frobenius errors against ``G`` are printed in that order.

    Args:
        X: Array of shape ``(r, m)``.
        A: ``(m, m)`` top-left block used in both reconstructions.
        B: Array with ``m`` rows and ``n`` columns; only rows ``0..r-1`` and ``m-1``
            are read.
        G: Reference ``(m + n, m + n)`` matrix the reconstructions are compared to.

    Returns:
        None. The two relative errors are printed.

    Raises:
        Exception: If the solver reports the problem as infeasible or unbounded, or
            returns no value for ``Yr``; the message is the solver status.
    """
    import cvxpy  # local import: only this function needs the solver

    # Take the sizes from the arguments instead of relying on notebook globals.
    r, m = X.shape
    n = B.shape[1]

    # Mark the first r rows (never the last row, which is the reference row m-1).
    Weight = np.zeros((m, n))
    Weight[: min(r, m - 1), :] = 1

    I, J = np.nonzero(Weight)
    M = np.full(J.size, m - 1)

    Xr = X
    Yr = cvxpy.Variable(shape=(r, n))

    Xty = Xr.T @ Yr

    constraints = [Xty[I, J] - Xty[M, J] == B[I, J] - B[M, J]]

    # dummy minimization: only a feasible point is needed
    p = cvxpy.Problem(cvxpy.Minimize(0), constraints)

    p.solve()

    if p.status in ["infeasible", "unbounded"] or Yr.value is None:
        raise Exception(p.status)

    Yr = Yr.value
    Br = Xr.T @ Yr

    # Approximation of G using modified Nystrom
    G_mod1 = np.block([[A, Br], [Br.T, Yr.T @ Yr]])
    G_mod2 = np.block([[A, Br], [Br.T, Br.T @ np.linalg.pinv(A) @ Br]])

    G_norm = np.linalg.norm(G, 'fro')
    error_mod1 = np.linalg.norm(G - G_mod1, 'fro') / G_norm
    error_mod2 = np.linalg.norm(G - G_mod2, 'fro') / G_norm

    print(error_mod1)
    print(error_mod2)

In [35]:
df, df_header = read_pdb_to_dataframe('ranked_0.pdb')
# Order the atoms from highest to lowest B-factor.
df = df.sort_values(by="b_factor", axis=0, ascending=False)
# Coordinates as an (atoms, 3) array, shifted so their mean is the origin.
P = df[["x_coord", "y_coord", "z_coord"]].to_numpy()
P = P - P.mean(axis=0)
# plddt = { residue : score for residue, score in zip(df['residue_number'], df['b_factor']) }

# from scipy.spatial import distance_matrix
# D = distance_matrix(X, X)
# df[df["b_factor"] < 70]
# df.head(100)

In [85]:
# Split sizes: the last n rows of P form Y, the first m rows form X; r is the target rank.
n = 1200
m = P.shape[0] - n
r = 3

# create Gram matrix, block by block
X, Y = P[:m, :], P[m:, :]
A, B, C = X @ X.T, X @ Y.T, Y @ Y.T
G = np.block([[A, B], [B.T, C]])

# Two quarter-turns, i.e. a 180-degree rotation: rows and columns are both reversed.
G = np.rot90(G, 2)


In [86]:
# nystrom
G_nys = NystromMatrix(G, m, r)
# print(np.max(np.abs(G - G_nys)))
# print(np.sum(np.abs(np.sum(G_nys))))
D, U = np.linalg.eig(G_nys)
P_nys = np.real(U[:, :r] @ np.sqrt(np.diag(D[:r])))
# P_nys = P_nys - np.mean(P_nys, axis=0) not needed
P_nys = np.flip(P_nys)
R = procrustes(P_nys, P)
print(rmse(P, P_nys @ R))

1.7991706737579295e-13


In [7]:
# sparse
# Calls sparse() with X and A transposed; it prints two relative Frobenius errors
# against G (first for the Yr.T @ Yr completion, then for Br.T @ pinv(A) @ Br).
# NOTE(review): the Gram-matrix cell overwrites G with its 180-degree rotation while
# A and B keep their original orientation, and this cell's execution count (In [7])
# predates the cells above - confirm which G is intended when running top to bottom.
# print(X.T.shape, A.T.shape, B.T.shape)
sparse(X.T, A.T, B, G)

    Your problem is being solved with the ECOS solver by default. Starting in 
    CVXPY 1.5.0, Clarabel will be used as the default solver instead. To continue 
    using ECOS, specify the ECOS solver explicitly using the ``solver=cp.ECOS`` 
    argument to the ``problem.solve`` method.
    


4.1658773692723906e-14
7.505715349839378e-06
