In [385]:
import pandas as pd
import numpy as np
import math

def read_pdb_to_dataframe(pdb_path, model_index=1, parse_header=True):
    from biopandas.pdb import PandasPdb
    from prody import parsePDBHeader

    atomic_df = PandasPdb().read_pdb(pdb_path)
    if parse_header:
        header = parsePDBHeader(pdb_path)
    else:
        header = None
    atomic_df = atomic_df.get_model(model_index)
    if len(atomic_df.df["ATOM"]) == 0:
        raise ValueError(f"No model found for index: {model_index}")

    return atomic_df.df["ATOM"], header

def get_approximation(A, r):
    U, S, Vh = np.linalg.svd(A, full_matrices=True)
    return U[:, :r] @ np.diag(S[:r]) @ Vh[:r, :]

def NystromMatrix(K, m):
    A = K[:m, :m]
    B = K[:m, m:]
    C = K[m:, m:]

    A_tilda = get_approximation(A, m)
    C_tilda = B.T @ np.linalg.pinv(A_tilda) @ B
    return np.block([[A, B], [B.T, C_tilda]])
    
def relative_error(A, A_tilda, norm):
    eps = np.linalg.norm(A - A_tilda, ord=norm)
    return round(eps / np.linalg.norm(A, ord=norm), 6)

def procrustes(P1, P2):
    U, S, Vh = np.linalg.svd(P2.T @ P1, full_matrices=True)
    return Vh.T @ U.T

def rmse(A, B):
    assert(A.shape == B.shape)
    norms = [np.linalg.norm(A[i] - B[i]) for i in range(601)]
    return np.mean(norms)
    # return np.sqrt(np.mean(np.sum((A - B)**2, axis=0)))

def sparse(X, A, B, G):
    import cvxpy 
    Weight_B1 = np.zeros((m-1, n))
    for i in range(n):
        Weight_B1[:r, i] = 1

    Weight = np.vstack((Weight_B1, np.zeros(n)))

    I,J = np.nonzero(Weight)
    M = np.repeat(m-1, J.size)

    Xr = X
    Yr = cvxpy.Variable(shape=(r,n))

    Xty = Xr.T @ Yr

    constraints = [Xty[I, J] - Xty[M, J] == B[I, J] - B[M, J]]

    # dummy minimization
    p = cvxpy.Problem(cvxpy.Minimize(0), constraints)

    p.solve()

    if p.status in ["infeasible", "unbounded"]:
        raise Exception(p.status)

    Yr = Yr.value
    Br = Xr.T @ Yr

    # Approximation of G using modified Nystrom
    G_mod1 = np.block([[A, Br], [Br.T, Yr.T @ Yr]])
    G_mod2 = np.block([[A, Br], [Br.T, Br.T @ np.linalg.pinv(A) @ Br]])

    error2 = np.linalg.norm(G-G_mod1,'fro')/np.linalg.norm(G,'fro')
    error3 = np.linalg.norm(G-G_mod2,'fro')/np.linalg.norm(G,'fro')

    print(error2)
    print(error3)
    
def H(n):
    return np.eye(n) - np.ones((n, n)) / n


In [386]:
def sparse(X, A, B, G):
    import cvxpy 
    Weight_B1 = np.zeros((m-1, n))
    for i in range(n):
        Weight_B1[:r, i] = 1

    Weight = np.vstack((Weight_B1, np.zeros(n)))

    I,J = np.nonzero(Weight)
    M = np.repeat(m-1, J.size)

    Xr = X
    Yr = cvxpy.Variable(shape=(r,n))

    Xty = Xr.T @ Yr

    constraints = [Xty[I, J] - Xty[M, J] == B[I, J] - B[M, J]]

    # dummy minimization
    p = cvxpy.Problem(cvxpy.Minimize(0), constraints)

    p.solve()

    if p.status in ["infeasible", "unbounded"]:
        raise Exception(p.status)

    Yr = Yr.value
    Br = Xr.T @ Yr

    # Approximation of G using modified Nystrom
    G_mod1 = np.block([[A, Br], [Br.T, Yr.T @ Yr]])
    G_mod2 = np.block([[A, Br], [Br.T, Br.T @ np.linalg.pinv(A) @ Br]])

    error2 = np.linalg.norm(G-G_mod1,'fro')/np.linalg.norm(G,'fro')
    error3 = np.linalg.norm(G-G_mod2,'fro')/np.linalg.norm(G,'fro')

    print(error2)
    print(error3)

In [387]:
df, df_header = read_pdb_to_dataframe('1ubq-af.pdb')
df_true, df_true_header = read_pdb_to_dataframe('1ubq.pdb')
df_true = df_true[:-1]

df_sorted = df.groupby('residue_number').apply(lambda x: x.sort_values(by='atom_name')).reset_index(drop=True)
df_true_sorted = df_true.groupby('residue_number').apply(lambda x: x.sort_values(by='atom_name')).reset_index(drop=True)

P = df[["x_coord", "y_coord", "z_coord"]].values
P = P - np.mean(P, axis=0)
P_true = df_true[["x_coord", "y_coord", "z_coord"]].values
P_true = P_true - np.mean(P_true, axis=0)

R = procrustes(P_true, P)
# print(rmse(P, P_true @ R))
print(relative_error(P, P_true @ R, 'fro'))


0.18066


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  idxs["end_idx"] = ends.line_idx.values
  df_sorted = df.groupby('residue_number').apply(lambda x: x.sort_values(by='atom_name')).reset_index(drop=True)
  df_true_sorted = df_true.groupby('residue_number').apply(lambda x: x.sort_values(by='atom_name')).reset_index(drop=True)


In [388]:
n = 19
m, r = P.shape[0] - n, 3

G = P @ P.T
G = np.flip(G)
G_nys = NystromMatrix(G, m)
D, U = np.linalg.eig(G_nys)
P_nys = np.real(U[:, :r] @ np.sqrt(np.diag(D[:r])))
P_nys = P_nys - np.mean(P_nys, axis=0)
P_nys = np.flip(P_nys)

R = procrustes(P_nys, P_true)
# print(rmse(P_true, P_nys @ R))
print(relative_error(P_true, P_nys @ R, 'fro'))

0.179615


In [389]:
from scipy.spatial import distance_matrix
P1 = np.flip(P)
D = distance_matrix(P1, P1)
D_nys = NystromMatrix(D, m)
A = - (D_nys * D_nys) / 2
B = H(m+n) @ A @ H(m+n)
D, U = np.linalg.eig(B)
X = np.real(U[:, :r] @ np.sqrt(np.diag(D[:r])))
X = np.flip(X)
R = procrustes(X, P_true)
# print(rmse(P_true, X @ R))
print(relative_error(P_true, X @ R, 'fro'))


0.179613


In [390]:
X = P[:m, :]
Y = P[m:,:]
A = X @ X.T
B = X @ Y.T
C = Y @ Y.T
sparse(X.T, A.T, B, G)

# sparse() only works for gram matrices



1.266401767911556
1.266417813912098


    Your problem is being solved with the ECOS solver by default. Starting in 
    CVXPY 1.5.0, Clarabel will be used as the default solver instead. To continue 
    using ECOS, specify the ECOS solver explicitly using the ``solver=cp.ECOS`` 
    argument to the ``problem.solve`` method.
    


In [391]:
# TODO: sparse code for distance from abiy
# TODO: rotate via only the good residue groups?