In [6]:
import pandas as pd
import numpy as np
import math

def read_pdb_to_dataframe(pdb_path, model_index=1, parse_header=True):
    """Load the ATOM records of one model from a PDB file.

    Parameters
    ----------
    pdb_path : path of the PDB file, passed to ``PandasPdb().read_pdb`` and,
        when requested, to ``prody.parsePDBHeader``.
    model_index : model selected through ``get_model`` (default 1).
    parse_header : when true the header is parsed with ProDy; otherwise the
        returned header is ``None``.

    Returns
    -------
    (atoms, header) : the ``"ATOM"`` table of the selected model and the parsed
        header (or ``None``).

    Raises
    ------
    ValueError
        If the selected model contains no ATOM records.
    """
    from biopandas.pdb import PandasPdb
    from prody import parsePDBHeader

    structure = PandasPdb().read_pdb(pdb_path)
    # The header is parsed before the model is selected, as in the original flow.
    header = parsePDBHeader(pdb_path) if parse_header else None

    selected_model = structure.get_model(model_index)
    atom_records = selected_model.df["ATOM"]
    if len(atom_records) == 0:
        raise ValueError(f"No model found for index: {model_index}")

    return atom_records, header

def get_approximation(A, r):
    """Return the truncated-SVD approximation of ``A`` keeping ``r`` singular values.

    Parameters
    ----------
    A : 2-D array of shape (M, N).
    r : number of leading singular triplets to keep. Values larger than
        ``min(M, N)`` are clamped by the slicing and yield the full
        reconstruction of ``A``.

    Returns
    -------
    ndarray of shape (M, N): ``U[:, :r] @ diag(S[:r]) @ Vh[:r, :]``.
    """
    # The reduced SVD is enough: only the leading min(M, N) singular vectors can
    # be used, and it keeps the three factors shape-compatible for any r.
    U, S, Vh = np.linalg.svd(A, full_matrices=False)
    # Scaling the columns of U by S is the same as multiplying by diag(S)
    # without materialising the diagonal matrix.
    return (U[:, :r] * S[:r]) @ Vh[:r, :]

def NystromMatrix(K, m):
    """Build a Nystrom-style approximation of ``K`` from its first ``m`` rows.

    ``K`` is split into the leading ``m x m`` block ``A`` and the off-diagonal
    block ``B = K[:m, m:]``. The trailing block is not read from ``K``; it is
    replaced by ``B.T @ pinv(A_tilda) @ B``, where ``A_tilda`` is
    ``get_approximation(A, m)``.

    Parameters
    ----------
    K : 2-D array. ``B.T`` is used as the lower-left block, so the result
        matches ``K`` there only when ``K`` is symmetric.
    m : number of leading rows/columns kept from ``K``.

    Returns
    -------
    ndarray assembled as ``[[A, B], [B.T, C_tilda]]``.
    """
    A = K[:m, :m]
    B = K[:m, m:]

    # The true trailing block K[m:, m:] is deliberately not used: it is the
    # part being approximated.
    A_tilda = get_approximation(A, m)
    C_tilda = B.T @ np.linalg.pinv(A_tilda) @ B
    return np.block([[A, B], [B.T, C_tilda]])
    
def relative_error(A, A_tilda, norm):
    """Relative error ``||A - A_tilda|| / ||A||`` rounded to six decimals.

    ``norm`` is forwarded unchanged as the ``ord`` argument of
    ``np.linalg.norm`` for both the residual and the reference norm.
    """
    residual_norm = np.linalg.norm(A - A_tilda, ord=norm)
    reference_norm = np.linalg.norm(A, ord=norm)
    ratio = residual_norm / reference_norm
    return round(ratio, 6)

def procrustes(P1, P2):
    """Orthogonal Procrustes factor computed from the SVD of ``P2.T @ P1``.

    With ``P2.T @ P1 = U @ diag(S) @ Vh`` the function returns
    ``Vh.T @ U.T``, the orthogonal matrix ``Q`` that minimises
    ``||P1 @ Q - P2||`` in the Frobenius norm. No determinant correction is
    applied, so ``Q`` may contain a reflection.
    """
    cross = P2.T @ P1
    left, _, right_h = np.linalg.svd(cross, full_matrices=True)
    return right_h.T @ left.T

def rmse(A, B):
    """Mean Euclidean distance between corresponding rows of ``A`` and ``B``.

    Despite the name, this is the mean of the per-row 2-norms of ``A - B``,
    not the square root of a mean squared error.

    Parameters
    ----------
    A, B : array-likes of identical shape; the first axis indexes the rows
        that are compared.

    Returns
    -------
    float : average over all rows of ``||A[i] - B[i]||``.

    Raises
    ------
    AssertionError
        If the shapes differ.
    ValueError
        If there are no rows to compare.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    assert A.shape == B.shape, f"shape mismatch: {A.shape} vs {B.shape}"
    if A.ndim == 0 or A.shape[0] == 0:
        raise ValueError("rmse requires at least one row to compare")

    # Cover every row instead of a fixed count of 601; each row is flattened
    # so that the per-row norm matches np.linalg.norm(A[i] - B[i]).
    row_diffs = (A - B).reshape(A.shape[0], -1)
    return np.mean(np.linalg.norm(row_diffs, axis=1))

def sparse(X, A, B, G):
    """Recover an off-diagonal block from partial entries of ``B`` and print errors.

    A cvxpy feasibility problem is solved for a variable ``Y`` of shape
    ``(r, n)`` such that, for every entry selected by the mask, the difference
    between ``(X.T @ Y)[i, j]`` and ``(X.T @ Y)[m-1, j]`` equals the same
    difference taken in ``B``. Two block approximations of ``G`` are then
    assembled and their relative Frobenius errors are printed.

    NOTE: ``m``, ``n`` and ``r`` are not parameters; they are read from the
    enclosing (module) scope and must be defined before this is called.

    Parameters
    ----------
    X : matrix whose transpose multiplies the ``(r, n)`` variable.
    A : upper-left block used in both approximations (its pseudo-inverse is
        used in the second one).
    B : matrix supplying the known entry differences for the constraints.
    G : reference matrix the approximations are compared against.

    Returns
    -------
    None. The two relative errors are written to stdout.

    Raises
    ------
    Exception
        With the solver status as message when the problem is reported as
        "infeasible" or "unbounded".
    """
    import cvxpy

    # Mask of shape (m, n): ones in the first r rows of the top (m-1) x n part,
    # with an all-zero last row appended.
    top_mask = np.zeros((m - 1, n))
    top_mask[:r, :] = 1
    mask = np.vstack((top_mask, np.zeros(n)))

    rows, cols = np.nonzero(mask)
    # Row m-1 is the reference row every selected entry is compared against.
    ref_rows = np.repeat(m - 1, cols.size)

    Y_var = cvxpy.Variable(shape=(r, n))
    product = X.T @ Y_var

    matching = (
        product[rows, cols] - product[ref_rows, cols]
        == B[rows, cols] - B[ref_rows, cols]
    )

    # Pure feasibility problem: the objective is a constant.
    problem = cvxpy.Problem(cvxpy.Minimize(0), [matching])
    problem.solve()

    if problem.status in ("infeasible", "unbounded"):
        raise Exception(problem.status)

    Y = Y_var.value
    B_rec = X.T @ Y

    # Two approximations of G sharing the recovered off-diagonal block: the
    # first closes with Y.T @ Y, the second with the Nystrom term.
    G_gram = np.block([[A, B_rec], [B_rec.T, Y.T @ Y]])
    G_nystrom = np.block([[A, B_rec], [B_rec.T, B_rec.T @ np.linalg.pinv(A) @ B_rec]])

    G_norm = np.linalg.norm(G, 'fro')
    gram_error = np.linalg.norm(G - G_gram, 'fro') / G_norm
    nystrom_error = np.linalg.norm(G - G_nystrom, 'fro') / G_norm

    print(gram_error)
    print(nystrom_error)
    
def H(n):
    """Return the ``n x n`` centering matrix ``I - (1/n) * ones((n, n))``."""
    averaging = np.ones((n, n)) / n
    return np.identity(n) - averaging


In [11]:
df, df_header = read_pdb_to_dataframe('2pyb-af.pdb')
df_true, df_true_header = read_pdb_to_dataframe('2pyb.pdb')


# Distinct atom names present in each structure. The sets are built straight
# from the column instead of looping with iterrows(): the former loop variable
# `r` overwrote the module-level `r` that sparse() reads, and a row-by-row
# loop is unnecessary for collecting unique values.
S = set(df['atom_name'])
S_true = set(df_true['atom_name'])

df_true


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  idxs["end_idx"] = ends.line_idx.values


Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx,model_id
0,ATOM,1,,N,,ASP,,A,11,,...,8.679,-36.651,1.0,40.72,,,N,,662,1
1,ATOM,2,,CA,,ASP,,A,11,,...,7.273,-36.674,1.0,42.26,,,C,,663,1
2,ATOM,3,,C,,ASP,,A,11,,...,6.553,-35.350,1.0,43.43,,,C,,664,1
3,ATOM,4,,O,,ASP,,A,11,,...,5.827,-34.812,1.0,45.87,,,O,,665,1
4,ATOM,5,,CB,,ASP,,A,11,,...,6.506,-37.870,1.0,41.55,,,C,,666,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4979,ATOM,4983,,CB,,ASP,,D,161,,...,38.851,33.843,1.0,38.94,,,C,,5644,1
4980,ATOM,4984,,CG,,ASP,,D,161,,...,37.884,34.678,1.0,38.94,,,C,,5645,1
4981,ATOM,4985,,OD1,,ASP,,D,161,,...,38.021,35.922,1.0,36.17,,,O,,5646,1
4982,ATOM,4986,,OD2,,ASP,,D,161,,...,36.974,34.101,1.0,35.66,,,O,,5647,1
