<a href="https://colab.research.google.com/github/acarbn/portfolio/blob/main/DifVecvsMSF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install biopython py3Dmol

Collecting py3Dmol
  Downloading py3Dmol-2.4.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading py3Dmol-2.4.2-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: py3Dmol
Successfully installed py3Dmol-2.4.2


In [None]:
from Bio.PDB import *
from google.colab import drive
import sys
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd

# Mount Google Drive; the project helper module is imported from a folder on it.
drive.mount('/content/drive')

# Put the portfolio folder on the import path. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
utils_path = '/content/drive/MyDrive/portfolio'
if utils_path not in sys.path:
    sys.path.append(utils_path)

# Must come after the path manipulation above, otherwise the module is not found.
from utilsclass import StructureComparer, GNM

# ---- Analysis configuration -------------------------------------------------
# NOTE(review): in the PDB, 1AKE is the inhibitor-bound (Ap5A) adenylate kinase
# and 4AKE the ligand-free one, so the apo/holo labels below look swapped --
# please confirm they are intended before interpreting the results.
apoPDB   = "1AKE"
apoChain = 'A'
holoPDB  = "4AKE"
holoChain= 'A'
mode_set = list(range(1, 11))   # mode numbers analysed one at a time below
rcut_gnm = 10                   # cutoff handed to GNM (presumably in angstroms -- TODO confirm)

# ---- Parse and superimpose the two structures --------------------------------
# The "apo" entry is passed as the mobile structure, the "holo" entry as reference.
comparer = StructureComparer(
    PDBmobile=apoPDB,
    chainmobile=apoChain,
    PDBref=holoPDB,
    chainref=holoChain
)

comparer.parse_structures()
comparer.align_structures()

# ---- Difference vector (computed once, reused for every mode) ----------------
difvec = comparer.diff_vector()
if len(difvec) == 0:
    # zip(*difvec) below would otherwise fail with an opaque unpacking error.
    raise ValueError("Difference vector is empty; check the PDB IDs / chains.")

# Show a short preview only: printing every pair floods the cell output.
preview_len = 5
print(
    f"Difference vector (difvec): {len(difvec)} entries; "
    f"first {min(preview_len, len(difvec))}: {list(difvec[:preview_len])}"
)

# difvec is a sequence of (index, distance) pairs; split it into two tuples.
indices, distances = zip(*difvec)
# Optional quick look at the raw profile:
#comparer.plot_diff_vector(window=25)

# Two-column grid with one panel per mode (integer ceiling division, so an
# odd number of modes leaves the last cell of the grid empty).
rows = (len(mode_set) + 1) // 2

fig = make_subplots(
    rows=rows, cols=2,
    subplot_titles=[f"Mode {mode}" for mode in mode_set],
    # Every panel keeps its own, independent axes.
    shared_xaxes=False, shared_yaxes=False,
    horizontal_spacing=0.15, vertical_spacing=0.08,
)

# Normalize DIFVEC so that the area under the curve (unit spacing) equals 1.
dv_arr = np.asarray(distances)

# `np.trapz` is deprecated in favour of `np.trapezoid`; prefer the new name and
# fall back to the old one only on NumPy versions that do not provide it yet.
_trapezoid = getattr(np, "trapezoid", None) or np.trapz
area = _trapezoid(dv_arr, dx=1)
if area == 0:
    # Dividing by a zero area would silently fill the plot with inf/nan values.
    raise ValueError("Difference vector has zero area; cannot normalize it.")
dv_arr_normalized = dv_arr / area
# For each mode: compute its MSF profile, score it against the difference
# vector with cosine similarity, and draw both curves in the mode's panel.
cos_sims = []

# Loop-invariant quantities, computed once instead of once per mode.
dv_arr = np.asarray(distances)
dv_norm = np.linalg.norm(dv_arr)
residue_indices = None   # x-axis values, taken from the first MSF table

for idx, i in enumerate(mode_set):
    mode_no = [i]   # GNM is handed a one-element list holding the mode number
    GNMcall = GNM(apoPDB, mode_no, apoChain, rcut_gnm)
    GNMcall.eigens()
    if idx == 0:
        print(f"{apoPDB} structure is found.")

    # The result is indexed by the 'ResidueNo' and 'MSF' keys below.
    MSF = GNMcall.msf()
    print(f"MSF data for mode {i} is calculated.")
    if residue_indices is None:
        residue_indices = np.asarray(MSF['ResidueNo']).astype(int)

    # The MSF profile must line up element by element with the difference vector.
    msf_arr = np.asarray(MSF['MSF'])
    if msf_arr.shape != dv_arr.shape:
        raise ValueError(f"Shape mismatch: difvec {dv_arr.shape}, MSF {msf_arr.shape}")

    # Cosine similarity between the raw difference vector and this mode's MSF.
    cos_sim = np.dot(dv_arr, msf_arr) / (dv_norm * np.linalg.norm(msf_arr))
    cos_sims.append(cos_sim)

    # Fill the two-column grid row by row (plotly rows/cols are 1-based).
    row = idx // 2 + 1
    col = idx % 2 + 1

    # Normalized DIFVEC trace. Only the first panel contributes a legend entry;
    # the shared legendgroup makes that entry toggle the curve in every panel.
    fig.add_trace(
        go.Scatter(
            x=residue_indices,
            y=dv_arr_normalized,
            mode='lines',
            name='Normalized Difference Vector',
            legendgroup='difvec',
            line=dict(color='red'),
            showlegend=(idx == 0)
        ),
        row=row,
        col=col
    )
    # MSF trace (same legend handling as above).
    fig.add_trace(
        go.Scatter(
            x=residue_indices,
            y=msf_arr,
            mode='lines',
            name='MSF Mode',
            legendgroup='msf',
            line=dict(color='blue'),
            showlegend=(idx == 0)
        ),
        row=row,
        col=col
    )

    # Cosine similarity label, placed in the top-left corner of the panel.
    fig.add_annotation(
        text=f"Similarity: {cos_sim:.3f}",
        x=0.01,
        y=0.999,
        showarrow=False,
        font=dict(size=12),
        xref="x domain",       # coordinates relative to the subplot's own domain
        yref="y domain",
        row=row,
        col=col
    )
# Tick spacing (in residues) used on every x axis.
window = 25

# Overall figure geometry, title and a horizontal legend centred at the top.
fig.update_layout(
    title_text="MSF vs Normalized Difference Vector per Mode",
    width=900,
    height=rows * 300,   # 300 px per subplot row
    showlegend=True,
    legend={
        "orientation": "h",
        "x": 0.5,
        "y": 1,
        "xanchor": "center",
        "yanchor": "bottom",
    },
)

# Axis titles and regular ticks, applied to all subplots at once.
fig.update_xaxes(title_text="Residue Index", tickmode='linear', tick0=0, dtick=window)
fig.update_yaxes(title_text="Value")

fig.show()

# Tabulate the cosine similarity obtained for each mode and render it.
df = pd.DataFrame(dict(modeno=mode_set, cos_sim=cos_sims))
display(df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Structure exists: './pdb4ake.ent' 
Structure exists: './pdb1ake.ent' 
Structures parsed.
Ending positions of each chain:  [214]
Difference vector calculated for 214 residues.
Difference vector (difvec): [(1, 3.292536735534668), (2, 2.8630664348602295), (3, 2.1270089149475098), (4, 1.438724398612976), (5, 1.3739780187606812), (6, 1.4499626159667969), (7, 2.7076241970062256), (8, 2.9684696197509766), (9, 2.9090142250061035), (10, 2.5445144176483154), (11, 2.808861017227173), (12, 2.448740243911743), (13, 2.115360975265503), (14, 2.89892578125), (15, 3.7500696182250977), (16, 3.716200351715088), (17, 3.0370492935180664), (18, 4.933642387390137), (19, 5.2802557945251465), (20, 4.211352825164795), (21, 4.625224590301514), (22, 6.079308032989502), (23, 6.629228591918945), (24, 6.147846698760986), (25, 5.675356388092041), (26, 4.18465518951416), (27, 3.0795092582702


`trapz` is deprecated. Use `trapezoid` instead, or one of the numerical integration functions in `scipy.integrate`.



1AKE structure is found.
MSF data for mode 1 is calculated.
MSF data for mode 2 is calculated.
MSF data for mode 3 is calculated.
MSF data for mode 4 is calculated.
MSF data for mode 5 is calculated.
MSF data for mode 6 is calculated.
MSF data for mode 7 is calculated.
MSF data for mode 8 is calculated.
MSF data for mode 9 is calculated.
MSF data for mode 10 is calculated.


Unnamed: 0,modeno,cos_sim
0,1,0.722445
1,2,0.61195
2,3,0.541466
3,4,0.423809
4,5,0.487345
5,6,0.237294
6,7,0.337681
7,8,0.398821
8,9,0.371579
9,10,0.265826
