# Imports

In [None]:
import mdtraj as md
import numpy as np
from datetime import datetime
from tqdm import tqdm
from pathlib import Path
import yaml
import plotly.express as px

# Analyze Split PDBs

In [None]:
# Input directory that is globbed below for "*active_site.pdb" files.
# NOTE(review): absolute path on one user's mount point — adjust before running elsewhere.
idir = Path("/Users/alexpayne/lilac-mount-point/asap-datasets-ap/sars_01_prepped_v3_split/")

In [None]:
# Collect every active-site PDB in the input directory.
# Sorted because glob yields paths in arbitrary, filesystem-dependent order,
# and all later positional indices (ids, lens, frame numbers) follow this order.
pdbs = sorted(idir.glob("*active_site.pdb"))

In [None]:
# Number of active-site PDB files found.
len(pdbs)

In [None]:
# Load each PDB file with mdtraj (one loaded object per file), with a progress bar.
ts = list(map(md.load, tqdm(pdbs)))

In [None]:
# File paths as an array so they can be selected with boolean masks.
# `ts` was built one-to-one from `pdbs`, so no enumerate/index round-trip is needed.
ids = np.array(pdbs)

In [None]:
# Atom count of each loaded structure, in the same order as `ts`.
lens = np.array([t.n_atoms for t in ts])

In [None]:
# Paths of the files whose loaded structure has exactly 354 atoms.
ids[lens == 354]

In [None]:
# How many structures have exactly 354 atoms.
# count_nonzero counts the True entries natively instead of iterating in Python.
np.count_nonzero(lens == 354)

In [None]:
# Paths of the files whose atom count differs from 354.
ids[lens != 354]

In [None]:
# Keep only the files with exactly 354 atoms.
has_354_atoms = np.equal(lens, 354)
same_atoms = ids[has_354_atoms]

In [None]:
# Load all selected files in a single md.load call.
same_atom_paths = same_atoms.tolist()
combined = md.load(same_atom_paths)

In [None]:
# Write the combined structures to a single PDB file in the working directory.
combined.save_pdb("527_combined.pdb")

In [None]:
# Reload the combined file, so later cells can start here without re-reading every PDB.
combined = md.load("527_combined.pdb")

## calculate RMSD

In [None]:
# All-vs-all RMSD: entry i is the result of md.rmsd for every frame of
# `combined` against reference frame i.
rmsd_array = [
    md.rmsd(combined, combined, frame=i) for i in range(combined.n_frames)
]
# Stack into a plain 2-D ndarray. np.matrix is discouraged by NumPy and changes
# indexing/flattening semantics; every later use (.T, .max(), scalar *) behaves
# the same on an ndarray.
rmsd_matrix = np.array(rmsd_array)

In [None]:
# Save the RMSD matrix to disk; the clustering section reloads this file.
np.save("527_rmsd_array.npy", rmsd_matrix)

In [None]:
# Shape of the list of per-reference-frame RMSD results.
np.shape(rmsd_array)

In [None]:
# Shape of the stacked RMSD matrix.
np.shape(rmsd_matrix)

In [None]:
# Scale every entry by 10 for display only (result is not stored).
# NOTE(review): presumably a nm -> Angstrom conversion — confirm mdtraj's RMSD units.
rmsd_matrix * 10

In [None]:
# Largest RMSD in the matrix.
rmsd_matrix.max()

In [None]:
# Smallest entry in the matrix (the diagonal self-comparisons are included).
rmsd_matrix.min()

In [None]:
# Shape of the RMSD matrix.
np.shape(rmsd_matrix)

## is matrix symmetric?

In [None]:
# Symmetry test: compare element-wise first, then reduce.
# Every |M - M.T| entry must be below the tolerance. Reducing with np.all
# before comparing would test a single boolean against the tolerance instead
# of the actual differences.
np.all(np.abs(rmsd_matrix - rmsd_matrix.T) < 0.00001)

### Yes!

## Plot RMSD Matrix

In [None]:
# One value per unordered pair of frames: take the strict upper triangle
# (k=1), which skips the diagonal self-comparisons by position. Filtering on
# non-zero values instead would also drop genuine zero RMSDs between
# identical frames.
rmsds = np.asarray(rmsd_matrix)[np.triu_indices_from(rmsd_matrix, k=1)]

In [None]:
import plotly.express as px

In [None]:
# Histogram of the pairwise RMSD values.
px.histogram(rmsds)

## RMSD Matrix to dataframe

In [None]:
import pandas as pd
# Wrap the RMSD matrix in a DataFrame; rows and columns get default integer labels.
df = pd.DataFrame(data=rmsd_matrix)

In [None]:
# Reshape the square matrix from wide to long form: one row per
# (row label, column label, value) triple.
# <https://stackoverflow.com/questions/60082349/pivot-table-to-tidy-data-frame-in-pandas>
tidydf = (
    df.rename_axis(index="V1", columns="V2")
    .reset_index()
    .melt(id_vars="V1")
)

In [None]:
# Rename the three long-form columns: row index, column index, RMSD value.
tidydf.columns = ['i','j','RMSD']

### plot using plotly

In [None]:
# Draw the matrix as a scatter plot: one point per (i, j) pair, coloured by RMSD.
fig = px.scatter(data_frame=tidydf, x="i", y="j", color="RMSD")

In [None]:
# Export the figure as a static PNG in the working directory.
# NOTE(review): plotly static export needs an image engine (e.g. kaleido) installed — confirm.
fig.write_image("test.png")

# Compute Backbone Dihedrals

In [None]:
# Backbone phi dihedrals for every frame.
# NOTE(review): per mdtraj docs this returns an (atom indices, angles) pair — confirm;
# the cells below index element 1.
phi = md.compute_phi(combined)

In [None]:
# Shape of the second element of the compute_phi result.
np.shape(phi[1])

In [None]:
# Show the second element of the compute_phi result.
phi[1]

# Cluster

In [None]:
from sklearn.decomposition import PCA
import numpy as np

In [None]:
# Reload the RMSD matrix saved earlier, so this section can run on its own.
rmsd_matrix = np.load("527_rmsd_array.npy")

## calculate PCA

In [None]:
# PCA with n_components='mle'.
# NOTE(review): per scikit-learn docs 'mle' lets the estimator choose the dimensionality — confirm.
pca = PCA(n_components='mle')

In [None]:
# Fit the PCA, treating each row of the RMSD matrix as one sample's feature vector.
pca.fit(rmsd_matrix)

In [None]:
# The constructor argument as given ('mle'); the fitted count is `n_components_`, shown further below.
pca.n_components

In [None]:
# Variance explained by each fitted component.
pca.explained_variance_

## plot with matplotlib

In [None]:
import matplotlib.pyplot as plt

# Bar chart of each component's share of the variance plus a cumulative step
# curve. The *ratio* attribute is plotted so the data matches the
# "Explained variance ratio" axis label and the cumulative curve tops out at 1.
exp_var_ratio = pca.explained_variance_ratio_
cum_sum_exp = np.cumsum(exp_var_ratio)
component_idx = range(len(exp_var_ratio))
plt.bar(component_idx, exp_var_ratio, alpha=0.5, align='center', label='Individual explained variance')
plt.step(component_idx, cum_sum_exp, where='mid', label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# Number of components actually kept by the fitted PCA.
pca.n_components_

In [None]:
# Project each row of the RMSD matrix onto the fitted principal components.
transformed = pca.transform(rmsd_matrix)

In [None]:
# Shape of the PCA projection.
np.shape(transformed)

In [None]:
# Shape of the projection restricted to its first two columns.
np.shape(transformed[:,0:2])

In [None]:
# Projection columns 0 (x) vs 1 (y).
plt.scatter(transformed[:,0], transformed[:,1])

In [None]:
# Projection columns 0 (x) vs 2 (y).
plt.scatter(transformed[:,0], transformed[:,2])

In [None]:
# Projection columns 1 (x) vs 2 (y).
plt.scatter(transformed[:,1], transformed[:,2])

## kmeans cluster

In [None]:
# Projection columns 2 (x) vs 0 (y), i.e. the 0-vs-2 view with its axes swapped.
plt.scatter(transformed[:,2], transformed[:,0])

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Ten-cluster k-means. random_state pins the centroid initialisation so the
# cluster labels (and every coloured plot below) are reproducible across runs.
# n_init is set explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)

In [None]:
# Cluster on the rows of the RMSD matrix itself (not on the PCA projection).
kmeans.fit(rmsd_matrix)

In [None]:
# Cluster label assigned to each row of the RMSD matrix.
kmeans.labels_

## plot with labels

In [None]:
# Projection columns 0 vs 1, coloured by k-means cluster label.
plt.scatter(transformed[:,0], transformed[:,1], c=kmeans.labels_)

In [None]:
# Projection columns 0 vs 2, coloured by k-means cluster label.
plt.scatter(transformed[:,0], transformed[:,2], c=kmeans.labels_)

In [None]:
# Projection columns 1 vs 2, coloured by k-means cluster label.
plt.scatter(transformed[:,1], transformed[:,2], c=kmeans.labels_)

In [None]:
# 5x5 grid of pairwise scatter plots: the panel at (i, j) shows projection
# column i against column j, coloured by cluster label.
x = 5
y = 5
fig, axs = plt.subplots(nrows=x, ncols=y)
# ndenumerate walks the axes array in row-major order, yielding each (i, j) with its panel.
for (i, j), ax in np.ndenumerate(axs):
    ax.scatter(transformed[:, i], transformed[:, j], c=kmeans.labels_)

In [None]:
# Display the subplot grid built above.
fig

## convert to pandas DataFrame

In [None]:
import pandas as pd

In [None]:
# Table of the first five projection columns; columns get integer labels 0-4.
first_five_components = transformed[:, :5]
df = pd.DataFrame(data=first_five_components)

In [None]:
# Attach each row's k-means label as a "Cluster" column.
df["Cluster"] = kmeans.labels_

In [None]:
# Display the components-plus-cluster table.
df

In [None]:
# Pairwise scatter-plot matrix of the table's columns.
# NOTE(review): the numeric "Cluster" column is presumably plotted as a variable too — confirm.
pd.plotting.scatter_matrix(df)

In [None]:
import plotly.express as px

In [None]:
# Interactive scatter-plot matrix over the five component columns (labelled 0-4), coloured by cluster.
fig = px.scatter_matrix(df, dimensions=[0,1,2,3,4], color='Cluster')

In [None]:
# Render the interactive figure.
fig.show()