In [1]:
from PDB_Order_Fixer import PDB_Order_Fixer
import mdtraj as md
import os
import numpy as np
import h5py

import datetime
import glob
import copy
from functools import partial 
import operator
import time

import random 
import subprocess
from subprocess import Popen
import sys
from io_functions import *
from custom_clusterer import *
from custom_tica import *
from custom_featurizer import *
from pdb_editing import *
from analysis import *
from io_functions import *
#from topology_fixing import *
from subsampling import *
from conversions import *
from custom_msm import *
from grids import *

In [2]:
%matplotlib inline

import matplotlib.pyplot as plt

import matplotlib as mpl
from seaborn.apionly import set_palette
from IPython.display import set_matplotlib_formats

# configure plotting
set_matplotlib_formats('pdf', 'svg')
set_palette('Set1', n_colors=15, desat=None)



In this IPython notebook, we will featurize MOR ligand-binding simulations by pairwise distances between the ligand and different receptor residues. We will then perform tICA and prospectively build an MSM.

In [3]:
from detect_intermediates import *
from interpret_tICs import *

we are operating on biox3


In [4]:
from mor_h8_feature_types import *
from get_variable_names import *
from mor_ligand_atom_residue_tica_config import *
from residue import Residue, Atom
ori_feature_name = copy.deepcopy(feature_name)

tm6_tm3_residues
[R279, R165]
[65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 273, 274, 275, 276, 277, 278,

In [5]:
# `rho` is forwarded to fit_and_transform(..., rho = rho); `rho_string` is the
# same value spelled for use in directory names (it is passed to get_tica_dir).
# Keep the two in sync when changing either one.
rho = 0.0025
rho_string = "0pt0025"

In [6]:
schemes = ["closest-heavy", "CA"]
feature_name = "%s-CA" %ori_feature_name
(active_ref_dir, inactive_ref_dir, simulation_ref_dir, scripts_dir,
          ligand_dir, agonist_dir, inverse_agonist_dir, biased_agonist_dir, ref_receptors_dir, whole_trajectory_pnas,
          sasa_file) = get_base_files(base)

tica_dir = get_tica_dir(base, is_sparse, lag_time, n_components, feature_name, 
                                 wolf_string, shrinkage_string, rho_string)
ori_tica_dir = copy.deepcopy(tica_dir)
tica_dir = "%s-normalized" % ori_tica_dir
features_dir = get_features_dir(base, feature_name)

landmarks_dir = get_landmarks_dir(tica_dir)
analysis_dir = get_analysis_dir(tica_dir, n_clusters, sampling_method)
gmm_dir = get_gmm_dir(tica_dir)
rf_dir = get_rf_dir(tica_dir)


ref_tica_dir, ref_tica_coords = get_ref_tica_dirs(tica_dir)

graph_file = get_graph_file(tica_dir, msm_lag_time, n_clusters)

pnas_titles =  ["tm6_tm3_dist", "rmsd_npxxy_inactive", "rmsd_npxxy_active", "rmsd_connector_inactive", "rmsd_connector_active"]
pnas_features_dir = analysis_dir


(clusterer_dir, msm_model_dir, macrostate_dir, features_known, model_dir, projected_features_dir,
         projection_operator_dir, ktica_fit_model_filename, ktica_projected_data_filename, nystroem_data_filename,
         mutual_information_csv, pearson_csv) = get_tica_files(base, tica_dir, n_clusters, msm_lag_time, n_macrostates)

(standardized_features_dir, feature_residues_csv, feature_residues_pkl,
          contact_csv, ref_features_dir) = get_feature_files(features_dir)

(kmeans_csv, tica_coords_csv, features_csv, active_rmsd_dir, inactive_rmsd_dir, active_pnas_dir, inactive_pnas_joined, active_pnas_joined,
        clusters_map_file, ktica_clusters_map_file, analysis_file, combined_file, docking_summary, docking_joined, docking_z_scores_csv,
        aggregate_docking, aggregate_docking_joined, docking_pnas_joined, aggregate_docking_pnas, aggregate_docking_pnas_joined, docking_multiple_ligands,
        docking_distances_file, docking_pdf, mmgbsa_docking_distances, pnas_coords, mmgbsa_dir, mmgbsa_csv, mmgbsa_pdf, aggregate_mmgbsa,
        aggregate_mmgbsa_joined, aggregate_mmgbsa_pnas_joined, mmgbsa_z_scores_csv, active_clusters_csv, intermediate_clusters_csv,
        inactive_clusters_csv, pnas_clusters_averages, tica_clusters_averages, tica_classes_csv, tica_samples_csv, subgraph_save_base,
        degree_save_base, degree_map_csv, degree_z_map_csv, aggregate_docking_pnas_degree_z_joined, tic_residue_csv, feature_coefs_csv,
        duplicated_feature_coefs_csv) = get_analysis_files(analysis_dir, n_clusters, tica_dir, tica_dir, sampling_method, n_samples, precision,
                                                           msm_lag_time)

(inactive_pnas_distances_dir, active_pnas_distances_dir, active_pnas_all_distances_dir,
          inactive_pnas_distances_new_csv, active_pnas_distances_new_csv, active_pnas_joined, active_pnas_means, pnas_coords_dir,
          pnas_coords_csv, pnas_all_coords_csv, pnas_coords_hexbin_dir, pnas_coords_co_crystallized_docking_dir,
          pnas_coords_active_colors_dir, user_defined_features_file, reaction_coordinates_trajs_file) = get_pnas_files(whole_trajectory_pnas, pnas_features_dir)

features_dir = get_features_dir(base, feature_name)



graph_file = get_graph_file(tica_dir, msm_lag_time, n_clusters)
(scripts_dir, pymol_fixpdb_dir) = get_script_dir(scripts_dir)
(save_dir, reimaged_dir, mae_dir, combined_reimaged_dir, grid_dir, docking_dir) = get_docking_dirs(tica_dir, n_clusters, n_components, n_samples, sampling_method, precision)


/home/enf/md_simulations/MOR/ligand_binding/h5_trajectories/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA
/home/enf/md_simulations/MOR/ligand_binding/h5_trajectories/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA


In [7]:
# Show where the residue-pair pickle lives. The call form of print works on
# both Python 2 and Python 3 and matches the other cells in this notebook.
print(feature_residues_pkl)

/home/enf/md_simulations/MOR/ligand_binding/h5_trajectories/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/feature_residues.pkl


In [8]:
contact_residues

[ILE198,
 ILE69,
 ARG211,
 ASN127,
 ILE193,
 ASN332,
 ARG345,
 THR132,
 VAL94,
 VAL92,
 PRO134,
 THR220,
 GLY85,
 ARG179,
 HIS297,
 GLY82,
 GLU229,
 ILE215,
 THR67,
 MET99,
 LEU116,
 LEU112,
 ALA113,
 LEU110,
 ALA323,
 CYS159,
 LEU194,
 LEU88,
 ILE322,
 LEU83,
 ASN342,
 VAL245,
 LEU219,
 GLN314,
 LYS185,
 ALA287,
 SER125,
 ALA197,
 ASN328,
 ARG182,
 ASN150,
 VAL187,
 TYR210,
 VAL80,
 VAL81,
 ALA115,
 ILE107,
 VAL89,
 ILE105,
 TYR166,
 TYR299,
 THR120,
 THR70,
 ALA117,
 MET130,
 CYS79,
 VAL334,
 GLN124,
 SER329,
 LYS174,
 ALA111,
 GLU341,
 ILE298,
 ASN230,
 PRO244,
 VAL236,
 ILE296,
 PRO295,
 GLY136,
 GLY199,
 PHE123,
 SER154,
 ALA184,
 CYS170,
 LEU74,
 PRO172,
 MET65,
 PRO224,
 PRO122,
 VAL78,
 VAL282,
 THR153,
 THR327,
 VAL163,
 THR157,
 LEU139,
 ASP216,
 SER317,
 PHE204,
 LYS100,
 THR225,
 PHE152,
 LEU335,
 ASP114,
 PHE156,
 PRO201,
 LEU331,
 MET90,
 VAL189,
 HIS171,
 TYR96,
 LEU232,
 LEU231,
 LEU339,
 PHE135,
 SER145,
 PRO309,
 TYR148,
 ALA175,
 CYS292,
 TYR149,
 THR180,
 CYS217,
 I

In [None]:
# Connect to an ipyparallel cluster and build a view over every engine.
from ipyparallel import Client
rc = Client()
# Number of engines that answered.
print(len(rc.ids))
dview = rc[:]
# Send one os.chdir call per engine so the workers run from the analysis code
# directory.
# NOTE(review): the path is hard-coded to one user's checkout — consider
# moving it to a configurable constant.
dview.map(os.chdir, ['/home/enf/b2ar_analysis/conformation']*len(rc.ids))

In [None]:
featurize_contacts_custom(traj_dir, features_dir = features_dir, traj_ext = traj_ext, contact_residue_pairs_file = feature_residues_pkl, structures=[inactive_dir, active_dir], contact_residues=contact_residues,
                          residues_map = None, contact_cutoff = cutoff, parallel = True, exacycle = exacycle, traj_top_structure = None, iterative=False,
                          user_specified_atom_residue_pairs = [], load_from_file=False, worker_pool=dview, schemes=schemes)

In [7]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# feature_residues_pkl must only ever point at a file this pipeline wrote.
with open(feature_residues_pkl, "rb") as f:
    # feature_residues is consumed later by find_non_zero_features(...).
    feature_residues = pickle.load(f)

In [None]:
fit_and_transform(features_directory = features_dir, model_dir = tica_dir, stride=5, lag_time = lag_time, n_components = n_components, sparse = sparse, wolf = wolf, rho = rho, shrinkage = shrinkage, parallel=True, traj_ext = traj_ext, normalize=True)

In [None]:
featurize_contacts_custom(ref_receptors_dir, features_dir = ref_features_dir, traj_ext = ".pdb", contact_residue_pairs_file = feature_residues_pkl, structures=[inactive_dir, active_dir], contact_residues=contact_residues,
                          residues_map = None, contact_cutoff = cutoff, parallel = True, exacycle = exacycle, traj_top_structure = None, iterative=False,
                          user_specified_atom_residue_pairs = [], load_from_file=False, worker_pool=None, schemes=schemes)

In [None]:
transform(existing_model = projection_operator_dir, features_directory = ref_features_dir, tica_dir = ref_tica_dir)


In [None]:
tica_object = verboseload(projection_operator_dir)

In [None]:
tica_object.timescales_

In [None]:
plot_columns(tica_dir, projected_features_dir, titles = ["tIC%d" %j for j in range(1,11)], tICA = True, scale = 1.0, refcoords_file = None)

In [8]:
tica_coords = verboseload(projected_features_dir)
pnas_coords = verboseload(user_defined_features_file)
tica_names = ["tIC.%d" %i for i in range(1,n_components+1)]
pnas_names = sorted(feature_name_residues_dict.keys())

loading "/home/enf/md_simulations/MOR/h8_reimaged/sparse-tICA_t5_n_components10all_residues_4dkl_5c1m_under_cutoff6A-CA_regularization_wolf_autoShrinkage0pt0025-normalized/phi_psi_chi2_allprot_projected.h5"...
loading "/home/enf/md_simulations/MOR/h8_reimaged/all_pnas_features/user_defined_features.h5"...


In [9]:
from plots import *

In [None]:
plot_data_vs_data(np.concatenate(tica_coords), np.concatenate(pnas_coords), tica_names, pnas_names, analysis_dir)

In [None]:
tica_coords = verboseload(projected_features_dir)
np.shape(np.concatenate(tica_coords))

In [None]:
f = load_file("/home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/apo_rep_167.dataset")

In [None]:
np.shape(f)

In [10]:
subsampled_features_dir = os.path.join(tica_dir, "subsampled_features")
if not os.path.exists(subsampled_features_dir): os.makedirs(subsampled_features_dir)

In [13]:
tic_components_dir = tica_dir
important_contact_features = interpret_tIC_components(projection_operator_dir, tic_components_dir, feature_residues_pkl, n_tica_components=n_components, percentile=95)


loading "/home/enf/md_simulations/MOR/h8_reimaged/sparse-tICA_t5_n_components10all_residues_4dkl_5c1m_under_cutoff6A-CA_regularization_wolf_autoShrinkage0pt0025-normalized/phi_psi_chi2_allprot_tica_coords.h5"...
Interpreting tIC 1
feature_importances_df.shape
(10, 7)
residue_importances_df.shape
(14, 3)
          feature_name      res_i      res_j  resid_i  resid_j  importance  \
0         Leu88-Phe338      Leu88     Phe338       88      338    0.230825   
8  Leu335_Ca-Phe338_Ca  Leu335_Ca  Phe338_Ca      335      338   -0.150707   
2         Asn86-Phe338      Asn86     Phe338       86      338    0.131836   
3        Val286-Tyr336     Val286     Tyr336      286      336    0.130005   
5  Val334_Ca-Phe338_Ca  Val334_Ca  Phe338_Ca      334      338    0.122108   
9  Leu339_Ca-Tyr336_Ca  Leu339_Ca  Tyr336_Ca      339      336   -0.120023   
7  Leu335_Ca-Leu339_Ca  Leu335_Ca  Leu339_Ca      335      339    0.103351   
1        Val282-Gly253     Val282     Gly253      282      253    0.073

In [14]:
# File that subsample_features(...) is given as its output path.
tic_subsampled_features_file = "%s/features_subsampled.pkl" % tica_dir
# The call form of print works on both Python 2 and Python 3.
print(tic_subsampled_features_file)
# Make sure the directory for subsampled features exists under the tICA directory.
subsampled_features_dir = os.path.join(tica_dir, "subsampled_features")
if not os.path.exists(subsampled_features_dir): os.makedirs(subsampled_features_dir)

/home/enf/md_simulations/MOR/h8_reimaged/sparse-tICA_t5_n_components10all_residues_4dkl_5c1m_under_cutoff6A-CA_regularization_wolf_autoShrinkage0pt0025-normalized/features_subsampled.pkl


In [24]:
# `reload` is a builtin only on Python 2. Importing it from `imp`, as the
# later analysis cells do, keeps this cell runnable on Python 3 as well.
from imp import reload
import interpret_tICs
reload(interpret_tICs)
from interpret_tICs import *
# Keep only the features with non-zero tIC weight, then write those columns of
# every feature file in features_dir to tic_subsampled_features_file.
# NOTE(review): the description above is inferred from the helper names and
# arguments — confirm against interpret_tICs.
important_contact_features_pruned, important_contact_features_indices = find_non_zero_features(important_contact_features, feature_residues)
subsample_features(features_dir, important_contact_features_indices, important_contact_features_pruned, tic_subsampled_features_file)

loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/apo_rep_1.dataset
loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/apo_rep_113.dataset
loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/apo_rep_128.dataset
loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/apo_rep_142.dataset
loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/apo_rep_157.dataset
loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/apo_rep_171.dataset
loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/apo_rep_20.dataset
loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/apo_rep_186.dataset
loading /home/enf/md_simulations/MOR/h8_reimaged/featuresal

In [22]:
protein_top_features = load_file(tic_subsampled_features_file)



loading /home/enf/md_simulations/MOR/h8_reimaged/sparse-tICA_t5_n_components10all_residues_4dkl_5c1m_under_cutoff6A-CA_regularization_wolf_autoShrinkage0pt0025-normalized/features_subsampled.pkl


In [23]:
protein_top_features[0].iloc[0]

(ASN104_CA, THR97_CA)     0.863004
(ASN230, PHE221)          0.587581
(MET99_CA, CYS346_CA)     1.140776
(ASN86_CA, PRO333_CA)     0.597180
(VAL92_CA, LYS98_CA)      0.893472
(CYS79, TYR326)           0.628605
(VAL89_CA, PRO333_CA)     0.646526
(LYS98, ASN104)           0.516290
(ALA111_CA, PRO333_CA)    0.719197
(LEU339, LEU283)          0.960132
(THR97_CA, ARG95_CA)      0.670106
(LEU339, VAL284)          0.470025
(ASN86_CA, TYR336_CA)     1.097187
(ILE290, ILE248)          0.490037
(THR279_CA, MET281_CA)    0.518917
(LEU110, PHE289)          0.802814
(GLU341_CA, ARG280_CA)    1.401732
(SER154, ASP114)          0.264021
(THR67, TYR128)           0.673450
(ILE322, TYR128)          0.776730
(VAL187, PHE156)          0.680717
(LEU158_CA, VAL286_CA)    1.092381
(TYR128_CA, TYR75_CA)     1.049598
(LEU88, PHE343)           0.511591
(VAL81_CA, SER329_CA)     0.923446
(TRP293_CA, GLY325_CA)    0.633715
(ASN328_CA, TRP293_CA)    0.819919
(LEU339_CA, ARG280_CA)    1.172172
(MET99, PHE108)     

In [None]:
f = load_file("/home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/apo_rep_1.dataset")
np.shape(f)

In [None]:
# Reconnect to the ipyparallel cluster (same setup as the earlier parallel cell).
from ipyparallel import Client
rc = Client()
dview = rc[:]
# One os.chdir call per engine.
# NOTE(review): hard-coded, user-specific path.
dview.map(os.chdir, ['/home/enf/b2ar_analysis/conformation']*len(rc.ids))

# Number of engines that answered.
print(len(rc.ids))


In [23]:
# Record, for each condition, the positions of its trajectories in the full
# listing. A name containing "bu72" always counts as BU72; "apo" is only
# consulted otherwise. Names matching neither are ignored.
bu72_trajs, apo_trajs = [], []
for i, traj in enumerate(get_trajectory_files(traj_dir, traj_ext)):
    if "bu72" in traj:
        bucket = bu72_trajs
    elif "apo" in traj:
        bucket = apo_trajs
    else:
        continue
    bucket.append(i)

user_defined_coords = load_file(user_defined_features_file)

            

loading /home/enf/md_simulations/MOR/h8_reimaged/all_pnas_features/user_defined_features.h5
loading "/home/enf/md_simulations/MOR/h8_reimaged/all_pnas_features/user_defined_features.h5"...












loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_rep_143.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_rep_114.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_reimaged_rep_215.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_reimaged_rep_23.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_reimaged_rep_244.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_reimaged_rep_71.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/features

In [24]:
bu72_pnas = [user_defined_coords[i] for i in bu72_trajs]
apo_pnas = [user_defined_coords[i] for i in apo_trajs]






loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_rep_22.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_rep_177.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_rep_191.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_rep_249.dataset

loading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_rep_32.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_rep_76.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_rep_61.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/featuresall_residues_4dkl_5c1m_under_cutoff6A-CA/bu72_rep_234.datasetloading /home/enf/md_simulations/MOR/h8_reimaged/feat

In [27]:
bu72_pnas_file = "%s/bu72.h5" % whole_trajectory_pnas
apo_pnas_file = "%s/apo.h5" % whole_trajectory_pnas

In [28]:
print(bu72_pnas_file)

/home/enf/md_simulations/MOR/h8_reimaged/all_pnas_features/bu72.h5


In [None]:
from imp import reload
import analysis
reload(analysis)
from analysis import *
import seaborn as sns
bu72_pnas_concatenated = np.concatenate(bu72_pnas)
print(np.shape(bu72_pnas_concatenated))
apo_pnas_concatenated = np.concatenate(apo_pnas)

verbosedump(bu72_pnas, bu72_pnas_file)
verbosedump(apo_pnas, apo_pnas_file)

columns=sorted(feature_name_residues_dict.keys())
#plot_columns(whole_trajectory_pnas, bu72_pnas_file, titles=columns, main = "BU72", tICA = False, scale = 1.0, refcoords_file = None)
#plot_columns(whole_trajectory_pnas, apo_pnas_file, titles=columns, main = "Apo", tICA = False, scale = 1.0, refcoords_file = None)

#df = pd.DataFrame(bu72_pnas_concatenated, columns=sorted(feature_name_residues_dict.keys()))
#sns.jointplot(x="tm6_tm3_dist", y="rmsd_npxxy_inactive", data=df, kind="kde")
#plt.hexbin(bu72_pnas_concatenated[:,0], bu72_pnas_concatenated[:,2], cmap=plt.cm.YlOrRd_r)

In [29]:
bu72_tica_file = "%s/bu72_tica.h5" % tica_dir
apo_tica_file = "%s/apo_tica.h5" % tica_dir

In [30]:
bu72_tica_file

'/home/enf/md_simulations/MOR/h8_reimaged/sparse-tICA_t5_n_components10all_residues_4dkl_5c1m_under_cutoff6A-CA_regularization_wolf_autoShrinkage0pt0025-normalized/bu72_tica.h5'

In [None]:
tica_coords = verboseload(projected_features_dir)
bu72_tica = [tica_coords[i] for i in bu72_trajs]
apo_tica = [tica_coords[i] for i in apo_trajs]

verbosedump(bu72_tica, bu72_tica_file)
verbosedump(apo_tica, apo_tica_file)

columns = ["tIC%d" %i for i in range(1,n_components+1)]
#plot_columns(tica_dir, bu72_tica_file, titles=columns, main = "BU72", tICA = False, scale = 1.0, refcoords_file = None)
#plot_columns(tica_dir, apo_tica_file, titles=columns, main = "Apo", tICA = False, scale = 1.0, refcoords_file = None)




In [None]:
# NOTE(review): this rebinds n_clusters after the path-setup cell already used
# an n_clusters value to derive analysis_dir, graph_file, etc. — confirm those
# paths are meant to keep the earlier value.
n_clusters = 1000
# Clusterer output file, named after the cluster count.
clusterer_dir = "%s/clusterer_%dclusters.h5" %(tica_dir, n_clusters)
cluster_minikmeans(tica_dir, projected_features_dir, traj_dir, n_clusters=n_clusters, clusterer_dir=clusterer_dir)

In [None]:
bu72_clusterer_file = "%s/bu72_clusterer%d.h5" %(tica_dir, n_clusters)
apo_clusterer_file = "%s/apo_clusterer%d.h5" %(tica_dir, n_clusters)

In [None]:
# Load the clusterer fitted on all trajectories and derive one copy per
# condition whose labels_ holds only that condition's trajectories.
clusterer = verboseload(clusterer_dir)
cluster_labels = clusterer.labels_
# bu72_trajs / apo_trajs are positions in the full trajectory listing.
# assumes labels_ is ordered like that listing — TODO confirm
bu72_clusters = [cluster_labels[i] for i in bu72_trajs]
apo_clusters = [cluster_labels[i] for i in apo_trajs]
# deepcopy so that replacing labels_ does not alter the shared clusterer
bu72_clusterer = copy.deepcopy(clusterer)
bu72_clusterer.labels_ = bu72_clusters
verbosedump(bu72_clusterer, bu72_clusterer_file)

apo_clusterer = copy.deepcopy(clusterer)
apo_clusterer.labels_ = apo_clusters
verbosedump(apo_clusterer, apo_clusterer_file)


In [None]:
bu72_clusterer_file

In [None]:
import custom_msm
reload(custom_msm)
from custom_msm import *
from msm_resampled import *

In [None]:
apo_clusters_map = make_clusters_map(apo_clusterer)

In [None]:
bu72_clusters_map = make_clusters_map(bu72_clusterer)

In [None]:
plot_timescales(bu72_clusterer_file, n_clusters, tica_dir, main="", lag_times=list(range(1,51,5)))

In [None]:
lag_time=20
prior_counts = 0.0
apo_msm_dir = os.path.join(tica_dir, "apo_msm_lag-time%d_prior-counts%s_clusters%d.h5" %(lag_time, str(prior_counts), n_clusters))
apo_msm = build_msm(apo_clusterer_file, lag_time, apo_msm_dir, prior_counts)

In [None]:
apo_msm.timescales_

In [None]:
import msm_resampled
reload(msm_resampled)
from msm_resampled import *
total_samples = 1000000
num_trajs = len(apo_trajs)
apo_resampled_traj_to_frames_file = os.path.join(tica_dir, "apo_msm_lag-time%d_prior-counts%s_clusters%d_resampled.h5" %(lag_time, str(prior_counts), n_clusters))
apo_resampled_traj_to_frames = resample_by_msm(total_samples, apo_msm, apo_clusters_map, num_trajs, apo_resampled_traj_to_frames_file)

In [None]:
apo_pnas = verboseload(apo_pnas_file)
apo_pnas_resampled_file = os.path.join(tica_dir, "apo_pnas_msm_lag-time%d_prior-counts%s_clusters%d_resampled.h5" %(lag_time, str(prior_counts), n_clusters))
resample_features_by_msm_equilibirum_pop(apo_pnas, apo_resampled_traj_to_frames, apo_pnas_resampled_file)
apo_pnas_resampled = verboseload(apo_pnas_resampled_file)

In [None]:
# Global (min, max) of every tIC column over all frames, used as shared axis
# limits for the tICA plots.
tica_concatenated = np.concatenate(verboseload(projected_features_dir))
n_tica_columns = np.shape(tica_concatenated)[1]
tica_axes = []
for i in range(n_tica_columns):
    column = tica_concatenated[:, i]
    tica_axes.append((np.min(column), np.max(column)))

In [None]:
import analysis
reload(analysis)
from analysis import *

In [None]:
# Path of the MSM-reweighted apo tICA coordinates. It has to be defined here:
# this cell and the cluster-overlay plot further down both read it, and no
# other cell assigns it.
apo_tica_resampled_file = os.path.join(tica_dir, "apo_tica_msm_lag-time%d_prior-counts%s_clusters%d_resampled.h5" %(lag_time, str(prior_counts), n_clusters))
# Regenerate the resampled coordinates only when no cached file exists, so a
# fresh-kernel run works without redoing the work on every execution.
if not os.path.exists(apo_tica_resampled_file):
    apo_tica = verboseload(apo_tica_file)
    resample_features_by_msm_equilibirum_pop(apo_tica, apo_resampled_traj_to_frames, apo_tica_resampled_file)
plot_columns(tica_dir, apo_tica_resampled_file, titles = ["tIC.%d" %i for i in range(1,n_components+1)], main = "mOR_Apo_MSM_", tICA = False, scale = 1.0, refcoords_file = ref_tica_coords, axes=tica_axes, concatenate=False, reshape=False)

In [None]:
import analysis
reload(analysis)
from analysis import *

from operator import itemgetter
apo_clusterer = verboseload(apo_clusterer_file)
apo_msm = verboseload(apo_msm_dir)

# Pair every cluster id known to the apo MSM with its equilibrium population,
# then order the pairs from most to least populated.
equilibrium_populations = apo_msm.populations_

cluster_id_proportion_tuples = []
for cluster_id, state_id in apo_msm.mapping_.items():
    # mapping_ translates a cluster id into the MSM's internal state index
    cluster_id_proportion_tuples.append((cluster_id, equilibrium_populations[state_id]))
cluster_id_proportion_tuples.sort(key=itemgetter(1), reverse=True)
    
plot_all_tics_and_clusters(tica_dir, apo_tica_resampled_file, apo_clusterer_file, lag_time, tic_range=[0,3], main = "Apo_MSM_Reweighted", label = "cluster_id", active_cluster_ids = [c[0] for c in cluster_id_proportion_tuples[0:100]], intermediate_cluster_ids = [], inactive_cluster_ids = [], concatenate=False, axes=tica_axes)

In [None]:
plot_columns(tica_dir, apo_pnas_resampled_file, titles = sorted(feature_name_residues_dict.keys()), main = "mOR_Apo_MSM_", tICA = False, scale = 1.0, refcoords_file = None, axes=None, concatenate=False, reshape=True)

In [None]:
lag_time=20
prior_counts = 0.0
bu72_msm_dir = os.path.join(tica_dir, "bu72_msm_lag-time%d_prior-counts%s_clusters%d.h5" %(lag_time, str(prior_counts), n_clusters))
# The resampling cell below needs `bu72_msm`. Reuse the saved model when it
# exists and build it otherwise, so the name is always bound on a fresh kernel.
if os.path.exists(bu72_msm_dir):
    bu72_msm = verboseload(bu72_msm_dir)
else:
    bu72_msm = build_msm(bu72_clusterer_file, lag_time, bu72_msm_dir, prior_counts)


In [None]:
bu72_msm_dir

In [None]:
total_samples = 1000000
num_trajs = len(bu72_trajs)
bu72_resampled_traj_to_frames_file = os.path.join(tica_dir, "bu72_msm_lag-time%d_prior-counts%s_clusters%d_resampled.h5" %(lag_time, str(prior_counts), n_clusters))
bu72_resampled_traj_to_frames = resample_by_msm(total_samples, bu72_msm, bu72_clusters_map, num_trajs, bu72_resampled_traj_to_frames_file)

In [None]:
bu72_resampled_traj_to_frames_file

In [None]:
#bu72_pnas = verboseload(bu72_pnas_file)
bu72_pnas_resampled_file = os.path.join(tica_dir, "bu72_pnas_msm_lag-time%d_prior-counts%s_clusters%d_resampled.h5" %(lag_time, str(prior_counts), n_clusters))
#resample_features_by_msm_equilibirum_pop(bu72_pnas, bu72_resampled_traj_to_frames, bu72_pnas_resampled_file)
bu72_pnas_resampled = verboseload(bu72_pnas_resampled_file)

In [None]:
bu72_pnas_file

In [None]:
analysis_dir

In [None]:
import jointplot_d3
reload(jointplot_d3)
from jointplot_d3 import *
jointplots(bu72_pnas_resampled[::100,:], analysis_dir, titles = sorted(feature_name_residues_dict.keys()), main = "mOR_BU72_MSM_", refcoords_file = None, axes=None, reshape=True)

In [None]:
plot_columns(tica_dir, bu72_pnas_resampled_file, titles = sorted(feature_name_residues_dict.keys()), main = "mOR_BU72_MSM_", tICA = False, scale = 1.0, refcoords_file = None, axes=None, concatenate=False, reshape=True)

In [None]:
# Path of the MSM-reweighted BU72 tICA coordinates. It has to be defined here:
# this cell and the cluster-overlay plot that follows both read it, and no
# other cell assigns it.
bu72_tica_resampled_file = os.path.join(tica_dir, "bu72_tica_msm_lag-time%d_prior-counts%s_clusters%d_resampled.h5" %(lag_time, str(prior_counts), n_clusters))
# Regenerate the resampled coordinates only when no cached file exists.
if not os.path.exists(bu72_tica_resampled_file):
    bu72_tica = verboseload(bu72_tica_file)
    resample_features_by_msm_equilibirum_pop(bu72_tica, bu72_resampled_traj_to_frames, bu72_tica_resampled_file)
plot_columns(tica_dir, bu72_tica_resampled_file, titles = ["tIC.%d" %i for i in range(1,n_components+1)], main = "mOR_BU72_MSM_", tICA = False, scale = 1.0, refcoords_file = ref_tica_coords, axes=tica_axes, concatenate=False, reshape=True)

In [None]:
bu72_clusterer = verboseload(bu72_clusterer_file)
bu72_msm = verboseload(bu72_msm_dir)

# Pair every cluster id known to the BU72 MSM with its equilibrium population,
# then order the pairs from most to least populated.
equilibrium_populations = bu72_msm.populations_

cluster_id_proportion_tuples = []
for cluster_id, state_id in bu72_msm.mapping_.items():
    # mapping_ translates a cluster id into the MSM's internal state index
    cluster_id_proportion_tuples.append((cluster_id, equilibrium_populations[state_id]))
cluster_id_proportion_tuples.sort(key=itemgetter(1), reverse=True)

plot_all_tics_and_clusters(tica_dir, bu72_tica_resampled_file, bu72_clusterer_file, lag_time, tic_range=[0,3], main = "BU72_MSM_Reweighted", label = "cluster_id", active_cluster_ids = [c[0] for c in cluster_id_proportion_tuples[0:100]], intermediate_cluster_ids = [], inactive_cluster_ids = [], concatenate=False, axes=tica_axes)


In [None]:
bu72_tica_resampled_file

In [None]:
feature_name_residues_dict.keys()

In [None]:

np.shape(apo_pnas_resampled)
msm_dir = os.path.join(tica_dir, "msm_lag-time%d_prior-counts%s_clusters%d_analysis" %(lag_time, str(prior_counts), n_clusters))
if not os.path.exists(msm_dir): os.makedirs(msm_dir)

#plot_columns(msm_dir, apo_pnas_resampled_file, titles = feature_name_residues_dict.keys(), main = "Apo-mOR", tICA = False, scale = 1.0, refcoords_file = None, axes=None, concatenate=False)

In [None]:
import pip

def install(package):
    """Install ``package`` into the environment of the running kernel.

    Runs ``<sys.executable> -m pip install <package>`` in a subprocess.
    ``pip.main`` is not a supported programmatic API and was removed in
    pip 10; invoking pip through the current interpreter is the documented
    replacement, and it targets the same environment the kernel uses.

    Args:
        package: requirement specifier handed to ``pip install``
            (for example ``"moviepy"`` or ``"moviepy==1.0.3"``).

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    
install("moviepy")

In [None]:
import jointplot_d3
reload(jointplot_d3)
from jointplot_d3 import *

# Locate apo replicate 68 in the full trajectory listing, then translate that
# position into an index into the apo-only lists. apo_pnas holds one entry per
# element of apo_trajs, not one per trajectory file, so the raw position would
# select the wrong trajectory whenever a non-apo run precedes it in the listing.
apo_68_matches = [i for i, traj in enumerate(get_trajectory_files(traj_dir, traj_ext))
                  if "apo_rep_68." in traj]
if not apo_68_matches:
    # Fail loudly instead of reusing a stale `index` from an earlier run.
    raise ValueError("no trajectory matching 'apo_rep_68.' found in %s" % traj_dir)
index = apo_trajs.index(apo_68_matches[0])

jointplot(apo_pnas_resampled[::10,[6,3]], "%s/test.pdf" %msm_dir, trajectory=apo_pnas[index][::1,[6,3]], 
          titles=["TM6 to TM3 distance", "RMSD of NPxxY to Inactive"], main = "mOR Activation Pathway", 
          video_file = "%s/apo_68.mp4" %msm_dir, custom_xlim=[4.0, 16.0], custom_ylim= [0.0, 1.0])
#test("%s/test2.mp4" %msm_dir)

In [None]:
# build_msm appears to persist the model at its third argument (earlier cells
# verboseload the same path they passed in). Writing both models to
# msm_model_dir would therefore leave only the apo model on disk, so each
# condition is given its own file. prior_counts is passed explicitly so the
# model matches the prior-counts value encoded in those file names.
build_msm(bu72_clusterer_file, lag_time, bu72_msm_dir, prior_counts)
build_msm(apo_clusterer_file, lag_time, apo_msm_dir, prior_counts)

In [None]:
# Save a two-column copy of the tICA projection (columns 0 and 2, i.e. the
# first and third tICs) for the closest-to-centre lookup in the next cell.
tics_1_3_file = os.path.join(tica_dir, "tICs_1_3.h5")
tics = load_file(projected_features_dir)
tics_1_3 = [x[:,(0,2)] for x in tics]
verbosedump(tics_1_3, tics_1_3_file)

# Fit a 16-cluster model restricted to the selected tICs.
# NOTE(review): the slice above is 0-based (0, 2) while tICs=[1,3] is passed
# here — confirm cluster_minikmeans treats `tICs` as 1-based, otherwise the
# clusterer and tics_1_3 describe different coordinates.
clusterer16_clusters = os.path.join(tica_dir, "clusterer16_clusters_tICs1_3.h5")
cluster_minikmeans(tica_dir, projected_features_dir, traj_dir, n_clusters=16, clusterer_dir=clusterer16_clusters,tICs=[1,3])

In [None]:
closest_indices = find_closest_indices_to_cluster_center(tics_1_3_file, clusterer16_clusters)

In [None]:
closest_indices

In [None]:
from scipy.stats import gamma 
x = np.linspace(0.,16.0, 100)
y = gamma.pdf(x, 5.0, loc=8.0, scale=1)
plt.plot(x,y)

In [None]:
from scipy.stats import gamma 
x = np.linspace(0.,16.0, 100)
y = gamma.pdf(x, 3.0, loc=8.0, scale=0.4)
plt.plot(x,y)
plt.show()