In [1]:
import pyemma
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd

In [2]:
pdb = './WT_ZAFF.pdb' #Topology PDB
traj = './apo_300K_310K_combine_per100ps_tr1.nc' #All trajecotories at 100ps

In [3]:
distance_feat = pyemma.coordinates.featurizer(pdb)
distance_feat.add_distances_ca(periodic=True)
print("Total number of distance features:", distance_feat.dimension())
parent_pairdist = pd.DataFrame(distance_feat.describe())
distance_feat

Total number of distance features: 18336


MDFeaturizer with features:
['DIST: SER 1 CA 4 0 - SER 4 CA 45 0',
 'DIST: SER 1 CA 4 0 - GLN 5 CA 56 0',
 'DIST: SER 1 CA 4 0 - LYS 6 CA 73 0',
 'DIST: SER 1 CA 4 0 - THR 7 CA 95 0',
 'DIST: SER 1 CA 4 0 - TYR 8 CA 109 0',
 'DIST: SER 1 CA 4 0 - GLN 9 CA 130 0',
 'DIST: SER 1 CA 4 0 - GLY 10 CA 147 0',
 'DIST: SER 1 CA 4 0 - SER 11 CA 154 0',
 'DIST: SER 1 CA 4 0 - TYR 12 CA 165 0',
 'DIST: SER 1 CA 4 0 - GLY 13 CA 186 0', ...]

In [4]:
distance_data1=pyemma.coordinates.load(traj,features=distance_feat)

100%|██████████| 29/29 [00:51<00:00,  1.78s/it]                                 


In [5]:
distance_df1 = pd.DataFrame(distance_data1)
print("Number of frames/rows =",distance_df1.shape[0])
print("Distances/columns",distance_df1.shape[1])

Number of frames/rows = 104775
Distances/columns 18336


In [6]:
print("Total number of columns at the start:", distance_df1.shape[1])

Total number of columns at the start: 18336


In [7]:
df1 = distance_df1[(distance_df1>0.3) & (distance_df1<1)]

In [8]:
df_filter1 = df1.loc[:, (df1.isnull().sum(axis=0) >= distance_df1.shape[0])]
filter_columns = df_filter1.columns
df_filter1.shape[1]
distance_df2 = distance_df1.drop(filter_columns, axis=1)
print("Filter 1: Removing distances lt 3 A and gt 10 A in all frames")
print("Number of columns after applying Filter 1: ", distance_df2.shape[1])

Filter 1: Removing distances lt 3 A and gt 10 A in all frames
Number of columns after applying Filter 1:  3685


In [9]:
distance_df3 = distance_df2.loc[:, (distance_df2.var(axis=0) >= 0.05)]
print("Filter 2: Removing distances with variance lt 0.05")
print("Number of columns after applying Filter 2: ", distance_df3.shape[1])

Filter 2: Removing distances with variance lt 0.05
Number of columns after applying Filter 2:  719


In [10]:
final_filter_pairs = distance_df3.columns
filtered_pairs = parent_pairdist.loc[final_filter_pairs]
filtered_pairs.columns = ['Pairs']
filtered_split = filtered_pairs.Pairs.str.split(expand=True,)
#filtered_split
test1 = filtered_split[(filtered_split.iloc[:,2] !="1") & (filtered_split.iloc[:,8] != "194")]
distance_df4 = distance_df3[test1.index[:]]
print("Filter 3: Removing terminal residue pairs")
print("Number of columns after applying Filter 3: ", distance_df4.shape[1])

Filter 3: Removing terminal residue pairs
Number of columns after applying Filter 3:  623


In [11]:
tdist_data1=distance_df4.to_numpy()
tica1 = pyemma.coordinates.tica(tdist_data1,lag=20)
tica1_output = tica1.get_output()
tica1_concatenated = np.concatenate(tica1_output)
tica1_df = pd.DataFrame(tica1_concatenated)
tica1_nos = tica1_df.shape[1]
print("First tICA performeed on:", distance_df4.shape[1])
print("First cycle no. of tICs", tica1_nos)

First tICA performeed on: 623
First cycle no. of tICs 256


In [12]:
tica1_values = tica1_df.iloc[:,0]
tica2_values = tica1_df.iloc[:,1]
tica3_values = tica1_df.iloc[:,2]
tica4_values = tica1_df.iloc[:,3]
corr1_df = pd.DataFrame()
corr2_df = pd.DataFrame()
corr3_df = pd.DataFrame()
corr4_df = pd.DataFrame()
final_corr =  pd.DataFrame()

In [13]:
for col in distance_df4.columns:
    d1 = distance_df4[col]
    dist_corr1 = np.corrcoef(d1,tica1_values)
    dist_corr2 = np.corrcoef(d1,tica2_values)
    dist_corr3 = np.corrcoef(d1,tica3_values)
    dist_corr4 = np.corrcoef(d1,tica4_values)
    corr1_df[col]=[dist_corr1[0,1]]
    corr2_df[col]=[dist_corr2[0,1]]
    corr3_df[col]=[dist_corr3[0,1]]
    corr4_df[col]=[dist_corr4[0,1]]
    if(dist_corr1[0,1] > 0.4):
        final_corr[col]=[dist_corr1[0,1]]

In [14]:
final_corr.shape
fin_cols = final_corr.columns
distance_df5 = distance_df4[fin_cols]
print("Filter 4: Removing distances showing correlation less than 0.4 with IC 1")
print("Number of columns after applying Filter 4: ", distance_df5.shape[1])

Filter 4: Removing distances showing correlation less than 0.4 with IC 1
Number of columns after applying Filter 4:  106


In [15]:
tdist_data2=distance_df5.to_numpy()
tica2 = pyemma.coordinates.tica(tdist_data2,lag=20)
tica2_output = tica2.get_output()
tica2_concatenated = np.concatenate(tica2_output)
tica2_df = pd.DataFrame(tica2_concatenated)
tica2_nos = tica2_df.shape[1]
print("First tICA performed on:", distance_df5.shape[1])
print("First cycle no. of tICs", tica2_nos)

First tICA performed on: 106
First cycle no. of tICs 70


In [16]:
tica2_values1 = tica2_df.iloc[:,0]
tica2_values2 = tica2_df.iloc[:,1]
tica2_values3 = tica2_df.iloc[:,2]
tica2_values4 = tica2_df.iloc[:,3]
corr1_df2 = pd.DataFrame()
corr2_df2 = pd.DataFrame()
corr3_df2 = pd.DataFrame()
corr4_df2 = pd.DataFrame()
final_corr2 =  pd.DataFrame()

In [17]:
for col1 in distance_df5.columns:
    d2 = distance_df5[col1]
    dist2_corr1 = np.corrcoef(d2,tica2_values1)
    dist2_corr2 = np.corrcoef(d2,tica2_values2)
    dist2_corr3 = np.corrcoef(d2,tica2_values3)
    dist2_corr4 = np.corrcoef(d2,tica2_values4)
    corr1_df2[col1]=[dist2_corr1[0,1]]
    corr2_df2[col1]=[dist2_corr2[0,1]]
    corr3_df2[col1]=[dist2_corr3[0,1]]
    corr4_df2[col1]=[dist2_corr4[0,1]]
    if(dist2_corr1[0,1] > 0.6):
        final_corr2[col1]=[dist2_corr1[0,1]]

In [18]:
final_corr2.shape
fin_cols2 = final_corr2.columns
distance_df6 = distance_df5[fin_cols2]
print("Filter 5: Removing distances showing correlation less than 0.6 with IC 1")
print("Number of columns after applying Filter 5: ", distance_df6.shape[1])

Filter 5: Removing distances showing correlation less than 0.6 with IC 1
Number of columns after applying Filter 5:  59


In [19]:
tdist_data3=distance_df6.to_numpy()
tica3 = pyemma.coordinates.tica(tdist_data3,lag=20)
tica3_output = tica3.get_output()
tica3_concatenated = np.concatenate(tica3_output)
tica3_df = pd.DataFrame(tica3_concatenated)
tica3_nos = tica3_df.shape[1]
print("Second tICA performed on:", distance_df6.shape[1])
print("Second cycle no. of tICs", tica3_nos)

Second tICA performed on: 59
Second cycle no. of tICs 42


In [20]:
tica3_values1 = tica3_df.iloc[:,0]
tica3_values2 = tica3_df.iloc[:,1]
tica3_values3 = tica3_df.iloc[:,2]
tica3_values4 = tica3_df.iloc[:,3]
corr1_df3 = pd.DataFrame()
corr2_df3 = pd.DataFrame()
corr3_df3 = pd.DataFrame()
corr4_df3 = pd.DataFrame()
final_corr3 =  pd.DataFrame()

In [21]:
for col2 in distance_df6.columns:
    d3 = distance_df6[col2]
    dist3_corr1 = np.corrcoef(d3,tica3_values1)
    dist3_corr2 = np.corrcoef(d3,tica3_values2)
    dist3_corr3 = np.corrcoef(d3,tica3_values3)
    dist3_corr4 = np.corrcoef(d3,tica3_values4)
    corr1_df3[col2]=[dist3_corr1[0,1]]
    corr2_df3[col2]=[dist3_corr2[0,1]]
    corr3_df3[col2]=[dist3_corr3[0,1]]
    corr4_df3[col2]=[dist3_corr4[0,1]]
    if(dist3_corr1[0,1] > 0.7):
        final_corr3[col2]=[dist3_corr1[0,1]]

In [22]:
final_corr3.shape
fin_cols3 = final_corr3.columns
distance_df7 = distance_df6[fin_cols3]
print("Filter 6: Removing distances showing correlation less than 0.7 with IC 1")
print("Number of columns after applying Filter 6: ", distance_df7.shape[1])

Filter 6: Removing distances showing correlation less than 0.7 with IC 1
Number of columns after applying Filter 6:  29


In [34]:
tdist_data4=distance_df7.to_numpy()
tica4 = pyemma.coordinates.tica(tdist_data4,lag=20)
tica4_output = tica4.get_output()
tica4_concatenated = np.concatenate(tica4_output)
tica4_df = pd.DataFrame(tica4_concatenated)
tica4_nos = tica4_df.shape[1]
print("Third tICA performed on:", distance_df7.shape[1])
print("Third cycle no. of tICs", tica4_nos)

Third tICA performed on: 29
Third cycle no. of tICs 23


In [24]:
tica4_values1 = tica4_df.iloc[:,0]
tica4_values2 = tica4_df.iloc[:,1]
tica4_values3 = tica4_df.iloc[:,2]
tica4_values4 = tica4_df.iloc[:,3]
corr1_df4 = pd.DataFrame()
corr2_df4 = pd.DataFrame()
corr3_df4 = pd.DataFrame()
corr4_df4 = pd.DataFrame()
final_corr4 =  pd.DataFrame()

In [25]:
for col3 in distance_df7.columns:
    d4 = distance_df7[col3]
    dist4_corr1 = np.corrcoef(d4,tica4_values1)
    dist4_corr2 = np.corrcoef(d4,tica4_values2)
    dist4_corr3 = np.corrcoef(d4,tica4_values3)
    dist4_corr4 = np.corrcoef(d4,tica4_values4)
    corr1_df4[col3]=[dist4_corr1[0,1]]
    corr2_df4[col3]=[dist4_corr2[0,1]]
    corr3_df4[col3]=[dist4_corr3[0,1]]
    corr4_df4[col3]=[dist4_corr4[0,1]]
    if(dist4_corr1[0,1] > 0.8):
        final_corr4[col3]=[dist4_corr1[0,1]]

In [26]:
final_corr4.shape
fin_cols4 = final_corr4.columns
distance_df8 = distance_df7[fin_cols4]
print("Filter 7: Removing distances showing correlation less than 0.8 with IC 1")
print("Number of columns after applying Filter 7: ", distance_df8.shape[1])

Filter 7: Removing distances showing correlation less than 0.8 with IC 1
Number of columns after applying Filter 7:  16


In [33]:
tdist_data5=distance_df8.to_numpy()
tica5 = pyemma.coordinates.tica(tdist_data5,lag=20)
tica5_output = tica5.get_output()
tica5_concatenated = np.concatenate(tica5_output)
tica5_df = pd.DataFrame(tica5_concatenated)
tica5_nos = tica5_df.shape[1]
print("Fourth tICA performed on:", distance_df8.shape[1])
print("Fourth cycle no. of tICs", tica5_nos)

Fourth tICA performed on: 16
Fourth cycle no. of tICs 14


In [28]:
tica5_values1 = tica5_df.iloc[:,0]
tica5_values2 = tica5_df.iloc[:,1]
tica5_values3 = tica5_df.iloc[:,2]
tica5_values4 = tica5_df.iloc[:,3]
corr1_df5 = pd.DataFrame()
corr2_df5 = pd.DataFrame()
corr3_df5 = pd.DataFrame()
corr4_df5 = pd.DataFrame()
final_corr5 =  pd.DataFrame()

In [29]:
for col4 in distance_df8.columns:
    d5 = distance_df8[col4]
    dist5_corr1 = np.corrcoef(d4,tica4_values1)
    dist5_corr2 = np.corrcoef(d4,tica4_values2)
    dist5_corr3 = np.corrcoef(d4,tica4_values3)
    dist5_corr4 = np.corrcoef(d4,tica4_values4)
    corr1_df5[col4]=[dist5_corr1[0,1]]
    corr2_df5[col4]=[dist5_corr2[0,1]]
    corr3_df5[col4]=[dist5_corr3[0,1]]
    corr4_df5[col4]=[dist5_corr4[0,1]]
    if(dist5_corr1[0,1] > 0.9):
        final_corr5[col4]=[dist5_corr1[0,1]]

In [30]:
final_corr5.shape
fin_cols5 = final_corr5.columns
distance_df9 = distance_df8[fin_cols5]
print("Filter 8: Removing distances showing correlation less than 0.9 with IC 1")
print("Number of columns after applying Filter 8: ", distance_df9.shape[1])

Filter 8: Removing distances showing correlation less than 0.9 with IC 1
Number of columns after applying Filter 8:  0


In [31]:
filtered1_pairs2 = parent_pairdist.loc[distance_df8.columns]
filtered1_pairs2.to_csv('Apo_pairlist_16.csv', index=False)