In [39]:
import pandas as pd
import numpy as np

#import rdkit
#from rdkit import Chem

import biopandas
from biopandas.pdb import PandasPdb
from biopandas.mol2 import PandasMol2

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import sys
from collections import Counter

In [40]:
BIM_ATOM_COUNT = 25
T2N_ATOM_COUNT = 15
T5P_ATOM_COUNT = 3
ANION_MOLECULE = "T2N"
CATION_MOLECULE = "BMI"
WATER_MOLECULE = "T5P"
SODIUM_ATOM = "NA"
CHLORIDE_ATOM = "CL"

In [41]:
INPUT_FILE = "./FILE/BOX_Bmim_Tf2N_0_5M_all_components_amorphous_-_Frame_51.mol2"
INPUT_FILE_PATH = "./FILE/BOX_Bmim_Tf2N_0_5M_all_components_amorphous_-_Frame_51.mol2"

In [42]:
cnt = 0
find_word = "@<TRIPOS>BOND"
with open(INPUT_FILE, 'r') as file:    # hello.txt 파일을 읽기 모드(r)로 열기
    line = None    # 변수 line을 None으로 초기화
    while line != '':
        line = file.readline()
        cnt += 1
        if "@<TRIPOS>BOND" in line:
            FN = cnt
            
        if "@<TRIPOS>SUBSTRUCTURE" in line:
            EN = cnt
#===============================================================================           
f = open(INPUT_FILE_PATH, 'r')
F = f.readlines()[FN:EN-1]
Bond_list = []
for i in range(EN-FN-1):
    DATA = F[int(i)].strip().split()
    Bond_list.append(DATA)
#===============================================================================  
column_name=['Bond_Index', 'Bond_Atom1', 'Bond_Atom2', 'Bond_Case']
Bond_Data_Frame = pd.DataFrame(Bond_list, columns = column_name)
#===============================================================================
Bond_Data_Frame = Bond_Data_Frame.astype({'Bond_Atom1': int, 'Bond_Atom2': int})

In [43]:
pmol = PandasMol2().read_mol2(INPUT_FILE_PATH)  ### input
Total_system = pmol.df

In [44]:
Total_system

Unnamed: 0,atom_id,atom_name,x,y,z,atom_type,subst_id,subst_name,charge
0,1,C,21.7125,12.9756,-21.7713,C.3,1,BMI,-0.3305
1,2,N,22.9275,12.3355,-21.4824,N.pl3,1,BMI,0.1939
2,3,C,22.9824,11.0176,-21.2151,C.ar,1,BMI,-0.1044
3,4,N,24.2717,10.6997,-20.8984,N.pl3,1,BMI,0.2896
4,5,C,25.0390,11.8696,-20.9131,C.ar,1,BMI,-0.3225
...,...,...,...,...,...,...,...,...,...
42655,42656,Cl42656,-16.8365,6.9614,-18.4800,Cl,5,CL,-1.0000
42656,42657,Cl42657,-17.6616,-34.5102,11.8603,Cl,5,CL,-1.0000
42657,42658,Cl42658,-15.5498,9.7965,36.1147,Cl,5,CL,-1.0000
42658,42659,Cl42659,-36.4381,23.8670,-10.8523,Cl,5,CL,-1.0000


In [45]:
condition = (pmol.df.subst_name == CATION_MOLECULE) 
Data_Selecte_subst_CATION = pmol.df[condition]
Data_Selecte_subst_CATION_index = pmol.df[condition].index

condition = (pmol.df.subst_name == ANION_MOLECULE) 
Data_Selecte_subst_ANION = pmol.df[condition]
Data_Selecte_subst_ANION_index = pmol.df[condition].index

condition = (pmol.df.subst_name == WATER_MOLECULE) 
Data_Selecte_subst_WATER = pmol.df[condition]
Data_Selecte_subst_WATER_index = pmol.df[condition].index

condition = (pmol.df.subst_name == SODIUM_ATOM) 
Data_Selecte_subst_NA = pmol.df[condition]
Data_Selecte_subst_NA_index = pmol.df[condition].index


condition = (pmol.df.subst_name == CHLORIDE_ATOM) 
Data_Selecte_subst_CL = pmol.df[condition]
Data_Selecte_subst_CL_index = pmol.df[condition].index

In [46]:
Total_system

Unnamed: 0,atom_id,atom_name,x,y,z,atom_type,subst_id,subst_name,charge
0,1,C,21.7125,12.9756,-21.7713,C.3,1,BMI,-0.3305
1,2,N,22.9275,12.3355,-21.4824,N.pl3,1,BMI,0.1939
2,3,C,22.9824,11.0176,-21.2151,C.ar,1,BMI,-0.1044
3,4,N,24.2717,10.6997,-20.8984,N.pl3,1,BMI,0.2896
4,5,C,25.0390,11.8696,-20.9131,C.ar,1,BMI,-0.3225
...,...,...,...,...,...,...,...,...,...
42655,42656,Cl42656,-16.8365,6.9614,-18.4800,Cl,5,CL,-1.0000
42656,42657,Cl42657,-17.6616,-34.5102,11.8603,Cl,5,CL,-1.0000
42657,42658,Cl42658,-15.5498,9.7965,36.1147,Cl,5,CL,-1.0000
42658,42659,Cl42659,-36.4381,23.8670,-10.8523,Cl,5,CL,-1.0000


In [47]:
CATION_Residue_LIST = []


for i in range(1, int(len(Data_Selecte_subst_CATION_index)/25)+1):
    ATOM_COUNT = 0  ## while 반복문 초기화
    while ATOM_COUNT < 25:    ## 수치 적정 변수로 생각하기
        CATION_Residue_LIST.append("CATION_Res_"+str(i))
        ATOM_COUNT = ATOM_COUNT+1

In [48]:
len(CATION_Residue_LIST)

9675

In [49]:
ANION_Residue_LIST = []


for i in range(1, int(len(Data_Selecte_subst_ANION_index)/15)+1):
    ATOM_COUNT = 0  ## while 반복문 초기화
    while ATOM_COUNT < 15:    ## 수치 적정 변수로 생각하기
        ANION_Residue_LIST.append("ANION_Res_"+str(i))
        ATOM_COUNT = ATOM_COUNT+1

In [50]:
len(ANION_Residue_LIST)

5805

In [51]:
Data_Selecte_subst_ANION_index

Int64Index([ 9675,  9676,  9677,  9678,  9679,  9680,  9681,  9682,  9683,
             9684,
            ...
            15470, 15471, 15472, 15473, 15474, 15475, 15476, 15477, 15478,
            15479],
           dtype='int64', length=5805)

In [52]:
WATER_Residue_LIST = []


for i in range(1, int(len(Data_Selecte_subst_WATER_index)/3)+1):
    ATOM_COUNT = 0  ## while 반복문 초기화
    while ATOM_COUNT < 3:    ## 수치 적정 변수로 생각하기
        WATER_Residue_LIST.append("WATER_Res_"+str(i))
        ATOM_COUNT = ATOM_COUNT+1

In [53]:
len(WATER_Residue_LIST)

27000

In [54]:
NA_Residue_LIST = []


for i in range(1, int(len(Data_Selecte_subst_NA_index)/1)+1):
    ATOM_COUNT = 0  ## while 반복문 초기화
    while ATOM_COUNT < 1:    ## 수치 적정 변수로 생각하기
        NA_Residue_LIST.append("NA_Res_"+str(i))
        ATOM_COUNT = ATOM_COUNT+1

In [55]:
len(NA_Residue_LIST)

90

In [56]:
CL_Residue_LIST = []


for i in range(1, int(len(Data_Selecte_subst_CL_index)/1)+1):
    ATOM_COUNT = 0  ## while 반복문 초기화
    while ATOM_COUNT < 1:    ## 수치 적정 변수로 생각하기
        CL_Residue_LIST.append("CL_Res_"+str(i))
        ATOM_COUNT = ATOM_COUNT+1

In [57]:
len(CL_Residue_LIST)

90

In [58]:
SYSTEM_Reside_LIST = CATION_Residue_LIST + ANION_Residue_LIST + WATER_Residue_LIST + NA_Residue_LIST + CL_Residue_LIST

In [59]:
len(SYSTEM_Reside_LIST)

42660

In [60]:
SYSTEM_Reside_DF = pd.DataFrame(SYSTEM_Reside_LIST)
SYSTEM_Reside_DF.columns=["Residue_number"]

In [61]:
Total_system_SYSTEM_Reside_DF = pd.concat([Total_system,SYSTEM_Reside_DF],axis=1)

In [62]:
Total_system_SYSTEM_Reside_DF[Total_system_SYSTEM_Reside_DF['subst_name'] == CATION_MOLECULE]

Unnamed: 0,atom_id,atom_name,x,y,z,atom_type,subst_id,subst_name,charge,Residue_number
0,1,C,21.7125,12.9756,-21.7713,C.3,1,BMI,-0.3305,CATION_Res_1
1,2,N,22.9275,12.3355,-21.4824,N.pl3,1,BMI,0.1939,CATION_Res_1
2,3,C,22.9824,11.0176,-21.2151,C.ar,1,BMI,-0.1044,CATION_Res_1
3,4,N,24.2717,10.6997,-20.8984,N.pl3,1,BMI,0.2896,CATION_Res_1
4,5,C,25.0390,11.8696,-20.9131,C.ar,1,BMI,-0.3225,CATION_Res_1
...,...,...,...,...,...,...,...,...,...,...
9670,9671,H,22.3638,-11.8768,4.0557,H,1,BMI,0.0914,CATION_Res_387
9671,9672,H,22.3671,-13.5961,4.2719,H,1,BMI,0.0882,CATION_Res_387
9672,9673,HXT,24.2591,-11.2821,9.7294,H,1,BMI,0.2511,CATION_Res_387
9673,9674,HXT,27.8447,-11.9993,7.5262,H,1,BMI,0.2562,CATION_Res_387


In [63]:
### 1. 특정 ATOM을 선택하면 해당 index를 뽑고 해당 Residue_number를 추출한다.
### 2. 해당 Residue_number에 해당하는 분자를 찾는다.

## Define Cluster

#### step 1. 선택한 molecule의 인근 (3A) ATOM 선택하기

In [25]:
condition = (pmol.df.subst_name == ANION_MOLECULE)
Data_Selecte_subst = pmol.df[condition]
Data_Selecte_subst_index = pmol.df[condition].index

In [26]:
Data_Selecte_subst_index

Int64Index([ 9675,  9676,  9677,  9678,  9679,  9680,  9681,  9682,  9683,
             9684,
            ...
            15470, 15471, 15472, 15473, 15474, 15475, 15476, 15477, 15478,
            15479],
           dtype='int64', length=5805)

In [27]:
#pmol = PandasMol2().read_mol2(INPUT_FILE_PATH)  ### input
#Total_system = pmol.df
#condition = (pmol.df.subst_name == ANION_MOLECULE) 
#Data_Selecte_subst = pmol.df[condition]
#Data_Selecte_subst_index = pmol.df[condition].index
#Data_Selecte_subst
#Data_Selecte_subst_index
## 값 setting


find_range = 3    ### input
list_set = []

for i in range(0, len(Data_Selecte_subst_index)):
    select_dataframe = None  ## 선택한 데이터 프레임 초기화
    select_dataframe = Data_Selecte_subst.iloc[i]


    sq1_range = (select_dataframe[['x','y','z']]+find_range)
    sq3_range = (select_dataframe[['x','y','z']]-find_range)

    selected_dataframe = Total_system_SYSTEM_Reside_DF[(Total_system_SYSTEM_Reside_DF['x'] <= sq1_range["x"]) &
                                                       (Total_system_SYSTEM_Reside_DF['x'] >= sq3_range["x"]) &
                                                       (Total_system_SYSTEM_Reside_DF['y'] <= sq1_range["y"]) &
                                                       (Total_system_SYSTEM_Reside_DF['y'] >= sq3_range["y"]) &
                                                       (Total_system_SYSTEM_Reside_DF['z'] <= sq1_range["z"]) &
                                                       (Total_system_SYSTEM_Reside_DF['z'] >= sq3_range["z"])]
                                      

    cal_x = (float(select_dataframe["x"]) - selected_dataframe["x"]).pow(2)
    cal_y = (float(select_dataframe["y"]) - selected_dataframe["y"]).pow(2)
    cal_z = (float(select_dataframe["z"]) - selected_dataframe["z"]).pow(2)
    Distance = (cal_x+cal_y+cal_z)**0.5
    
    #Distance

    Distance_index = Distance[(Distance <= find_range)].index
    Select_Atom_List = Distance_index.values.tolist()
    
    ##  선택한 원자 선택
    
    # list_set
    # list_set = [] 위쪽에 빈공간 있음
    for i in range(0,len(Select_Atom_List)+1):
        list_set.extend(Select_Atom_List)
        list_set = set(list_set)  
        list_set = list(list_set)
        
#list_set_ATOM_numbering = [i+1 for i in list_set]  ## 해당 값은 index기 때문에 ATOM을 선택하려면 1식 더해줘야함

#### step 2. 선택한 molecule에서 Cation Residue만 선택하기

In [43]:
### list set은 인근 잔기 선택한 index
### Residue_number ## 특정 잔기 데이터 추출
### DataFrame을 그대로 사용하면 모든 값마다 list를 가져 데이터 처리가 불편함 따라서 squeeze를 사용해 DataFrame 형태를 Series로 변경  

##df.drop_duplicateds(subset=None, keep='first',inplace=False)
#subset : 중복을 처리할 대상이 되는 열
#keep 
#####- first: 해당값을 발견한 첫번째 값을 제외하고 중복항목삭제
#####- last: 해당값을 마지막으로 발견한 데이터를 제외하고 중복제이터를 삭제
#####- False: 모든 중복 데이터를 삭제

#inplace: 해당 제거된 데이터를 원본데이터로 변경할지 True/False로 결정


Total_system_SYSTEM_Reside_DF.loc[list_set][["Residue_number"]].drop_duplicates().squeeze().to_list()

['CATION_Res_1',
 'CATION_Res_2',
 'WATER_Res_5776',
 'CATION_Res_3',
 'CATION_Res_4',
 'CATION_Res_5',
 'WATER_Res_5799',
 'WATER_Res_5800',
 'CATION_Res_6',
 'WATER_Res_5809',
 'WATER_Res_5815',
 'CATION_Res_7',
 'CATION_Res_8',
 'WATER_Res_5828',
 'WATER_Res_5830',
 'CATION_Res_9',
 'WATER_Res_5836',
 'CATION_Res_10',
 'CATION_Res_11',
 'WATER_Res_5846',
 'WATER_Res_5850',
 'WATER_Res_5854',
 'CATION_Res_12',
 'WATER_Res_5864',
 'CATION_Res_13',
 'CATION_Res_14',
 'CATION_Res_15',
 'WATER_Res_5881',
 'WATER_Res_5886',
 'WATER_Res_5891',
 'CATION_Res_16',
 'CATION_Res_17',
 'WATER_Res_5901',
 'CATION_Res_18',
 'WATER_Res_5910',
 'CATION_Res_19',
 'WATER_Res_5915',
 'WATER_Res_5922',
 'CATION_Res_20',
 'CATION_Res_21',
 'WATER_Res_5937',
 'CATION_Res_22',
 'WATER_Res_5944',
 'WATER_Res_5946',
 'WATER_Res_5947',
 'WATER_Res_5949',
 'CATION_Res_23',
 'CATION_Res_24',
 'CATION_Res_25',
 'CATION_Res_26',
 'WATER_Res_5980',
 'CATION_Res_27',
 'WATER_Res_5984',
 'CATION_Res_28',
 'CATION_Re

In [29]:
Select_Residue_List = Total_system_SYSTEM_Reside_DF.loc[list_set][["Residue_number"]].drop_duplicates().squeeze().to_list()

In [30]:
# 포함하고자 하는 문자열 리스트 생성
Select_Residue_List = Total_system_SYSTEM_Reside_DF.loc[list_set][["Residue_number"]].drop_duplicates().squeeze().to_list()


# join함수를 이용하여 이어주고 contains 함수에 넣기
Select_Residue_List_join = '|'.join(Select_Residue_List)
result = Total_system_SYSTEM_Reside_DF[Total_system_SYSTEM_Reside_DF['Residue_number'].str.contains(Select_Residue_List_join)]

In [31]:
Total_system_SYSTEM_Reside_DF[Total_system_SYSTEM_Reside_DF['subst_name']== CATION_MOLECULE]

Unnamed: 0,atom_id,atom_name,x,y,z,atom_type,subst_id,subst_name,charge,Residue_number
0,1,C,21.7125,12.9756,-21.7713,C.3,1,BMI,-0.3305,CATION_Res_1
1,2,N,22.9275,12.3355,-21.4824,N.pl3,1,BMI,0.1939,CATION_Res_1
2,3,C,22.9824,11.0176,-21.2151,C.ar,1,BMI,-0.1044,CATION_Res_1
3,4,N,24.2717,10.6997,-20.8984,N.pl3,1,BMI,0.2896,CATION_Res_1
4,5,C,25.0390,11.8696,-20.9131,C.ar,1,BMI,-0.3225,CATION_Res_1
...,...,...,...,...,...,...,...,...,...,...
9670,9671,H,22.3638,-11.8768,4.0557,H,1,BMI,0.0914,CATION_Res_387
9671,9672,H,22.3671,-13.5961,4.2719,H,1,BMI,0.0882,CATION_Res_387
9672,9673,HXT,24.2591,-11.2821,9.7294,H,1,BMI,0.2511,CATION_Res_387
9673,9674,HXT,27.8447,-11.9993,7.5262,H,1,BMI,0.2562,CATION_Res_387


In [32]:
result[result['subst_name'] == CATION_MOLECULE]

Unnamed: 0,atom_id,atom_name,x,y,z,atom_type,subst_id,subst_name,charge,Residue_number
0,1,C,21.7125,12.9756,-21.7713,C.3,1,BMI,-0.3305,CATION_Res_1
1,2,N,22.9275,12.3355,-21.4824,N.pl3,1,BMI,0.1939,CATION_Res_1
2,3,C,22.9824,11.0176,-21.2151,C.ar,1,BMI,-0.1044,CATION_Res_1
3,4,N,24.2717,10.6997,-20.8984,N.pl3,1,BMI,0.2896,CATION_Res_1
4,5,C,25.0390,11.8696,-20.9131,C.ar,1,BMI,-0.3225,CATION_Res_1
...,...,...,...,...,...,...,...,...,...,...
9670,9671,H,22.3638,-11.8768,4.0557,H,1,BMI,0.0914,CATION_Res_387
9671,9672,H,22.3671,-13.5961,4.2719,H,1,BMI,0.0882,CATION_Res_387
9672,9673,HXT,24.2591,-11.2821,9.7294,H,1,BMI,0.2511,CATION_Res_387
9673,9674,HXT,27.8447,-11.9993,7.5262,H,1,BMI,0.2562,CATION_Res_387


#### step 3. 선택한 molecule의 인근 (3A) ATOM 선택하기

In [33]:
Data_Selecte_subst = result[result['subst_name'] == CATION_MOLECULE]

In [34]:
Data_Selecte_subst_index = result[result['subst_name'] == CATION_MOLECULE].index

In [35]:

find_range = 3    ### input
list_set = []

for i in range(0, len(Data_Selecte_subst_index)):
    select_dataframe = None  ## 선택한 데이터 프레임 초기화
    select_dataframe = Data_Selecte_subst.iloc[i]


    sq1_range = (select_dataframe[['x','y','z']]+find_range)
    sq3_range = (select_dataframe[['x','y','z']]-find_range)

    selected_dataframe = Total_system[(Total_system['x'] <= sq1_range["x"]) &
                                      (Total_system['x'] >= sq3_range["x"]) &
                                      (Total_system['y'] <= sq1_range["y"]) &
                                      (Total_system['y'] >= sq3_range["y"]) &
                                      (Total_system['z'] <= sq1_range["z"]) &
                                      (Total_system['z'] >= sq3_range["z"])]
                                      

    cal_x = (float(select_dataframe["x"]) - selected_dataframe["x"]).pow(2)
    cal_y = (float(select_dataframe["y"]) - selected_dataframe["y"]).pow(2)
    cal_z = (float(select_dataframe["z"]) - selected_dataframe["z"]).pow(2)
    Distance = (cal_x+cal_y+cal_z)**0.5
    
    #Distance

    Distance_index = Distance[(Distance <= find_range)].index
    Select_Atom_List=Distance_index.values.tolist()
    
    ##  선택한 원자 선택
    
    # list_set
    # list_set = [] 위쪽에 빈공간 있음
    for i in range(0,len(Select_Atom_List)+1):
        list_set.extend(Select_Atom_List)
        list_set = set(list_set)  
        list_set = list(list_set)


In [36]:
# 포함하고자 하는 문자열 리스트 생성
Select_Residue_List=Total_system_SYSTEM_Reside_DF.loc[list_set][["Residue_number"]].drop_duplicates().squeeze().to_list()


# join함수를 이용하여 이어주고 contains 함수에 넣기
Select_Residue_List_join = '|'.join(Select_Residue_List)
result = Total_system_SYSTEM_Reside_DF[Total_system_SYSTEM_Reside_DF['Residue_number'].str.contains(Select_Residue_List_join)]

#### step 4. IL Cluster Define

In [37]:
result[(result['subst_name'] == CATION_MOLECULE)|(result['subst_name'] == ANION_MOLECULE)]

Unnamed: 0,atom_id,atom_name,x,y,z,atom_type,subst_id,subst_name,charge,Residue_number
0,1,C,21.7125,12.9756,-21.7713,C.3,1,BMI,-0.3305,CATION_Res_1
1,2,N,22.9275,12.3355,-21.4824,N.pl3,1,BMI,0.1939,CATION_Res_1
2,3,C,22.9824,11.0176,-21.2151,C.ar,1,BMI,-0.1044,CATION_Res_1
3,4,N,24.2717,10.6997,-20.8984,N.pl3,1,BMI,0.2896,CATION_Res_1
4,5,C,25.0390,11.8696,-20.9131,C.ar,1,BMI,-0.3225,CATION_Res_1
...,...,...,...,...,...,...,...,...,...,...
15475,15476,F,18.5276,30.0993,-18.6566,F,2,T2N,-0.1518,ANION_Res_387
15476,15477,O,17.4059,33.6747,-17.9482,O.2,2,T2N,-0.6559,ANION_Res_387
15477,15478,O,16.6167,31.5197,-16.9102,O.2,2,T2N,-0.6559,ANION_Res_387
15478,15479,O,15.8857,34.0225,-20.4904,O.2,2,T2N,-0.6559,ANION_Res_387


In [104]:
## 변수 생성

for i in range(1,int(len(Data_Selecte_subst_CATION_index)/15)+1):
    globals()['CATION_Var_{}'.format(i)] = []

In [119]:
## 변수 리스트 제작

CATION_Variable_LIST = []

for i in range(1, int(len(Data_Selecte_subst_CATION_index)/15)+1):
    CATION_Variable_LIST.append("CATION_Var_"+str(i))

In [128]:
Data_Selecte_subst_CATION_index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            9665, 9666, 9667, 9668, 9669, 9670, 9671, 9672, 9673, 9674],
           dtype='int64', length=9675)

In [None]:



while_count = 0
while while_count < 15:
    
    
    
    while_count = while_count + 1
    
    
    
    

In [131]:
Data_Selecte_subst_CATION_index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            9665, 9666, 9667, 9668, 9669, 9670, 9671, 9672, 9673, 9674],
           dtype='int64', length=9675)

In [61]:
Data_Selecte_subst_ANION_index

Int64Index([ 9675,  9676,  9677,  9678,  9679,  9680,  9681,  9682,  9683,
             9684,
            ...
            15470, 15471, 15472, 15473, 15474, 15475, 15476, 15477, 15478,
            15479],
           dtype='int64', length=5805)

In [72]:
len(Data_Selecte_subst_ANION_index)/15

387.0

In [63]:
Data_Selecte_subst_WATER_index

Int64Index([15480, 15481, 15482, 15483, 15484, 15485, 15486, 15487, 15488,
            15489,
            ...
            42470, 42471, 42472, 42473, 42474, 42475, 42476, 42477, 42478,
            42479],
           dtype='int64', length=27000)

In [68]:
len(Data_Selecte_subst_WATER_index)/3

9000.0