# Импорт

События в блоке кода:

- Импорт необходимых модулей
- Загрузка необработанных датасетов

In [1]:
import re

import pandas as pd

from rdkit import Chem, DataStructs
from rdkit.Chem import Draw, Descriptors, AllChem, rdMolDescriptors
import rdkit.Chem.AllChem as AllChem

In [2]:
reaxys = pd.read_csv('reaxys.csv', sep='\t')
chembl = pd.read_csv('chembl.csv', sep='\t')

# Мишень

События в блоке кода:

- Отбор только тех строк, которые содержат данные исключительно по СDK7 (вне зависимости от организма)

In [3]:
chembl_target = chembl[chembl['Target Name'] == 'Cyclin-dependent kinase 7']

reaxys_options = ['Cyclin-dependent kinase 7 [human]', 'Cyclin-dependent kinase 7',
                  'Cyclin-dependent kinase 7 [Candida albicans]']
reaxys_target = reaxys.loc[reaxys['Target Name'].isin(reaxys_options)]

# Параметр

События в блоке кода:

- Отбор измеренных для соединения параметров

Что это за параметры?

- **IC50** - концентрация полумаксимального ингибирования (чем ниже, тем активнее)
- **EC50** - концентрацию лиганда, которая вызывает эффект, равный половине максимального возможного для данного лиганда (чем ниже, тем активнее)
- **pIC50** - отрицательный логарифм IC50

In [4]:
chembl_target['Standard Type'].unique()

array(['IC50', 'Inhibition', 'Kd', 'Ki', 'Activity', 'Occ50', 'Ratio',
       '% Control', '% Ctrl'], dtype=object)

In [5]:
reaxys_target['Medchem: Measurement Parameter'].unique()

array(['IC50', 'inhibition rate', 'pIC50', 'Ki (inhibition constant)',
       'inhibition percentage', 'Kd (dissociation constant)',
       'control percentage', 'EC50'], dtype=object)

In [6]:
chembl_type = chembl_target[chembl_target['Standard Type'] == 'IC50']

In [7]:
reaxys_IC50 = reaxys_target[reaxys_target['Medchem: Measurement Parameter'] == 'IC50']
reaxys_EC50 = reaxys_target[reaxys_target['Medchem: Measurement Parameter'] == 'EC50']
reaxys_pIC50 = reaxys_target[reaxys_target['Medchem: Measurement Parameter'] == 'pIC50']
reaxys_type = pd.concat([reaxys_IC50, reaxys_EC50, reaxys_pIC50], axis=0)

# Единицы

События в блоке кода:

- Конвертация концентраций в наномолярную (по возможности)
- Трансформация pIC50 в IC50

In [8]:
chembl_type['Standard Units'].value_counts()

nM    294
Name: Standard Units, dtype: int64

In [9]:
reaxys_type['Unit'].value_counts()

nM         1064
μM          527
no unit     187
µM           15
μmol/l        6
M             1
Name: Unit, dtype: int64

In [10]:
chembl_type_nan = chembl_type[chembl_type['Standard Units'].notna()]
reaxys_type_nan = reaxys_type[reaxys_type['Unit'].notna()]

In [11]:
lower_vals = []
upper_vals = []

for value in reaxys_type_nan['Quantitative value'].values:
    value_listed = list(value.split())
    if len(value_listed) > 1:
        if value_listed[0] < value_listed[2]:
            lower_val = value_listed[0]
            upper_val = value_listed[2]
            lower_vals.append(lower_val)
            upper_vals.append(upper_val)
        elif value_listed[0] > value_listed[2]:
            lower_val = value_listed[2]
            upper_val = value_listed[0]
            lower_vals.append(lower_val)
            upper_vals.append(upper_val)
    else:
        lower_vals.append(value_listed)
        upper_vals.append(value_listed)

In [12]:
lower_vals = [float(re.sub("[^0-9.]", "", str(i))) for i in lower_vals]
upper_vals = [float(re.sub("[^0-9.]", "", str(i))) for i in upper_vals]

In [13]:
reaxys_type_nan['lower_vals'] = lower_vals
reaxys_type_nan['upper_vals'] = upper_vals

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reaxys_type_nan['lower_vals'] = lower_vals
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reaxys_type_nan['upper_vals'] = upper_vals


In [14]:
def transform(x):
    if x[0] in ['nM', 'nmol/l']:
        return x[1]
    elif x[0] in ['µM', 'μM', 'μmol/l']:
        return round(x[1] * 10**3, 1)
    elif x[0] == 'mM':
        return x[1] * 10**6
    elif x[0] == 'pM':
        return x[1] / 10**3
    elif x[0] in ['M', 'mol/l']:
        return x[1] * 10**9
    else:
        return None

In [15]:
chembl_type_nan['converted_value'] = chembl_type_nan[['Standard Units', 'Standard Value']].apply(lambda x: transform(x), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chembl_type_nan['converted_value'] = chembl_type_nan[['Standard Units', 'Standard Value']].apply(lambda x: transform(x), axis = 1)


In [16]:
reaxys_type_nan['converted_value'] = reaxys_type_nan[['Unit', 'upper_vals']].apply(lambda x: transform(x), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reaxys_type_nan['converted_value'] = reaxys_type_nan[['Unit', 'upper_vals']].apply(lambda x: transform(x), axis = 1)


In [17]:
reaxys_pIC50 = reaxys_type_nan[reaxys_type_nan['Medchem: Measurement Parameter'] == 'pIC50']

In [18]:
IC50_vals = []

for value in reaxys_pIC50['upper_vals'].values:
    IC50 = 10**(9 - float(value))
    IC50_vals.append(IC50)
    
reaxys_pIC50['converted_value'] = IC50_vals

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reaxys_pIC50['converted_value'] = IC50_vals


In [19]:
reaxys_wout_pIC50 = reaxys_type_nan[reaxys_type_nan['Medchem: Measurement Parameter'] != 'pIC50']

In [20]:
reaxys_units_concat = pd.concat([reaxys_wout_pIC50, reaxys_pIC50], axis=0)

In [21]:
reaxys = reaxys_units_concat[['SMILES', 'converted_value']]

In [22]:
chembl = chembl_type_nan[['Smiles', 'converted_value']]

In [23]:
reaxys = reaxys.rename({'SMILES': 'smiles', 
                        'converted_value': 'value'}, axis=1)

chembl = chembl.rename({'Smiles': 'smiles', 
                        'converted_value': 'value'}, axis=1)

In [24]:
data = pd.concat([reaxys, chembl], axis=0)

# Классы

События в блоке кода:

- Добавление меток классов - если меньше или равно 20 нМ, то соединение можно считать активным

In [25]:
def set_class(x):
    if x['value'] <= 20:
        return 1
    else:
        return 0

In [26]:
data = data.assign(label=data.apply(set_class, axis=1))

In [27]:
data['label'].value_counts()

0    1704
1     390
Name: label, dtype: int64

In [28]:
data = data[['smiles', 'label']]

# Молекулы

События в блоке кода:

- Восстанавление пропущенных значений по индексу Танимото
- Отброс повторов молекул

In [29]:
chembl_nans = chembl_type[chembl_type['Standard Value'].isnull()]
chembl_nans_smiles = chembl_nans['Smiles'].to_list()

chembl_smiles = chembl['smiles'].to_list()

In [30]:
reaxys_nans = reaxys_type[reaxys_type['Quantitative value'].isnull()]
reaxys_nans_smiles = reaxys_nans['SMILES'].to_list()

reaxys_smiles = reaxys['smiles'].to_list()

In [31]:
researched_smiles = chembl_nans_smiles + reaxys_nans_smiles
reference_smiles = chembl_smiles + reaxys_smiles

In [32]:
researched_molecules = []

for smi in researched_smiles:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        researched_molecules.append(mol)
        
reference_molecules = []

for smi in reference_smiles:
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:
        reference_molecules.append(mol)

[11:29:27] Explicit valence for atom # 0 N, 4, is greater than permitted
[11:29:27] Explicit valence for atom # 0 N, 4, is greater than permitted


In [33]:
def tanimoto(reference_molecule, researched_molecule):

    fp = AllChem.GetMorganFingerprintAsBitVect(reference_molecule, 2, nBits = 1024)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(researched_molecule, 2, nBits = 1024)

    same_parts = set(fp.GetOnBits()) & set(fp2.GetOnBits()) 
    different_parts = set(fp.GetOnBits()) | set(fp2.GetOnBits())
    
    return len(same_parts)/len(different_parts)

In [34]:
similarity = {}

for mol_res in researched_molecules:
    tanimoto_list = []
    for mol_ref in reference_molecules:
        sim = tanimoto(mol_ref, mol_res)
        tanimoto_list.append(sim)
    sim_max = max(tanimoto_list)
    mol_sim_max = reference_molecules[tanimoto_list.index(sim_max)]
    similarity[Chem.MolToSmiles(mol_res)] = [Chem.MolToSmiles(mol_sim_max), sim_max]

In [35]:
ref_smiles = []

for key in similarity:
    ref_smi = similarity[key][0]
    ref_smiles.append(ref_smi)

In [36]:
res_smiles = list(similarity.keys())

In [37]:
ref_labels = {}

for smi in ref_smiles:
    if smi in reference_smiles:
        label = data[data['smiles']==smi]['label'].values[0]
        ref_labels[smi] = label

In [38]:
res_labels = [0] * len(res_smiles)

In [39]:
new_mols = dict(zip(res_smiles, res_labels))

In [40]:
new_mols_df = pd.DataFrame(list(new_mols.items()),
                      columns=['smiles','label'])

In [41]:
data = pd.concat([data, new_mols_df], axis=0)

In [42]:
data

Unnamed: 0,smiles,label
0,COC1=C2C(=O)C3=C(C(O)=C4C[C@](O)(C[C@H](O[C@H]...,0
1,COC1=C2C(=O)C3=C(C(O)=C4C[C@](O)(C[C@H](O[C@H]...,0
8,[NH3][Pt]1([NH3])OC(=O)C2(CCC2)C(=O)O1,0
9,[NH3][Pt]1([NH3])OC(=O)C2(CCC2)C(=O)O1,0
11,[H][C@@]12C[C@H](O)[C@@]3(C)C(=O)[C@H](O)C4=C(...,0
...,...,...
17,C=CC(=O)NC(CC)c1ccc(-c2ccc([C@H](C)C(=O)Nc3cc(...,0
18,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,0
19,CN1CC[C@H](c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc...,0
20,Cn1cnc2c(NCc3ccccc3)nc(NCCO)nc21,0


In [43]:
data['smiles'].nunique()

1420

In [46]:
df = data.drop_duplicates()

In [47]:
df

Unnamed: 0,smiles,label
0,COC1=C2C(=O)C3=C(C(O)=C4C[C@](O)(C[C@H](O[C@H]...,0
8,[NH3][Pt]1([NH3])OC(=O)C2(CCC2)C(=O)O1,0
11,[H][C@@]12C[C@H](O)[C@@]3(C)C(=O)[C@H](O)C4=C(...,0
13,[H].O.[Na],0
137,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)N1C3=C(C=C...,0
...,...,...
16,Cc1ccc([N+](=O)[O-])cc1S(=O)(=O)N(C)/N=C/c1cnn...,0
17,C=CC(=O)NC(CC)c1ccc(-c2ccc([C@H](C)C(=O)Nc3cc(...,0
18,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,0
20,Cn1cnc2c(NCc3ccccc3)nc(NCCO)nc21,0


In [49]:
df.groupby(by='label').count()

Unnamed: 0_level_0,smiles
label,Unnamed: 1_level_1
0,1256
1,313


# Сохранение

События в блоке кода:

- Формирование датасета
- Сохранение в формате .csv

In [50]:
df.to_csv('data.csv')