数据处理
训练/测试数据：
来自[O'Neil](http://www.bioinf.jku.at/software/DeepSynergy/labels.csv)
来自[DrugCombDB](https://drugcomb.fimm.fi/)
构图数据：
边：
- 药物-药物结构相似性：
[DrugBank](https://go.drugbank.com/releases/latest)
[PubChem](https://pubchem.ncbi.nlm.nih.gov/)
[CHEMBL](https://www.ebi.ac.uk/chembl/)
获取SMILES 经过rdkit计算分子相似性
- 药物-靶蛋白关联关系：
[DrugBank](https://go.drugbank.com/releases/latest)
[STITCH](http://stitch.embl.de/)
[CHEMBL](https://www.ebi.ac.uk/chembl/)
寻找药物的target
- 癌细胞系-蛋白质关联关系：
[CCLE](https://sites.broadinstitute.org/ccle/)
癌细胞系突变改变的蛋白质
- 蛋白质相互作用：
[STRING](https://cn.string-db.org/)
[BioGRID](https://thebiogrid.org/)
节点：
- 药物特征：
[rdkit描述符计算](https://blog.csdn.net/qq_36801966/article/details/107028297)
分子图
- 蛋白质特征：
用Python的[iFeature](https://github.com/Superzchen/iFeature)包计算蛋白质结构、物理化学描述符
- 癌细胞特征：
ArrayExpress基因表达谱

In [11]:
"""
O'Neil dataset preprocessing.

Input columns: [(unnamed), drug_a_name, drug_b_name, cell_line, synergy, fold]
Labelling rule: synergy >= 30 -> synergistic (label 1); synergy < 30 -> not
synergistic (label 0).
The rows are then split into 5 cross-validation folds using the `fold` column.
"""

import pandas as pd
import os

raw_data = pd.read_csv("./data/oneil/labels.csv")

# Threshold the whole column at once instead of assigning row by row through
# iterrows()/.loc. This is much faster and also works on an empty table.
# A missing synergy value compares False and therefore gets label 0.
raw_data['label'] = (raw_data['synergy'] >= 30.0).astype(int)

folded_data = raw_data.groupby('fold')
for key in range(5):
    # Fold `key` is held out for testing; the other four folds form the
    # training set.
    test_fold = folded_data.get_group(key)
    train_fold = pd.concat(
        [folded_data.get_group(train_key) for train_key in range(5) if train_key != key]
    )

    output_dir = "./data/oneil/fold" + str(key)
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(output_dir, exist_ok=True)
    test_fold.to_csv(os.path.join(output_dir, "test.csv"))
    train_fold.to_csv(os.path.join(output_dir, "train.csv"))

# The labelled table is read back by the later drug-similarity step.
raw_data.to_csv("./data/oneil/processed_label_data.csv")

In [5]:
from time import sleep

# Drug structural similarity, part 1:
# 1. collect the list of drugs used in the O'Neil data
# 2. look up each drug on PubChem to obtain its SMILES
# 3. (done in a later cell) compute the similarities with rdkit

# Fetch the SMILES strings
import pubchempy as pcp
import pandas as pd
import pickle


def _fetch_compounds(drug_name, attempts=3, delay=5):
    """Look up `drug_name` on PubChem by name, retrying on failure.

    Args:
        drug_name: name passed to `pcp.get_compounds(..., "name")`.
        attempts: maximum number of lookups before giving up.
        delay: seconds to sleep between two attempts.

    Returns:
        Whatever `pcp.get_compounds` returns for the name.

    Raises:
        The exception of the last attempt if every attempt fails.
    """
    for attempt in range(1, attempts + 1):
        try:
            return pcp.get_compounds(drug_name, "name")
        except Exception:
            # The failure type is not known here (HTTP error, timeout, ...),
            # so retry on anything but never swallow the final failure.
            if attempt == attempts:
                raise
            sleep(delay)


raw_data = pd.read_csv("./data/oneil/processed_label_data.csv")
drug_list = set(raw_data['drug_a_name']).union(set(raw_data['drug_b_name']))

drug_dict = {drug_name: _fetch_compounds(drug_name) for drug_name in drug_list}

# NOTE: pickle files must only be loaded from trusted sources.
with open("./data/oneil/drug_dict.pkl", "wb") as tf:
    pickle.dump(drug_dict, tf)

# An empty result would otherwise surface below as an anonymous IndexError;
# report every unmatched drug name at once instead.
missing = sorted(str(name) for name, compounds in drug_dict.items() if not compounds)
if missing:
    raise LookupError("PubChem returned no compound for: " + ", ".join(missing))

# Keep only the first PubChem hit of each drug, reduced to its CID and SMILES.
simple_drug_dict = {
    name: {"cid": compounds[0].cid, "smiles": compounds[0].isomeric_smiles}
    for name, compounds in drug_dict.items()
}

with open("./data/oneil/simple_drug_dict.pkl", "wb") as tf:
    pickle.dump(simple_drug_dict, tf)

In [11]:
import itertools
import pickle
from rdkit import Chem, DataStructs
import pandas as pd

# Compute the pairwise structural similarity between drugs and store it as a
# square drug x drug matrix.
with open("./data/oneil/simple_drug_dict.pkl", "rb") as tf:
    simple_drug_dict = pickle.load(tf)

# Each entry is a dict with "cid" and "smiles" keys, so it is indexed by key
# directly (indexing it with [0] first raises KeyError).
for key, entry in simple_drug_dict.items():
    mol = Chem.MolFromSmiles(entry['smiles'])
    if mol is None:
        # MolFromSmiles returns None for a SMILES it cannot parse.
        raise ValueError(
            "RDKit could not parse the SMILES of drug {!r}: {!r}".format(key, entry['smiles'])
        )
    entry['RDKFingerprint'] = Chem.RDKFingerprint(mol)

drug_names = list(simple_drug_dict)

# The diagonal stays at 0.0: a drug's similarity to itself is written as 0.
simi_matrix = pd.DataFrame(0.0, index=drug_names, columns=drug_names)

# The similarity is symmetric, so compute each unordered pair once and mirror it.
for drug_a, drug_b in itertools.combinations(drug_names, 2):
    similarity = DataStructs.FingerprintSimilarity(
        simple_drug_dict[drug_a]['RDKFingerprint'],
        simple_drug_dict[drug_b]['RDKFingerprint'],
    )
    simi_matrix.at[drug_a, drug_b] = similarity
    simi_matrix.at[drug_b, drug_a] = similarity

simi_matrix.to_csv("./data/oneil/graph/drug-drug.csv")

In [None]:
'''
Drug-protein associations:
pending the DrugBank data
'''



In [8]:
"""
Cancer cell line-protein associations:
proteins affected by mutations in CCLE cancer cell lines
"""
import pandas as pd

# Only loads the table for now; no further processing is done in this cell.
mutation_data = pd.read_csv("./data/OmicsSomaticMutationsMatrixDamaging.csv")




In [None]:
"""
Protein-protein interactions
"""

In [6]:
from tqdm import tqdm

# Drug node features: molecular descriptors computed with rdkit.

import pickle
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
import pandas as pd

with open("./data/oneil/simple_drug_dict.pkl", 'rb') as tf:
    simple_drug_dict = pickle.load(tf)

# Descriptors._descList holds one entry per descriptor, with the name first.
# There were 209 of them when this was first written, but the count is taken
# from the list itself below, because a hard-coded 209 makes the DataFrame
# construction fail whenever the list has a different length.
all_tuple = Descriptors._descList
descList = [entry[0] for entry in all_tuple]

calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descList)

drug_feature = []
for key in tqdm(simple_drug_dict.keys(), desc="calculating mol desc"):
    smiles = simple_drug_dict[key]['smiles']
    desc = calculator.CalcDescriptors(Chem.MolFromSmiles(smiles))
    simple_drug_dict[key]['desc'] = desc
    drug_feature.append(list(desc))

# One "feature_<i>" column per descriptor, in the order of descList.
feature_columns = ["feature_" + str(i) for i in range(len(descList))]
drug_matrix = pd.DataFrame(index=simple_drug_dict.keys(), columns=feature_columns, data=drug_feature)

drug_matrix.to_csv("./data/oneil/graph/node_drug.csv")

calculating mol desc: 100%|██████████| 38/38 [00:00<00:00, 73.34it/s]


In [14]:
import pandas as pd

# Cell-line names from our own feature table, with every "-" removed,
# de-duplicated and sorted into a single-column DataFrame.
our_f_cell = pd.read_csv("./data/ous/Feature_CELL.csv")
normalized_names = {name.replace("-", "") for name in our_f_cell['Cell_Line_Name']}
our_cell = pd.DataFrame(sorted(normalized_names))

# Distinct cell-line names that appear in the DrugCombDB combination table.
d_c_f = pd.read_csv("./data/DrugCombDB/drug_combinations.csv")
d_c_cell = set(d_c_f['cell'])

# union = our_cell.union(d_c_cell)
# inter = our_cell.intersection(d_c_cell)
# #
# print(our_cell)
# print(d_c_cell)