In [1]:
import networkx as nx
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem, rdFingerprintGenerator
from joblib import Parallel, delayed

In [2]:
train = pd.read_csv(
    '../DrugCell/data_rcellminer/train_rcell_wo_other.txt',
    header=None,
    sep='\t'
)

In [3]:
train

Unnamed: 0,0,1,2
0,MDAMB435S_SKIN,C1CC(=O)NC(=O)C1NC(=O)N(CCCl)N=O,0.328319
1,MALME3M_SKIN,CC1=C2[C@H](C(=O)[C@@]3([C@H](C[C@@H]4[C@]([C@...,0.663180
2,NCIH522_LUNG,CC1=CC=CC=C1[C@@H]([C@H](C(=O)O[C@H]2C[C@]3([C...,0.232331
3,NCIH226_LUNG,C=C(CN1CCC2=CC=CC=C2C1)C3=CC=C(C=C3)/C=C/C(=O)...,0.574310
4,KM12_LARGE_INTESTINE,CC1C(C(C(C(O1)O[C@H]2CC[C@@]3([C@H]4CC[C@@]5([...,-0.780455
...,...,...,...
9825,U251MG_CENTRAL_NERVOUS_SYSTEM,CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=...,0.982184
9826,OVCAR8_OVARY,CC1=C(C=CC(=C1)N(CCCl)CCCl)C=O,-0.526061
9827,RPMI8226_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,CC[C@@]1(CC2C[C@@](C3=C(CCN(C2)C1)C4=CC=CC=C4N...,0.378031
9828,COLO205_LARGE_INTESTINE,C[C@]12[C@H]([C@H](C[C@H](O1)N3C4=CC=CC=C4C5=C...,0.763095


In [4]:
pd.DataFrame(set(train[1])).reset_index().to_csv(
    '../DrugCell/data_rcellminer/drug2ind.txt', sep='\t', 
    header=None, index=None
)

In [5]:
mfp = np.array([
 np.array(
  AllChem.GetMorganFingerprintAsBitVect(
    Chem.MolFromSmiles(i), 
    useChirality=True, 
    radius=2, 
    nBits=2048
  )) for i in set(train[1])
])

In [6]:
pd.DataFrame(mfp).to_csv(
    '../DrugCell/data_rcellminer/drug2fingerprint.csv', sep=',', 
    header=None, index=None
)

In [7]:
pd.DataFrame(set(train[0])).reset_index().to_csv(
    '../DrugCell/data_rcellminer/cell2ind.txt',
    header=None,
    index=None,
    sep='\t'
)

In [8]:
pd.DataFrame(set(train[0])).reset_index()

Unnamed: 0,index,0
0,0,LOXIMVI_SKIN
1,1,MALME3M_SKIN
2,2,COLO205_LARGE_INTESTINE
3,3,HOP92_LUNG
4,4,OVCAR8_OVARY
5,5,RXF393_KIDNEY
6,6,SF539_CENTRAL_NERVOUS_SYSTEM
7,7,SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
8,8,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
9,9,OVCAR5_OVARY


In [9]:
cell2ind = pd.read_csv(
    '../DrugCell/data/cell2ind.txt',
    header=None,
    sep='\t'
)

In [10]:
t = pd.read_csv(
    '../DrugCell/data_rcellminer/cell2ind.txt',
    header=None,
    sep='\t'
)

In [11]:
cell2mut = pd.read_csv(
    '../DrugCell/data/cell2mutation.txt',
    header=None,
).loc[
    [int(cell2ind[cell2ind[1] == i][0]) for i in t[1]]
].reset_index(drop=True)

In [12]:
t

Unnamed: 0,0,1
0,0,LOXIMVI_SKIN
1,1,MALME3M_SKIN
2,2,COLO205_LARGE_INTESTINE
3,3,HOP92_LUNG
4,4,OVCAR8_OVARY
5,5,RXF393_KIDNEY
6,6,SF539_CENTRAL_NERVOUS_SYSTEM
7,7,SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
8,8,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
9,9,OVCAR5_OVARY


In [13]:
none_zero_cols = list(np.sum(cell2mut) != 0)

In [14]:
cell2mut = cell2mut.loc[:, none_zero_cols]
cell2mut = cell2mut.T.reset_index(drop=True).T
cell2mut.to_csv(
    '../DrugCell/data_rcellminer/cell2mut.txt',
    header=None,
#     sep='\t',
    index=None
)

In [15]:
gene2ind = pd.read_csv(
    '../DrugCell/data/gene2ind.txt',
    header=None,
    sep='\t'
)
gene2ind = gene2ind.loc[none_zero_cols]
pd.DataFrame(list(gene2ind[1])).to_csv(
    '../DrugCell/data_rcellminer/gene2ind.txt',
    header=None,
    sep='\t'
)

In [16]:
graph = pd.read_csv(
    '../DrugCell/data/drugcell_ont.txt',
    header=None,
    sep='\t'
)

In [17]:
gene = graph[graph[2] == 'gene']

In [18]:
g = pd.DataFrame()
for i in gene2ind[1]:
    g = pd.concat([
        g,
        gene[gene[1] == i]
    ])

In [19]:
go = pd.concat([
    graph[graph[2] == 'default'],
    g
])

In [20]:
go.reset_index(drop=True).to_csv(
    '../DrugCell/data_rcellminer/go.txt',
    header=None,
    sep='\t',
    index=None
)