In [1]:
import networkx as nx
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem, rdFingerprintGenerator
from joblib import Parallel, delayed

In [2]:
# Load the tab-separated training table. The file has no header row, so the
# columns are addressed by position (0, 1, ...) in the cells below.
train_path = '../DrugCell/data_rcellminer/train_DNA.txt'
train = pd.read_csv(train_path, sep='\t', header=None)

In [3]:
pd.DataFrame(set(train[1])).reset_index().to_csv(
    '../DrugCell/data_rcellminer/drug2ind.txt', sep='\t', 
    header=None, index=None
)

In [4]:
mfp = np.array([
 np.array(
  AllChem.GetMorganFingerprintAsBitVect(
    Chem.MolFromSmiles(i), 
    useChirality=True, 
    radius=2, 
    nBits=2048
  )) for i in set(train[1])
])

In [5]:
# Persist the fingerprint matrix as plain comma-separated values, one row per
# fingerprint, without header or index columns.
fingerprint_table = pd.DataFrame(mfp)
fingerprint_table.to_csv(
    '../DrugCell/data_rcellminer/drug2fingerprint.csv',
    sep=',',
    header=None,
    index=None,
)

In [6]:
# Distinct values of column 0 of `train` (presumably the cell-line names -
# they are matched against the DrugCell cell2ind table further down), written
# as "<row index>\t<name>" with no header.
# Series.unique() keeps order of first appearance, so the assigned indices are
# identical on every run; iterating a `set` of strings would follow Python's
# per-process hash seed and reshuffle the file after each kernel restart.
pd.DataFrame(train[0].unique()).reset_index().to_csv(
    '../DrugCell/data_rcellminer/cell2ind.txt',
    header=None,
    index=None,
    sep='\t'
)

In [7]:
# Index table shipped with the original DrugCell data (tab-separated, no
# header); read positionally as column 0 and column 1.
drugcell_cell2ind_path = '../DrugCell/data/cell2ind.txt'
cell2ind = pd.read_csv(drugcell_cell2ind_path, sep='\t', header=None)

In [8]:
# Read back the cell index file written earlier in this notebook, so later
# cells follow exactly the order stored on disk.
rcellminer_cell2ind_path = '../DrugCell/data_rcellminer/cell2ind.txt'
t = pd.read_csv(rcellminer_cell2ind_path, sep='\t', header=None)

In [9]:
# Select, from the original DrugCell mutation matrix, the rows belonging to the
# names listed in column 1 of `t`, in that order.
#
# A name is translated to a row label through `cell2ind` (column 1 = name,
# column 0 = row label). The lookup table is built once instead of filtering
# `cell2ind` per name and calling int() on the resulting one-element Series,
# which pandas >= 2.1 deprecates and which fails with an unhelpful TypeError
# when a name is missing or duplicated.
if cell2ind[1].duplicated().any():
    raise ValueError('cell2ind contains duplicate names; row lookup is ambiguous')
row_of_cell = dict(zip(cell2ind[1], cell2ind[0]))

unknown_cells = [name for name in t[1] if name not in row_of_cell]
if unknown_cells:
    raise KeyError(
        f'{len(unknown_cells)} name(s) not found in cell2ind, e.g. {unknown_cells[:5]}'
    )

cell2mut = pd.read_csv(
    '../DrugCell/data/cell2mutation.txt',
    header=None,
).loc[
    # Label-based selection: assumes the default 0..n-1 row labels of the
    # header-less file coincide with the indices stored in cell2ind column 0.
    [int(row_of_cell[name]) for name in t[1]]
].reset_index(drop=True)

In [10]:
# Boolean mask over the columns of `cell2mut`: True where the column sum over
# the selected rows is non-zero. The mask is reused below to filter the gene
# table, so it must be computed on the unfiltered matrix - re-run from the cell
# that loads `cell2mut` rather than re-executing this cell on its own.
#
# The column-wise reduction is spelled out with axis=0: np.sum(DataFrame)
# forwards axis=None, which pandas >= 2.1 deprecates and will eventually turn
# into a single scalar, breaking the list(...) conversion.
none_zero_cols = list(cell2mut.sum(axis=0) != 0)

# Keep only the non-zero columns and renumber them 0..k-1.
cell2mut = cell2mut.loc[:, none_zero_cols]
cell2mut = cell2mut.set_axis(pd.RangeIndex(cell2mut.shape[1]), axis=1)

# Comma-separated (the to_csv default), no header and no index column - the
# same layout the source cell2mutation.txt is read with above.
cell2mut.to_csv(
    '../DrugCell/data_rcellminer/cell2mut.txt',
    header=None,
    index=None
)

In [11]:
# Load the original gene index table, keep the rows selected by the
# `none_zero_cols` mask, and rebuild a fresh single-column frame so the
# surviving gene names are renumbered from 0.
drugcell_gene_table = pd.read_csv(
    '../DrugCell/data/gene2ind.txt', sep='\t', header=None
)
kept_gene_names = drugcell_gene_table.loc[none_zero_cols, 1].tolist()
gene2ind = pd.DataFrame(kept_gene_names)

# The index is written on purpose: each output line is "<new index>\t<name>".
gene2ind.to_csv(
    '../DrugCell/data_rcellminer/gene2ind.txt',
    sep='\t',
    header=None,
)

In [12]:
# Display the filtered gene table (column 0 holds the gene names that were kept).
gene2ind

Unnamed: 0,0
0,AADAC
1,AADAT
2,AAK1
3,AARS
4,AASDHPPT
...,...
2724,YWHAG
2725,YWHAH
2726,ZAP70
2727,ZMPSTE24


In [13]:
# Ontology edge list from the original DrugCell data: tab-separated, no header,
# three positional columns (0, 1, 2).
ontology_path = '../DrugCell/data/drugcell_ont.txt'
graph = pd.read_csv(ontology_path, sep='\t', header=None)

In [14]:
# Rows of the ontology whose third column is the literal 'gene'.
is_gene_row = graph[2] == 'gene'
gene = graph[is_gene_row]

In [15]:
# Display the ontology rows whose third column is 'gene'.
gene

Unnamed: 0,0,1,2
3167,GO:0007005,ATG7,gene
3168,GO:0007005,NDUFS8,gene
3169,GO:0007005,GBA,gene
3170,GO:0007005,LRRK2,gene
3171,GO:0007005,TERT,gene
...,...,...,...
62915,GO:2001240,BCL2L1,gene
62916,GO:2001240,AKT1,gene
62917,GO:0008150,LYPLAL1,gene
62918,GO:0008150,ABHD11,gene


In [16]:
# Collect the 'gene' ontology rows for every gene kept in `gene2ind`, ordered
# by gene2ind first and by original row order within each gene.
#
# The rows are bucketed by gene name in one pass and concatenated once. Growing
# a DataFrame with pd.concat inside the loop re-copies everything accumulated
# so far on each iteration and rescans `gene` for every name.
rows_by_gene = {name: rows for name, rows in gene.groupby(1, sort=False)}
matched_parts = [
    rows_by_gene[name] for name in gene2ind[0] if name in rows_by_gene
]
# pd.concat refuses an empty list, so fall back to an empty frame that still
# carries the columns of `gene`.
g = pd.concat(matched_parts) if matched_parts else gene.iloc[0:0]

In [17]:
# Reduced ontology: every row tagged 'default' in the third column, followed by
# the gene rows collected in `g`.
default_rows = graph[graph[2] == 'default']
go = pd.concat([default_rows, g])

In [18]:
# Write the reduced ontology as a tab-separated file with no header and no
# index column; the index is renumbered first, as in the original pipeline.
go_renumbered = go.reset_index(drop=True)
go_renumbered.to_csv(
    '../DrugCell/data_rcellminer/go.txt',
    sep='\t',
    header=None,
    index=None,
)