In [1]:
import sys
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

from IPython.display import Markdown
from os import listdir
from os.path import exists
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.decomposition import PCA

import warnings

warnings.filterwarnings("ignore")

In [2]:
# Tab-separated file without a header row, so the columns are labelled 0, 1, 2.
# Later cells treat column 0 as the cell line and column 1 as the drug SMILES;
# column 2 is dropped before use (presumably the measured response - TODO confirm).
test = pd.read_csv(
    '../DrugCell/data_rcellminer/test_DNA.txt',
    sep='\t',
    header=None,
)

In [3]:
# Directory holding the per-term hidden-layer dumps; keep the trailing slash,
# because file names are appended to this string directly.
hidden_data_path = '../DrugCell/code/Hidden/'

# Keep only the files whose name contains "GO".
hidden = [file_name for file_name in listdir(hidden_data_path) if "GO" in file_name]

In [4]:
# Zero-initialised presence matrix: one row per distinct value of column 0
# (cell line) and one column per distinct value of column 1 (drug SMILES).
#
# Labels are taken with Series.unique(), which keeps first-appearance order.
# Building them from set(...) made the row/column order depend on Python's
# per-process string hash seed, so the later positional column picks could
# change from one kernel restart to the next.
row_labels = test[0].unique()
col_labels = test[1].unique()

table = pd.DataFrame(
    np.zeros((len(row_labels), len(col_labels)), dtype=int),
    index=row_labels,
    columns=col_labels,
)

In [5]:
# Flag every (cell line, drug) pair that occurs in `test` with a 1.
#
# Only the distinct observed pairs are visited, instead of scanning the whole
# of `test` once per matrix cell. Each flag is written with a single `.at[...]`
# call: the chained form `table[col][row] = 1` assigns into a temporary object
# under pandas Copy-on-Write and would leave `table` untouched.
observed_pairs = test[[0, 1]].dropna().drop_duplicates()

for cell_line, drug in observed_pairs.itertuples(index=False, name=None):
    table.at[cell_line, drug] = 1

In [6]:
# Rank drugs (columns) and cell lines (rows) by how many pairs they take part
# in, most frequent first. The reduction axis is spelled out explicitly:
# np.sum(DataFrame) relies on the deprecated "axis=None means axis=0" behaviour.
col_sorted = list(table.sum(axis=0).sort_values(ascending=False).index)
ind_sorted = list(table.sum(axis=1).sort_values(ascending=False).index)

# Only the columns are reordered here.
# NOTE(review): `ind_sorted` is not used in the cells visible here - confirm
# whether the rows were meant to be reordered too.
table = table.loc[:, col_sorted]

In [7]:
cols = table.columns

# Keep the cell lines (rows) that were tested with all three of the
# 2nd, 3rd and 4th most frequent drugs.
# NOTE(review): cols[0], the most frequent drug, is skipped - confirm this is intended.
anchor_drugs = [cols[1], cols[2], cols[3]]
g = table[(table[anchor_drugs] == 1).all(axis=1)]

# Then keep only the drugs whose column total over those rows is exactly 9.
# NOTE(review): 9 is presumably the number of rows left in `g`, i.e. drugs
# tested on every remaining cell line - TODO confirm and replace with len(g).
# The axis is explicit because np.sum(DataFrame) depends on deprecated
# axis=None behaviour.
g = g.loc[:, g.sum(axis=0) == 9]

In [8]:
def get_pca(hidden, hidden_data_path, indexes):
    """Reduce each hidden-layer file to its first principal component.

    For every file name in ``hidden`` the file is read as a space-separated,
    header-less table, restricted to the rows labelled ``indexes``, and passed
    through a one-component PCA fitted on those rows only.

    Parameters
    ----------
    hidden : iterable of str
        File names to process. Each name is appended directly to
        ``hidden_data_path``.
    hidden_data_path : str
        Directory prefix; it is concatenated with the file name as a plain
        string, so it must end with a path separator.
    indexes : list
        Row labels selected from every file with ``.loc``. The files are read
        without an index column, so these are 0-based row numbers.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``indexes`` (in that order, with a fresh 0..n-1
        index) and one column per file, named after the part of the file name
        before the first ``.``. An empty frame is returned when ``hidden`` is
        empty.
    """
    # Materialise once: the names are needed both for reading and for labelling.
    hidden = list(hidden)

    print("Start to calculate the PCA of each hidden layer.")
    first_components = []
    for file_name in tqdm(hidden):
        X = pd.read_csv(
            hidden_data_path + file_name,
            sep=" ",
            header=None,
        )
        X = X.loc[indexes]
        # fit_transform returns an (n_samples, 1) array; keep it as a 1-D column.
        first_components.append(PCA(n_components=1).fit_transform(X)[:, 0])

    if not first_components:
        return pd.DataFrame()

    # Assemble the frame in one step instead of re-copying it on every iteration.
    return pd.DataFrame(
        np.column_stack(first_components),
        columns=[file_name.split('.')[0] for file_name in hidden],
    )

In [9]:
# Collect the rows of `test` for every (cell line, drug) combination kept in
# `g`, in row-major order of `g`, without column 2. The original row labels of
# `test` are preserved on purpose: they are used afterwards to pick the
# matching rows out of the hidden-layer files.
#
# The pieces are gathered first and concatenated once, rather than growing the
# frame inside the nested loop.
selected_rows = [
    test[(test[0] == cell_line) & (test[1] == drug)].drop(2, axis=1)
    for cell_line in g.index
    for drug in g.columns
]
t = pd.concat(selected_rows) if selected_rows else pd.DataFrame()
t = t.drop_duplicates()

In [10]:
# First principal component of every hidden-layer file, computed over the rows
# of `test` selected in `t` (identified by their original row labels).
selected_row_labels = list(t.index)
GO_terms = get_pca(hidden, hidden_data_path, selected_row_labels)

Start to calculate the PCA of each hidden layer.


100%|██████████| 2086/2086 [00:32<00:00, 63.77it/s]


In [11]:
# Renumber the rows 0..n-1 so they line up with the rows returned by get_pca,
# and give the two remaining columns readable names.
t = t.reset_index(drop=True)
t.columns = ['Cell Line', 'SMILES']

# SMILES -> CID lookup; if a SMILES string occurs more than once in the
# mapping file, the last occurrence wins.
tmp = pd.read_csv('../data/nsc_cid_smiles.csv')
translater = dict(zip(tmp['SMILES'], tmp['CID']))

# Plain dict indexing is intentional: an unmapped SMILES raises KeyError
# instead of silently producing a missing CID.
t['CID'] = [translater[smiles] for smiles in t['SMILES']]

In [12]:
# Long-format result: one row per (sample, GO term) carrying that term's PC1.
#
# Each per-term frame is derived from `t` with assign(), so `t` itself is left
# unchanged and this cell can be re-run safely. The frames are concatenated
# once rather than growing `res` on every iteration.
per_term_frames = [
    t.assign(GO_terms=go_term, PC1=GO_terms[go_term])
    for go_term in GO_terms.columns
]

res = (
    pd.concat(per_term_frames)
    .sort_values(['CID', 'GO_terms'])
    .reset_index(drop=True)
)

In [13]:
# Display the assembled long-format table (pandas truncates the rendered rows).
res

Unnamed: 0,Cell Line,SMILES,CID,GO_terms,PC1
0,MDAMB435S_SKIN,CC1=C(C(=O)C2=C(C1=O)N3CC4C(C3(C2COC(=O)N)OC)N4)N,5702003,GO:0000038,-0.125573
1,U251MG_CENTRAL_NERVOUS_SYSTEM,CC1=C(C(=O)C2=C(C1=O)N3CC4C(C3(C2COC(=O)N)OC)N4)N,5702003,GO:0000038,-0.125573
2,OVCAR8_OVARY,CC1=C(C(=O)C2=C(C1=O)N3CC4C(C3(C2COC(=O)N)OC)N4)N,5702003,GO:0000038,-0.125573
3,786O_KIDNEY,CC1=C(C(=O)C2=C(C1=O)N3CC4C(C3(C2COC(=O)N)OC)N4)N,5702003,GO:0000038,-0.125573
4,NCIH322_LUNG,CC1=C(C(=O)C2=C(C1=O)N3CC4C(C3(C2COC(=O)N)OC)N4)N,5702003,GO:0000038,1.004581
...,...,...,...,...,...
56317,NCIH322_LUNG,C1CC1C(=O)N2CCN(CC2)C(=O)C3=C(C=CC(=C3)CC4=NNC...,23725625,GO:2001259,0.698820
56318,BT549_BREAST,C1CC1C(=O)N2CCN(CC2)C(=O)C3=C(C=CC(=C3)CC4=NNC...,23725625,GO:2001259,-0.554323
56319,NCIH226_LUNG,C1CC1C(=O)N2CCN(CC2)C(=O)C3=C(C=CC(=C3)CC4=NNC...,23725625,GO:2001259,-0.554323
56320,NCIH23_LUNG,C1CC1C(=O)N2CCN(CC2)C(=O)C3=C(C=CC(=C3)CC4=NNC...,23725625,GO:2001259,1.612866


In [14]:
# Persist the long-format table next to the notebook, without the row index.
res.to_csv(
    path_or_buf='PC1_for_27_samples.csv',
    index=False,
)