# Dataset report: Overlap

In [1]:
import altair as alt
alt.data_transformers.disable_max_rows()
import numpy as np
import pandas as pd

import cytoxnet.dataprep.io
import cytoxnet.dataprep.featurize

### load all of the datasets and concat and label sources

In [2]:
lunghini_fish = cytoxnet.dataprep.io.load_lunghini(species=['fish'])
lunghini_daphnia = cytoxnet.dataprep.io.load_lunghini(species=['daphnia'])
lunghini_algea = cytoxnet.dataprep.io.load_lunghini(species=['algea'])
zhu_rat = cytoxnet.dataprep.io.load_zhu_rat()
chembl_ecoli = cytoxnet.dataprep.io.load_chembl_ecoli()

In [3]:
sets_cleaned = []
locals_ = locals()
for set_name in ['lunghini_fish',
                 'lunghini_daphnia',
                 'lunghini_algea',
                 'zhu_rat',
                 'chembl_ecoli']:
    set_ = locals_[set_name]
    set_['source'] = set_name
    set_ = set_[['smiles', 'source']]
    sets_cleaned.append(set_)
dataframe = pd.concat(sets_cleaned, ignore_index=True)

In [4]:
dataframe

Unnamed: 0,smiles,source
0,BrC(=C)c1ccccc1,lunghini_fish
1,BrC(Br)Br,lunghini_fish
2,BrC(Br)C#N,lunghini_fish
3,BrC(Br)c1ccccc1C(Br)Br,lunghini_fish
4,BrC(Br)c1ccccc1OCC1CO1,lunghini_fish
...,...,...
17679,C#CCN(C)Cc1cc2cc(OCc3ccccc3)ccc2[nH]1.O=C(O)C(...,chembl_ecoli
17680,CC1CC(=O)Nc2c1c(Br)nc1nc3c(c(N)c21)CCCC3,chembl_ecoli
17681,COc1ccc(C(=N)Nc2cc(C(=O)Nc3nc(C(=O)NCCN4CCOCC4...,chembl_ecoli
17682,Cc1ccc2nc(Cl)c(/C=N/O)cc2c1,chembl_ecoli


### featurize the dataset


In [5]:
cytoxnet.dataprep.featurize.molstr_to_Mol(dataframe, 'smiles')
cytoxnet.dataprep.featurize.add_features(dataframe)

Unnamed: 0,smiles,source,Mol,CircularFingerprint
0,BrC(=C)c1ccccc1,lunghini_fish,<rdkit.Chem.rdchem.Mol object at 0x199ddb350>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,BrC(Br)Br,lunghini_fish,<rdkit.Chem.rdchem.Mol object at 0x199ddb3a0>,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,BrC(Br)C#N,lunghini_fish,<rdkit.Chem.rdchem.Mol object at 0x199ddb3f0>,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,BrC(Br)c1ccccc1C(Br)Br,lunghini_fish,<rdkit.Chem.rdchem.Mol object at 0x199ddb440>,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,BrC(Br)c1ccccc1OCC1CO1,lunghini_fish,<rdkit.Chem.rdchem.Mol object at 0x199ddb490>,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
17679,C#CCN(C)Cc1cc2cc(OCc3ccccc3)ccc2[nH]1.O=C(O)C(...,chembl_ecoli,<rdkit.Chem.rdchem.Mol object at 0x1c3711c10>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
17680,CC1CC(=O)Nc2c1c(Br)nc1nc3c(c(N)c21)CCCC3,chembl_ecoli,<rdkit.Chem.rdchem.Mol object at 0x1c3711c60>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
17681,COc1ccc(C(=N)Nc2cc(C(=O)Nc3nc(C(=O)NCCN4CCOCC4...,chembl_ecoli,<rdkit.Chem.rdchem.Mol object at 0x1c3711cb0>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
17682,Cc1ccc2nc(Cl)c(/C=N/O)cc2c1,chembl_ecoli,<rdkit.Chem.rdchem.Mol object at 0x1c3711d00>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## apply UMAP

In [6]:
import umap

In [10]:
%%time
umap_model = umap.UMAP(metric = "jaccard",
                      n_neighbors = 25,
                      n_components = 2,
                      low_memory = False,
                      min_dist = 0.001)
X_umap = umap_model.fit_transform(np.vstack(dataframe['CircularFingerprint'].values))
dataframe["UMAP_0"], dataframe["UMAP_1"] = X_umap[:,0], X_umap[:,1]

  "inverse_transform will be unavailable".format(self.metric)
  "Failed to correctly find n_neighbors for some samples."


CPU times: user 1min 8s, sys: 606 ms, total: 1min 8s
Wall time: 18.6 s


In [11]:
## remove non bitwise columns
dataset_ = dataframe[['UMAP_0', 'UMAP_1', 'source']]

In [12]:
selection = alt.selection_multi(fields=['source'], bind='legend')
alt.Chart(dataset_).mark_circle(size=60).encode(
    x='UMAP_0',
    y='UMAP_1',
    color=alt.Color('source:N', scale=alt.Scale(scheme='category20b')),
    opacity=alt.condition(selection, alt.value(1), alt.value(0.01))
).add_selection(selection)

In [None]:
import altair as alt
alt.data_transformers.disable_max_rows()
import numpy as np
import pandas as pd

import cytoxnet.dataprep.io
import cytoxnet.dataprep.featurize

In [2]:
cytoxnet.dataprep.io.create_data_codex(featurizer=['CircularFingerprint'])

In [3]:
chembl_ecoli = cytoxnet.dataprep.io.load_chembl_ecoli()

In [4]:
chembl_ecoli

Unnamed: 0_level_0,smiles,MIC
Molecule ChEMBL ID,Unnamed: 1_level_1,Unnamed: 2_level_1
CHEMBL4545569,C[C@@](CCN1Cc2cc(C#CC#CCOC(=O)N3CCC(O)CC3)cn2C...,0.50
CHEMBL4462541,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,32.00
CHEMBL4463662,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,64.00
CHEMBL4476233,CO[C@]1(C)C[C@H](O[C@H]2[C@H](C)[C@@H](O[C@@H]...,128.00
CHEMBL4593661,C[C@]1(CO)C[C@@H]1C#CC#Cc1cc2n(c1)C(=O)N(CC[C@...,0.25
...,...,...
CHEMBL2022927,C#CCN(C)Cc1cc2cc(OCc3ccccc3)ccc2[nH]1.O=C(O)C(...,32.00
CHEMBL4536413,CC1CC(=O)Nc2c1c(Br)nc1nc3c(c(N)c21)CCCC3,32.00
CHEMBL4551417,COc1ccc(C(=N)Nc2cc(C(=O)Nc3nc(C(=O)NCCN4CCOCC4...,32.00
CHEMBL4543834,Cc1ccc2nc(Cl)c(/C=N/O)cc2c1,32.00


In [5]:
cytoxnet.dataprep.io.add_dataset(chembl_ecoli, new_featurizer=['MACCSKeysFingerprint'])

./data_codex.csv


In [3]:
start = pd.DataFrame(lunghini_fish['smiles'])
start.to_csv('../../cytoxnet/data/codex.csv')

In [3]:
cytoxnet.dataprep.io.add_dataset(lunghini_algea, 'smiles')

UnsupportedOperation: write

In [11]:
lunghini_algea

Unnamed: 0,smiles,algea_EC50
4,BrC(Br)c1ccccc1OCC1CO1,0.415
7,BrC=Cc1ccccc1,50.400
10,BrCC(Br)COP(=O)(OCC(Br)CBr)OCC(Br)CBr,2.400
13,BrCC=Cc1ccccc1,18.920
14,BrCCBr,25.940
...,...,...
3672,c1ccsc1,80.000
3676,c1nc2ccccc2[nH]1,26.800
3677,c1nc2ccccc2s1,48.700
3678,c1nc[nH]n1,51.550


In [6]:
import cytoxnet.dataprep.featurize as ft
import deepchem as dc

In [11]:
def add_dataset(dataframe1, dataframe2, id_col, new_featurizer=None):
    assert id_col in dataframe1.columns, "dataframe should have `id_col`"
    
    master = dataframe2.copy()
    dataframe_ = dataframe1.copy()
    dataframe_.rename(
        columns={id_col: 'smiles'}, inplace=True
    )
    
    # first extract the non duplicate molecules
    common = dataframe_.merge(master, on='smiles')['smiles']
    uniques = pd.DataFrame(dataframe_[
        ~dataframe_['smiles'].isin(common)
    ]['smiles'])
    
    # create a mol object for these smiles
    uniques = ft.molstr_to_Mol(uniques, strcolumnID='smiles')
    
    # compute the features already in the codex for these unique values
    for col_name in master.columns:
        if hasattr(dc.feat, col_name):
            uniques = ft.add_features(uniques,
                                      MolcolumnID='Mol',
                                      method=col_name)

    # add these new values to the codex
    master = pd.concat([master, uniques], ignore_index=True)
    
    if new_featurizer is not None:
        assert all([type(f) == str for f in new_featurizer]),\
            "new_featurizer should be a list of featurizers to use"
        master = ft.molstr_to_Mol(master, strcolumnID='smiles')
        for f in new_featurizer:
            master = ft.add_features(master,
                                     MolcolumnID='Mol',
                                     method=f)
    
    return master

In [8]:
master = add_dataset(lunghini_algea, lunghini_fish, 'smiles')

In [9]:
lunghini_fish

Unnamed: 0,smiles,fish_LC50
0,BrC(=C)c1ccccc1,0.150000
1,BrC(Br)Br,23.500000
2,BrC(Br)C#N,0.550000
3,BrC(Br)c1ccccc1C(Br)Br,0.437000
4,BrC(Br)c1ccccc1OCC1CO1,1.300000
...,...,...
3672,c1ccsc1,31.000000
3674,c1csc(c1)-c1nc(no1)-c1ccccc1,0.076000
3675,c1nc(cs1)-c1nc2ccccc2[nH]1,2.980000
3677,c1nc2ccccc2s1,53.981006


In [10]:
master

Unnamed: 0,smiles,fish_LC50,Mol
0,BrC(=C)c1ccccc1,0.150,
1,BrC(Br)Br,23.500,
2,BrC(Br)C#N,0.550,
3,BrC(Br)c1ccccc1C(Br)Br,0.437,
4,BrC(Br)c1ccccc1OCC1CO1,1.300,
...,...,...,...
2949,c1ccc2c(c1)cc1ccc3cccc4ccc2c1c34,,<rdkit.Chem.rdchem.Mol object at 0x19c140ad0>
2950,c1ccc2c-3c(ccc2c1)-c1cccc2cccc-3c12,,<rdkit.Chem.rdchem.Mol object at 0x19c140b20>
2951,c1ccc2cc3c(ccc4ccccc34)cc2c1,,<rdkit.Chem.rdchem.Mol object at 0x19c140b70>
2952,c1nc2ccccc2[nH]1,,<rdkit.Chem.rdchem.Mol object at 0x19c140bc0>
