# Dataset report: Overlap

In [1]:
# Plotting library used for the interactive scatter chart at the end of the notebook.
import altair as alt
# Lift Altair's limit on the number of rows it will embed in a chart; the
# combined frame plotted below has roughly 18k rows.
alt.data_transformers.disable_max_rows()
import numpy as np
import pandas as pd

# Project helpers: dataset loading and molecular featurization.
import cytoxnet.dataprep.io
import cytoxnet.dataprep.featurize

### Load all of the datasets, label each with its source, and concatenate them

In [2]:
# Load every toxicity dataset by its package key. The keys on the right are
# listed in the same order as the variables they are unpacked into.
(lunghini_fish,
 lunghini_daphnia,
 lunghini_algea,
 zhu_rat,
 chembl_ecoli) = (
    cytoxnet.dataprep.io.load_data(dataset_key)
    for dataset_key in ('lunghini_fish_LC50',
                        'lunghini_daphnia_EC50',
                        'lunghini_algea_EC50',
                        'zhu_rat_LD50',
                        'chembl_ecoli_MIC')
)

In [3]:
# Pair each source label with its already-loaded frame explicitly instead of
# looking variables up by name through locals(): that lookup silently depends
# on the variable names and on this code running at notebook top level.
frames_by_source = {
    'lunghini_fish': lunghini_fish,
    'lunghini_daphnia': lunghini_daphnia,
    'lunghini_algea': lunghini_algea,
    'zhu_rat': zhu_rat,
    'chembl_ecoli': chembl_ecoli,
}

# Keep only the SMILES column and tag every row with its dataset of origin.
# `.assign` on a fresh slice builds a new frame, so the loaded datasets are not
# mutated and re-running this cell always yields the same result.
sets_cleaned = [
    frame[['smiles']].assign(source=source_name)
    for source_name, frame in frames_by_source.items()
]

# Stack all sources into one frame with a clean 0..N-1 index.
dataframe = pd.concat(sets_cleaned, ignore_index=True)

In [4]:
# Preview the combined frame: one row per compound, with its SMILES string and
# the dataset it came from.
dataframe

Unnamed: 0,smiles,source
0,C=C(Br)c1ccccc1,lunghini_fish
1,BrC(Br)Br,lunghini_fish
2,N#CC(Br)Br,lunghini_fish
3,BrC(Br)c1ccccc1C(Br)Br,lunghini_fish
4,BrC(Br)c1ccccc1OCC1CO1,lunghini_fish
...,...,...
18397,C#CCN(C)Cc1cc2cc(OCc3ccccc3)ccc2[nH]1.O=C(O)C(...,chembl_ecoli
18398,CC1CC(=O)Nc2c1c(Br)nc1nc3c(c(N)c21)CCCC3,chembl_ecoli
18399,COc1ccc(C(=N)Nc2cc(C(=O)Nc3nc(C(=O)NCCN4CCOCC4...,chembl_ecoli
18400,Cc1ccc2nc(Cl)c(/C=N/O)cc2c1,chembl_ecoli


### featurize the dataset


In [5]:
# Convert the SMILES strings into molecule objects.
# NOTE(review): the return value is discarded, so this relies on
# molstr_to_Mol modifying `dataframe` in place - confirm against the helper.
cytoxnet.dataprep.featurize.molstr_to_Mol(dataframe, 'smiles')
# Add featurizer output columns to the frame. The 'CircularFingerprint' column
# read further down is presumably produced here by the default featurizer -
# TODO confirm.
dataframe = cytoxnet.dataprep.featurize.add_features(dataframe)

### Apply UMAP to embed the fingerprints in 2D

In [6]:
import umap.umap_ as umap

In [7]:
# Inspect the fingerprint column: each cell holds one vector per compound.
dataframe['CircularFingerprint']

0        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4        [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
                               ...                        
18397    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
18398    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
18399    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...
18400    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
18401    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: CircularFingerprint, Length: 18402, dtype: object

In [8]:
%%time
umap_model = umap.UMAP(metric = "jaccard",
                      n_neighbors = 25,
                      n_components = 2,
                      low_memory = False,
                      min_dist = 0.001)
X_umap = umap_model.fit_transform(np.vstack(dataframe['CircularFingerprint'].values))
dataframe["UMAP_0"], dataframe["UMAP_1"] = X_umap[:,0], X_umap[:,1]

  "inverse_transform will be unavailable".format(self.metric)
  "Failed to correctly find n_neighbors for some samples."
Disconnection_distance = 1 has removed 25 edges.
It has only fully disconnected 1 vertices.
Use umap.utils.disconnected_vertices() to identify them.
  f"A few of your vertices were disconnected from the manifold.  This shouldn't cause problems.\n"


CPU times: user 1min 5s, sys: 1 s, total: 1min 6s
Wall time: 24.7 s


In [9]:
# Keep only what the chart needs - the two UMAP coordinates and the source
# label - so every other column of `dataframe` is left out of the plot data.
dataset_ = dataframe[['UMAP_0', 'UMAP_1', 'source']]

In [11]:
# Legend-bound selection: clicking a legend entry selects that source.
selection = alt.selection_multi(fields=['source'], bind='legend')

# One categorical colour per source dataset.
source_color = alt.Color('source:N', scale=alt.Scale(scheme='category20b'))
# Selected sources are drawn fully opaque; all others fade almost completely.
source_opacity = alt.condition(selection, alt.value(1), alt.value(0.01))

# Scatter of the 2D embedding; the parenthesised chart is the cell's last
# expression, so it is what gets rendered.
(
    alt.Chart(dataset_)
    .mark_circle(size=60)
    .encode(
        x='UMAP_0',
        y='UMAP_1',
        color=source_color,
        opacity=source_opacity,
    )
    .add_selection(selection)
)

