# DeepChem datasets and splitting

# RandomSplitting, ScaffoldSplitting, and RandomScaffoldSplitting


In [None]:
# %tensorflow_version 1.x
# !wget -c https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# !chmod +x Miniconda3-latest-Linux-x86_64.sh
# !bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
# !conda install -y -c deepchem -c rdkit -c conda-forge -c omnia deepchem-gpu=2.3.0
# import sys
# sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [None]:
%%capture
!pip install kora
!pip install dgl
!pip install dgl-cu101
!pip install --pre dgl-cu101
!pip install dgllife
!pip install tensorflow~=2.4
!pip install deepchem
import os
import numpy as np
import tensorflow as tf
import kora.install.rdkit
import rdkit
import deepchem as dc

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
os.environ['DEEPCHEM_DATA_DIR'] = '/content/drive/MyDrive/GMLG_Research/Data/DeepChem_RandomScaffoldSplit/'
dc.utils.get_data_dir()

'/content/drive/MyDrive/GMLG_Research/Data/DeepChem_RandomScaffoldSplit/'

## RandomSplitting

In [None]:
def seed_set(seed=1):
    """Seed the NumPy and TensorFlow global random number generators.

    Parameters
    ----------
    seed: int, optional (default 1)
      Value handed to both ``np.random.seed`` and ``tf.random.set_seed``.
    """
    for apply_seed in (np.random.seed, tf.random.set_seed):
        apply_seed(seed)

In [None]:
# seed_set(213)
# tasks2, datasets2, transformers2 = dc.molnet.load_tox21(splitter=dc.splits.RandomSplitter(), reload=False)

In [None]:
# datasets2[0].ids

## Vanilla Scaffold Splitting

In [None]:
# tasks, datasets, transformers = dc.molnet.load_tox21(splitter="scaffold")

In [None]:
# os.listdir('/content/drive/MyDrive/Colab Notebooks/GNN/Data/DeepChem')

In [None]:
# datasets[0].ids

## Random Scaffold Splitting

In [None]:
# import inspect
import os
from random import Random
import random
import tempfile
import itertools
import logging
import shutil
from typing import Any, Dict, List, Iterator, Optional, Sequence, Tuple

import numpy as np
import pandas as pd

import deepchem as dc
from deepchem.data import Dataset, DiskDataset
from deepchem.utils import get_print_threshold

from deepchem.splits import Splitter

logger = logging.getLogger(__name__)

In [None]:
class RandomScaffoldSplitter(Splitter):
  """Scaffold splitter that distributes scaffold groups in a seeded random order.

  Molecules that share a Bemis-Murcko scaffold are always kept together.
  Scaffold groups larger than half the validation or test size are shuffled
  and placed first, the remaining groups are shuffled and placed after them,
  and the groups are then assigned greedily to train, validation and test.

  Parameters
  ----------
  seed: int, optional (default 0)
    Seed for the shuffle performed in `split`.
    NOTE(review): the featurized directories read elsewhere in this notebook
    are named ``RandomScaffoldSplitter_seed_<seed>``; that name appears to be
    derived from this constructor argument, so confirm before renaming it.

  Note
  ----
  This class requires RDKit to be installed.
  """

  def __init__(self, seed=0):
    self.seed = seed

  def split(self,
            dataset: Dataset,
            frac_train: float = 0.8,
            frac_valid: float = 0.1,
            frac_test: float = 0.1,
            seed: Optional[int] = 213,
            log_every_n: Optional[int] = 1000
           ) -> Tuple[List[int], List[int], List[int]]:
    """
    Splits internal compounds into train/validation/test by scaffold.

    Parameters
    ----------
    dataset: Dataset
      Dataset to be split.
    frac_train: float, optional (default 0.8)
      The fraction of data to be used for the training split.
    frac_valid: float, optional (default 0.1)
      The fraction of data to be used for the validation split.
    frac_test: float, optional (default 0.1)
      The fraction of data to be used for the test split.
    seed: int, optional (default 213)
      Ignored. It is kept only so the signature stays compatible; the seed
      given to the constructor (`self.seed`) is always used instead.
    log_every_n: int, optional (default 1000)
      Controls the logger by dictating how often logger outputs
      will be produced while scaffolds are generated. A falsy value
      (None or 0) falls back to the default of `generate_scaffolds`.

    Returns
    -------
    Tuple[List[int], List[int], List[int]]
      A tuple of train indices, valid indices, and test indices.
      Each indices is a list of integers.

    Raises
    ------
    AssertionError
      If the three fractions do not sum to 1 (to numpy's default tolerance).
    """
    # The constructor seed always wins over the `seed` argument.
    seed = self.seed
    print("Random seed in this scaffold splitting is: ", seed)

    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)

    train_size = frac_train * len(dataset)
    valid_size = frac_valid * len(dataset)
    test_size = frac_test * len(dataset)
    train_inds: List[int] = []
    valid_inds: List[int] = []
    test_inds: List[int] = []

    # Forward the logging interval; guard against None/0, which would break
    # the modulo test inside generate_scaffolds.
    if log_every_n:
      scaffold_sets = self.generate_scaffolds(dataset, log_every_n)
    else:
      scaffold_sets = self.generate_scaffolds(dataset)

    # Dedicated generator so neither the global `random` module state nor
    # the module name itself is touched.
    rng = Random(seed)

    logger.info("About to sort in scaffold sets")

    # Groups bigger than half the val/test size are ordered first (so the
    # greedy pass below tries train for them first); the rest follow.
    # Both partitions are shuffled with the same generator, big ones first.
    big_index_sets = []
    small_index_sets = []
    for index_set in scaffold_sets:
      if len(index_set) > valid_size / 2 or len(index_set) > test_size / 2:
        big_index_sets.append(index_set)
      else:
        small_index_sets.append(index_set)
    rng.shuffle(big_index_sets)
    rng.shuffle(small_index_sets)
    scaffold_sets = big_index_sets + small_index_sets

    # Greedy assignment: train while it fits, then valid while it fits,
    # everything else goes to test.
    for index_set in scaffold_sets:
      if len(train_inds) + len(index_set) <= train_size:
        train_inds += index_set
      elif len(valid_inds) + len(index_set) <= valid_size:
        valid_inds += index_set
      else:
        test_inds += index_set
    return train_inds, valid_inds, test_inds

  def generate_scaffolds(self, dataset: Dataset,
                         log_every_n: int = 1000) -> List[List[int]]:
    """Returns all scaffolds from the dataset.

    Parameters
    ----------
    dataset: Dataset
      Dataset to be split. Its `ids` are passed to `_generate_scaffold`,
      so they are expected to be SMILES strings.
    log_every_n: int, optional (default 1000)
      Controls the logger by dictating how often logger outputs
      will be produced.

    Returns
    -------
    scaffold_sets: List[List[int]]
      List of indices of each scaffold in the dataset, ordered from the
      largest group to the smallest.
    """
    scaffolds: Dict[str, List[int]] = {}
    data_len = len(dataset)

    logger.info("About to generate scaffolds")
    for ind, smiles in enumerate(dataset.ids):
      if ind % log_every_n == 0:
        logger.info("Generating scaffold %d/%d", ind, data_len)
      # Indices are appended in increasing order, so every group is
      # already sorted and needs no extra sort pass.
      scaffolds.setdefault(_generate_scaffold(smiles), []).append(ind)

    # Largest group first; ties are broken by the group's first index
    # (also descending, because of reverse=True).
    return sorted(
        scaffolds.values(),
        key=lambda inds: (len(inds), inds[0]),
        reverse=True)

def _generate_scaffold(smiles: str, include_chirality: bool = False) -> str:
  """Compute the Bemis-Murcko scaffold for a SMILES string.
  Bemis-Murcko scaffolds are described in DOI: 10.1021/jm9602928.
  They are essentially that part of the molecule consisting of
  rings and the linker atoms between them.
  Paramters
  ---------
  smiles: str
    SMILES
  include_chirality: bool, default False
    Whether to include chirality in scaffolds or not.
  Returns
  -------
  str
    The MurckScaffold SMILES from the original SMILES
  References
  ----------
  .. [1] Bemis, Guy W., and Mark A. Murcko. "The properties of known drugs.
     1. Molecular frameworks." Journal of medicinal chemistry 39.15 (1996): 2887-2893.
  Note
  ----
  This function requires RDKit to be installed.
  """
  try:
    from rdkit import Chem
    from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
  except ModuleNotFoundError:
    raise ImportError("This function requires RDKit to be installed.")

  mol = Chem.MolFromSmiles(smiles)
  scaffold = MurckoScaffoldSmiles(mol=mol, includeChirality=include_chirality)
  return scaffold

In [None]:
def copy_fn_seeded_all(seed,data):
    """Copy the three split id shards of a seeded dataset to the top of `save_dir`.

    Reads the module-level `save_dir`. For each of train/valid/test the file
    ``shard-0-ids.npy`` is copied from
    ``<save_dir>/<data>-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter_seed_<seed>/BalancingTransformer/<split>_dir/``
    to ``<save_dir>/train_smiles``, ``<save_dir>/val_smiles`` and
    ``<save_dir>/test_smiles`` respectively.

    Parameters
    ----------
    seed:
      Seed that appears in the splitter directory name (converted with str()).
    data:
      Dataset prefix of the ``-featurized`` directory (converted with str()).
    """
    splitter_dir = str(data)+ '-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter_seed_' + str(seed)
    exports = (
        ('BalancingTransformer/train_dir/shard-0-ids.npy', 'train_smiles'),
        ('BalancingTransformer/valid_dir/shard-0-ids.npy', 'val_smiles'),
        ('BalancingTransformer/test_dir/shard-0-ids.npy', 'test_smiles'),
    )
    for shard_path, target_name in exports:
        source = os.path.join(save_dir, splitter_dir, shard_path)
        shutil.copy(source, os.path.join(save_dir, target_name))

In [None]:
def copy_fn_seeded(seed):
    """Copy the three Tox21 split id shards for `seed` to the top of `save_dir`.

    Reads the module-level `save_dir`. For each of train/valid/test the file
    ``shard-0-ids.npy`` is copied from
    ``<save_dir>/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter_seed_<seed>/BalancingTransformer/<split>_dir/``
    to ``<save_dir>/train_smiles``, ``<save_dir>/val_smiles`` and
    ``<save_dir>/test_smiles`` respectively.

    Parameters
    ----------
    seed:
      Seed that appears in the splitter directory name (converted with str()).
    """
    splitter_dir = 'tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter_seed_' + str(seed)
    exports = {
        'train_smiles': 'BalancingTransformer/train_dir/shard-0-ids.npy',
        'val_smiles': 'BalancingTransformer/valid_dir/shard-0-ids.npy',
        'test_smiles': 'BalancingTransformer/test_dir/shard-0-ids.npy',
    }
    for target_name, shard_path in exports.items():
        source = os.path.join(save_dir, splitter_dir, shard_path)
        shutil.copy(source, os.path.join(save_dir, target_name))

In [None]:
def copy_fn():
    """Copy the three Tox21 split id shards (unseeded layout) to the top of `save_dir`.

    Reads the module-level `save_dir`. The source directory name is plain
    ``RandomScaffoldSplitter`` (no ``_seed_<n>`` suffix). The train/valid/test
    ``shard-0-ids.npy`` files are copied to ``<save_dir>/train_smiles``,
    ``<save_dir>/val_smiles`` and ``<save_dir>/test_smiles`` respectively.
    """
    exports = (
        ('tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy', 'train_smiles'),
        ('tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/valid_dir/shard-0-ids.npy', 'val_smiles'),
        ('tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/test_dir/shard-0-ids.npy', 'test_smiles'),
    )
    for shard_path, target_name in exports:
        shutil.copy(os.path.join(save_dir, shard_path),
                    os.path.join(save_dir, target_name))

In [None]:
# Tox21 with RandomScaffoldSplitter(seed=50): load into a seed-specific
# directory, export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 50
save_dir = os.path.join(dc.utils.get_data_dir(), "Tox21_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_tox21(
    splitter=RandomScaffoldSplitter(seed=seed),
    save_dir=save_dir,
)
copy_fn_seeded(seed)
ds50 = datasets[0].ids

In [None]:
# Tox21 with RandomScaffoldSplitter(seed=1372): load into a seed-specific
# directory, export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 1372
save_dir = os.path.join(dc.utils.get_data_dir(), "Tox21_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_tox21(
    splitter=RandomScaffoldSplitter(seed=seed),
    save_dir=save_dir,
)
copy_fn_seeded(seed)
ds1372 = datasets[0].ids

In [None]:
# Tox21 with RandomScaffoldSplitter(seed=213): load into a seed-specific
# directory, export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 213
save_dir = os.path.join(dc.utils.get_data_dir(), "Tox21_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_tox21(
    splitter=RandomScaffoldSplitter(seed=seed),
    save_dir=save_dir,
)
copy_fn_seeded(seed)
ds213 = datasets[0].ids

In [None]:
np.array_equal(ds1372,ds213)

False

In [None]:
np.array_equal(ds1372,ds50)

False

In [None]:
np.array_equal(ds213,ds50)

False

## Generating BBBP

In [None]:
# BBBP with RandomScaffoldSplitter(seed=50): load into a seed-specific
# directory, export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 50
save_dir = os.path.join(dc.utils.get_data_dir(), "BBBP_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_bbbp(
    splitter=RandomScaffoldSplitter(seed=seed),
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'bbbp')
ds50 = datasets[0].ids

In [None]:
# BBBP with RandomScaffoldSplitter(seed=1372): load into a seed-specific
# directory, export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 1372
save_dir = os.path.join(dc.utils.get_data_dir(), "BBBP_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_bbbp(
    splitter=RandomScaffoldSplitter(seed=seed),
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'bbbp')
ds1372 = datasets[0].ids

In [None]:
# BBBP with RandomScaffoldSplitter(seed=213): load into a seed-specific
# directory, export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 213
save_dir = os.path.join(dc.utils.get_data_dir(), "BBBP_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_bbbp(
    splitter=RandomScaffoldSplitter(seed=seed),
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'bbbp')
ds213 = datasets[0].ids

In [None]:
np.array_equal(ds1372,ds213)

False

In [None]:
np.array_equal(ds1372,ds50)

False

In [None]:
np.array_equal(ds213,ds50)

False

## Generating Lipophilicity

In [None]:
# Lipophilicity with RandomScaffoldSplitter(seed=50): load into a
# seed-specific directory (balancing transformer requested explicitly),
# export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 50
save_dir = os.path.join(dc.utils.get_data_dir(), "Lipo_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_lipo(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'lipo')
ds50 = datasets[0].ids

In [None]:
# Lipophilicity with RandomScaffoldSplitter(seed=1372): load into a
# seed-specific directory (balancing transformer requested explicitly),
# export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 1372
save_dir = os.path.join(dc.utils.get_data_dir(), "Lipo_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_lipo(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'lipo')
ds1372 = datasets[0].ids

In [None]:
# Lipophilicity with RandomScaffoldSplitter(seed=213): load into a
# seed-specific directory (balancing transformer requested explicitly),
# export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 213
save_dir = os.path.join(dc.utils.get_data_dir(), "Lipo_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_lipo(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'lipo')
ds213 = datasets[0].ids

In [None]:
np.array_equal(ds1372,ds213)

False

In [None]:
np.array_equal(ds1372,ds50)

False

In [None]:
np.array_equal(ds213,ds50)

False

## Generating ESOL

In [None]:
# ESOL (Delaney) with RandomScaffoldSplitter(seed=50): load into a
# seed-specific directory (balancing transformer requested explicitly),
# export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 50
save_dir = os.path.join(dc.utils.get_data_dir(), "ESOL_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_delaney(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'delaney')
ds50 = datasets[0].ids

In [None]:
# ESOL (Delaney) with RandomScaffoldSplitter(seed=1372): load into a
# seed-specific directory (balancing transformer requested explicitly),
# export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 1372
save_dir = os.path.join(dc.utils.get_data_dir(), "ESOL_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_delaney(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'delaney')
ds1372 = datasets[0].ids

In [None]:
# ESOL (Delaney) with RandomScaffoldSplitter(seed=213): load into a
# seed-specific directory (balancing transformer requested explicitly),
# export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 213
save_dir = os.path.join(dc.utils.get_data_dir(), "ESOL_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_delaney(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'delaney')
ds213 = datasets[0].ids

In [None]:
np.array_equal(ds1372,ds213)

False

In [None]:
np.array_equal(ds1372,ds50)

False

In [None]:
np.array_equal(ds213,ds50)

False

## Generating BACE

In [None]:
# BACE (classification) with RandomScaffoldSplitter(seed=50): load into a
# seed-specific directory (balancing transformer requested explicitly),
# export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 50
save_dir = os.path.join(dc.utils.get_data_dir(), "BACE_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_bace_classification(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'bace_c')
ds50 = datasets[0].ids

In [None]:
# BACE (classification) with RandomScaffoldSplitter(seed=1372): load into a
# seed-specific directory (balancing transformer requested explicitly),
# export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 1372
save_dir = os.path.join(dc.utils.get_data_dir(), "BACE_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_bace_classification(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'bace_c')
ds1372 = datasets[0].ids

In [None]:
# BACE (classification) with RandomScaffoldSplitter(seed=213): load into a
# seed-specific directory (balancing transformer requested explicitly),
# export the split id shards, and keep the ids of datasets[0]
# (presumably the training split) for the comparisons below.
seed = 213
save_dir = os.path.join(dc.utils.get_data_dir(), "BACE_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_bace_classification(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'bace_c')
ds213 = datasets[0].ids

In [None]:
np.array_equal(ds1372,ds213)

False

In [None]:
np.array_equal(ds1372,ds50)

False

In [None]:
np.array_equal(ds213,ds50)

False

## Generating ClinTox

In [None]:
# ClinTox with RandomScaffoldSplitter(seed=50): load into a seed-specific
# directory (balancing transformer requested explicitly), export the split
# id shards, and keep the ids of datasets[0] (presumably the training
# split) for the comparisons below.
seed = 50
save_dir = os.path.join(dc.utils.get_data_dir(), "ClinTox_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_clintox(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'clintox')
ds50 = datasets[0].ids

In [None]:
# ClinTox with RandomScaffoldSplitter(seed=1372): load into a seed-specific
# directory (balancing transformer requested explicitly), export the split
# id shards, and keep the ids of datasets[0] (presumably the training
# split) for the comparisons below.
seed = 1372
save_dir = os.path.join(dc.utils.get_data_dir(), "ClinTox_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_clintox(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'clintox')
ds1372 = datasets[0].ids

In [None]:
# ClinTox with RandomScaffoldSplitter(seed=213): load into a seed-specific
# directory (balancing transformer requested explicitly), export the split
# id shards, and keep the ids of datasets[0] (presumably the training
# split) for the comparisons below.
seed = 213
save_dir = os.path.join(dc.utils.get_data_dir(), "ClinTox_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_clintox(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'clintox')
ds213 = datasets[0].ids

In [None]:
np.array_equal(ds1372,ds213)

False

In [None]:
np.array_equal(ds1372,ds50)

False

In [None]:
np.array_equal(ds213,ds50)

False

### **Generating HIV**

In [None]:
# HIV with RandomScaffoldSplitter(seed=50): load into a seed-specific
# directory (balancing transformer requested explicitly), export the split
# id shards, and keep the ids of datasets[0] (presumably the training
# split) for the comparisons below.
seed = 50
save_dir = os.path.join(dc.utils.get_data_dir(), "HIV_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_hiv(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'hiv')
ds50 = datasets[0].ids

In [None]:
# HIV with RandomScaffoldSplitter(seed=1372): load into a seed-specific
# directory (balancing transformer requested explicitly), export the split
# id shards, and keep the ids of datasets[0] (presumably the training
# split) for the comparisons below.
seed = 1372
save_dir = os.path.join(dc.utils.get_data_dir(), "HIV_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_hiv(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'hiv')
ds1372 = datasets[0].ids

Random seed in this scaffold splitting is:  1372


In [None]:
# HIV with RandomScaffoldSplitter(seed=213): load into a seed-specific
# directory (balancing transformer requested explicitly), export the split
# id shards, and keep the ids of datasets[0] (presumably the training
# split) for the comparisons below.
# The alternative `seed=50` assignment that used to follow was left
# uncommented and silently overrode 213, so `ds213` would have held the
# seed-50 split; only the intended seed is assigned now.
seed = 213
save_dir = os.path.join(dc.utils.get_data_dir(), "HIV_seed_" + str(seed))
tasks, datasets, transformers = dc.molnet.load_hiv(
    splitter=RandomScaffoldSplitter(seed=seed),
    transformers=['balancing'],
    save_dir=save_dir,
)
copy_fn_seeded_all(seed,'hiv')
ds213 = datasets[0].ids

Random seed in this scaffold splitting is:  213


In [None]:
np.array_equal(ds1372,ds213)

False

In [None]:
np.array_equal(ds1372,ds50)

False

In [None]:
np.array_equal(ds213,ds50)

False

In [None]:
copy_fn_seeded(seed)

In [None]:
!ls '/content/drive/MyDrive/GMLG_Research/Data/DeepChem/Tox21_seed_50/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/'

ls: cannot access '/content/drive/MyDrive/GMLG_Research/Data/DeepChem/Tox21_seed_50/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/': No such file or directory


In [None]:
copy_fn()

FileNotFoundError: ignored

In [None]:
ls '/content/drive/MyDrive/GMLG_Research/Data/DeepChem/Tox21_seed_50/tox21-featurized/CircularFingerprint_size_1024/'

[0m[01;34mRandomScaffoldSplitter_seed_50[0m/


In [None]:
ds213[-1]

'C#CCN1CC(=O)N(COC(=O)[C@@H]2[C@@H](C=C(C)C)C2(C)C)C1=O'

In [None]:
ds50[-1]

'CC1(C)S[C@@H]2[C@H](NC(=O)COc3ccccc3)C(=O)N2[C@H]1C(=O)[O-]'

In [None]:
ds50 == ds213

array([False, False, False, ..., False, False, False])

In [None]:
copy_fn()

In [None]:
ds213.shape

(6264,)

In [None]:
len(datasets[0])

In [None]:
tasks, datasets, transformers = dc.molnet.load_tox21(splitter=RandomScaffoldSplitter(seed=1000))

In [None]:
datasets[0].ids

In [None]:
tasks, datasets, transformers = dc.molnet.load_tox21(splitter=RandomScaffoldSplitter(seed=518))

In [None]:
datasets[0].ids

In [None]:
!ls '/content/drive/MyDrive/shaed'

 1709.03741.pdf
 1709.03741v2.pdf
 1805.08905.pdf
 1805.11973.pdf
 1812.01070.pdf
 1904.01561.pdf
 1907.11223.pdf
 1-s2.0-S1359644617303598-main.pdf
 2002.03230.pdf
 2002.03244.pdf
 2005.11856.pdf
 2005.13607v3.pdf
 2011.13042.pdf
 2012.04444v1.pdf
 2012.05716.pdf
 27.pdf
 684662.full.pdf
 AML_S20_Recitation
 Applications
'Arabic Handwritten Characters Dataset CSV'
 Arabic_Handwritten_Data
'Arabic Handwritten Digits Dataset CSV'
 Brochure.gdoc
'Colab Notebooks'
'conditional_generation_of_molecules_from_disentangled_representations-Original Pdf.pdf'
'Connectionists: CFP for PhD candidates in SPIKE NEURAL NETWORK FOR ATMOSPHERIC DATA MINING.gdoc'
'Copy of tox21-global-cdf-rdkit (1).zip'
'Copy of tox21-global-cdf-rdkit (2).zip'
'Copy of tox21-global-cdf-rdkit (3).zip'
'Copy of tox21-global-cdf-rdkit.zip'
 Covid-Data-by_Mafakher
'Deep Learning for the Life Sciences Applying Deep Learning to Genomics, Microscopy, Drug Discovery, and More by Bharath Ramsundar, Peter Eastman, Patrick Walters,

In [None]:
!ls '/content/drive/MyDrive/seed_213/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer'

test_dir  train_dir  transformers.pkl  valid_dir


In [None]:
def copy():
  """Copy the three Tox21 split id shards into the current working directory.

  Reads the module-level `save_dir`. The train/valid/test ``shard-0-ids.npy``
  files under
  ``<save_dir>/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/<split>_dir/``
  are copied to ``train_set``, ``valid_set`` and ``test_set`` (relative
  paths, i.e. in the current working directory).

  The previous body used ``!cp`` / ``!mv`` with Python expressions as
  arguments; the shell receives that text verbatim and fails with a syntax
  error, so the copies are done with `shutil` instead. The net effect of each
  former cp-then-mv pair is a single copy to the target name.
  """
  exports = (
      ('tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy', 'train_set'),
      ('tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/valid_dir/shard-0-ids.npy', 'valid_set'),
      ('tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/test_dir/shard-0-ids.npy', 'test_set'),
  )
  for shard_path, target_name in exports:
    shutil.copy(os.path.join(save_dir, shard_path), target_name)

In [None]:
copy()

/bin/bash: -c: line 0: syntax error near unexpected token `('
/bin/bash: -c: line 0: `cp os.path.join(save_dir,'/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy') save_dir'


In [None]:
save_dir

'/content/drive/MyDrive/seed_213'

In [None]:
!cp '/content/drive/MyDrive/seed_213/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy' '/content'

In [None]:
save_dir = '/content/drive/MyDrive/seed_213' 

In [None]:
b='/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy'
a=os.path.join(save_dir,'tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy') 
a

'/content/drive/MyDrive/seed_213/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy'

In [None]:
save_dir

'/content/drive/MyDrive/seed_213'

In [None]:

a = os.path.join(save_dir,'tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy')

b=str(a)

b


'/content/drive/MyDrive/seed_213/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy'

In [None]:
!cp b '/content'

cp: cannot stat 'b': No such file or directory


In [None]:
!ls '/content/drive/MyDrive/seed_213/tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir'

metadata.csv.gzip  shard-0-w.npy  shard-0-y.npy
shard-0-ids.npy    shard-0-X.npy  tasks.json


In [None]:
# The shell cannot evaluate os.path.join(...); build the path in Python and
# copy with shutil (a directory destination keeps the source file name).
shutil.copy(os.path.join(save_dir,'tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy'), save_dir)

/bin/bash: -c: line 0: syntax error near unexpected token `('
/bin/bash: -c: line 0: `cp os.path.join(save_dir,'tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir+shard-0-ids.npy') save_dir'


In [None]:
# shutil.copyfile needs a full destination file path and fails on a
# directory; shutil.copy accepts a directory and keeps the file name.
shutil.copy(os.path.join(save_dir,'tox21-featurized/CircularFingerprint_size_1024/RandomScaffoldSplitter/BalancingTransformer/train_dir/shard-0-ids.npy'),'/content')

IsADirectoryError: ignored

In [None]:
import shutil

In [None]:
pwd

'/content'

In [None]:
!mkdir -p '/content/drive/MyDrive/GMLG_Research/Data/DeepChem/'