## Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
from sklearn.decomposition import PCA

from rdkit.Chem import Draw, Descriptors, rdMolDescriptors

In [9]:
HERE = Path(_dh[-1])
DATA_PATH = HERE.parent.parent.parent.parent.parent/'masters-data'/'pre_processed_datasets'

drugs_and_tox_21 = pd.read_csv(DATA_PATH/'pre_processed_tox_mold2.csv')
drugs_and_tox_21

  drugs_and_tox_21 = pd.read_csv(DATA_PATH/'pre_processed_tox_mold2.csv')


Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,2.00000,3.493810,0.000000,-1.457570
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,2.32193,0.523826,0.000000,-0.816018
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,2.32193,1.814530,0.000000,-1.453400
3,CYSTEAMINE,CHEMBL602,NCCS,1994,CYSTEAMINE HYDROCHLORIDE,Small molecule,False,True,False,False,...,0.0,1.0,0.0,0.0,0.0,0.0,0.00000,1.832480,0.000000,-0.172449
4,DIMETHYL SULFOXIDE,CHEMBL504,C[S+](C)[O-],1978,Anti-Inflammatory (topical),Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,-0.430677,0.000000,-0.172449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9396,,,,,,,,,,,...,2.0,0.0,0.0,0.0,0.0,0.0,5.64386,-0.319569,0.486486,4.133220
9397,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,4.85798,-0.900816,0.000000,4.961530
9398,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,4.70044,-0.427556,0.000000,4.607770
9399,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,4.70044,0.219736,0.260870,3.634690


In [10]:
tox_flag_columns = [
    "NR-AR",
    "NR-AR-LBD",
    "NR-AhR",
    "NR-Aromatase",
    "NR-ER",
    "NR-ER-LBD",
    "NR-PPAR-gamma",
    "SR-ARE",
    "SR-ATAD5",
    "SR-HSE",
    "SR-MMP",
    "SR-p53",
    "mol_id",
    "smiles",
    "stripped_salt_smiles",
    "neutralized_smiles",
    "inchi",
    "duplicated",
]

drugs_flag_columns = [
    "name",
    "chembl_id",
    "clean_smiles",
    "first_approval_year",
    "indication_class",
    "molecule_type",
    "withdrawn_flag",
    "therapeutic_flag",
    "polymer_flag",
    "inorganic_flag",
    "natural_product_flag",
    "parenteral",
    "topical",
    "oral",
]

In [11]:
drugs_and_tox_21[tox_flag_columns]

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles,stripped_salt_smiles,neutralized_smiles,inchi,duplicated
0,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9396,,,,,,,,0.0,,0.0,,,TOX2725,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,CCOc1nc2cccc(C(=O)O)c2n1Cc1ccc(-c2ccccc2-c2nnn...,InChI=1S/C24H20N6O3/c1-2-33-24-25-20-9-5-8-19(...,False
9397,1.0,1.0,0.0,0.0,1.0,0.0,,,0.0,0.0,,0.0,TOX2370,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,CC(=O)[C@H]1CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(...,InChI=1S/C21H30O2/c1-13(22)17-6-7-18-16-5-4-14...,False
9398,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX2371,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,C[C@]12CC[C@H]3[C@@H](CCC4=CC(=O)CC[C@@]43C)[C...,InChI=1S/C19H28O2/c1-18-9-7-13(20)11-12(18)3-4...,False
9399,1.0,1.0,0.0,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,TOX2377,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C...,InChI=1S/C18H24O2/c1-18-9-8-14-13-5-3-12(19)10...,False
