In [1]:
import os
import warnings

import pandas as pd
from mordred import Calculator, descriptors
from Mold2_pywrapper import Mold2
from padelpy import from_smiles
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, PandasTools
from tqdm import tqdm

In [2]:
# Install a global "ignore" filter so no Python warning is shown for the rest of the session.
# NOTE(review): this also hides deprecation and performance warnings from pandas/RDKit;
# consider narrowing the filter to specific categories if those should stay visible.
warnings.filterwarnings('ignore')

# Defining Functions

In [3]:
def save_df(df, name, output_dir="../../data/negative_datasets/negative_dataset_with_descriptors"):
    """Write ``df`` to ``<output_dir>/<name>.csv`` without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    name : str
        File name without the ``.csv`` extension.
    output_dir : str, optional
        Destination directory. Defaults to the descriptor output folder used
        throughout this notebook, so existing two-argument calls are unchanged.
    """
    # Create the destination on first use so a fresh checkout does not fail
    # only after a long descriptor calculation has already finished.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(f"{output_dir}/{name}.csv", index=False)

# Importing datasets

In [4]:
# Load the three cleaned negative datasets (PDB, Tox21, ZINC15) from the shared
# cleaned_datasets folder; the file stems differ from the variable names.
# NOTE(review): the name `pdb` shadows Python's standard-library debugger module.
pdb, tox, zinc = (
    pd.read_csv(f"../../data/negative_datasets/cleaned_datasets/{stem}_cleaned.csv")
    for stem in ("pdb", "tox21", "zinc15")
)

In [5]:
# Preview the first rows of the PDB set; the SMILES live in `clean_smiles`.
pdb.head()

Unnamed: 0,clean_smiles
0,CC1(C)O[C@H]2[C@@H]3OS(=O)(=O)O[C@@H]3CO[C@@]2...
1,CNC(=O)c1cccc(C)c1Nc1nc(N2CCN(c3ccccc3Cl)CC2)n...
2,CCNC(=O)Nc1cc2c(-c3ccnc(C)c3)ccc(C)c2cn1
3,Clc1ccccc1Nc1ccnc(Nc2ccc(-c3nnn[nH]3)cc2)n1
4,CC1=C(CCC(=O)O)c2cc3[nH]c(cc4nc(cc5[nH]c(cc1n2...


In [6]:
# Preview the first rows of the Tox21 set; `neutralized_smiles` is the column used for descriptors below.
tox.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles,stripped_salt_smiles,neutralized_smiles,inchi,duplicated
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,InChI=1S/C9H10N2O3S2/c1-2-14-6-3-4-7-8(5-6)15-...,False
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,CCN1C(=O)NC(c2ccccc2)C1=O,CCN1C(=O)NC(c2ccccc2)C1=O,InChI=1S/C11H12N2O2/c1-2-13-10(14)9(12-11(13)1...,False
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,InChI=1S/C20H32O/c1-3-20(21)13-11-18-17-9-8-14...,False
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,InChI=1S/C17H28N2O/c1-6-12-19(8-3)15(7-2)17(20...,False
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O,CC(O)(P(=O)(O)O)P(=O)(O)O,CC(O)(P(=O)(O)O)P(=O)(O)O,"InChI=1S/C2H8O7P2/c1-2(3,10(4,5)6)11(7,8)9/h3H...",True


In [7]:
# Preview the first rows of the ZINC15 set; the SMILES live in `clean_smiles`.
zinc.head()

Unnamed: 0,clean_smiles
0,Cc1c(C(=O)Nc2ccccc2C(N)=O)nnn1-c1ccc2ncccc2c1
1,C[C@H]1CCCC[C@H]1NC(=O)NC(=O)CN1CCc2ccc([N+](=...
2,Brc1ccccc1-c1nc2c3ccccc3ncn2n1
3,CC[C@H](C)NC(=O)Nc1ccnn1C1CCCC1
4,CCn1c(SCc2cc(C(=O)OC)c(C)o2)nnc1-c1ccccc1OC


# Adding Mol Column

In [8]:
# Attach an RDKit molecule column ('ROMol') to each dataset in place, built
# from that dataset's SMILES column.
for frame, smiles_column in (
    (pdb, 'clean_smiles'),
    (tox, 'neutralized_smiles'),
    (zinc, 'clean_smiles'),
):
    PandasTools.AddMoleculeColumnToFrame(frame, smilesCol=smiles_column, molCol='ROMol')

# Calculate Descriptors and Fingerprints

## Define Function

In [9]:
def _fingerprint_bit_frame(fingerprints: pd.Series) -> pd.DataFrame:
    """Expand a Series of RDKit bit vectors into one boolean ``Bit_<i>`` column per bit.

    All columns are assembled in a single DataFrame construction instead of
    being inserted into an existing frame one at a time.
    """
    num_bits = fingerprints.iloc[0].GetNumBits()
    return pd.DataFrame(
        {
            # `i=i` binds the current bit index when the lambda is created.
            f"Bit_{i}": fingerprints.map(lambda fp, i=i: fp.GetBit(i))
            for i in range(num_bits)
        }
    )


def _rdkit_descriptor_row(mol) -> pd.Series:
    """Evaluate every descriptor in ``Descriptors.descList`` for one molecule."""
    values = {}
    for descriptor_name, descriptor_function in Descriptors.descList:
        try:
            values[descriptor_name] = descriptor_function(mol)
        except Exception:
            # Best effort: a descriptor that fails for this molecule is stored
            # as missing instead of aborting the whole run.
            values[descriptor_name] = None
    return pd.Series(values)


def calculate_molecular_descriptors(df: pd.DataFrame, smiles_col: str, dataset_name: str) -> None:
    """Compute several descriptor/fingerprint families for a dataset and save each as CSV.

    For the molecules in ``df[smiles_col]`` this calculates, in order, RDKit
    fingerprints, Morgan fingerprints (radius 2), RDKit descriptors, Mordred
    descriptors (2D only, and 2D+3D), Mold2 descriptors and PaDEL descriptors.
    Each result is concatenated to the input columns and written through the
    module-level ``save_df`` helper as ``<dataset_name>_fingerprint_rdkit``,
    ``_fingerprint_morgan``, ``_rdkit``, ``_mordred_2d``, ``_mordred_2d_and_3d``,
    ``_mold2`` and ``_padel``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataset. It is not modified; the work is done on a copy with a
        fresh positional index so that every descriptor table lines up
        row-for-row with the input.
    smiles_col : str
        Name of the column holding the SMILES strings.
    dataset_name : str
        Prefix used for the output file names.

    Raises
    ------
    ValueError
        If ``df`` has no rows, or if any SMILES cannot be parsed by RDKit.
    """
    if df.empty:
        raise ValueError("df contains no molecules; nothing to calculate")

    # Work on a copy with a 0..n-1 index: the descriptor tables produced below
    # are joined column-wise, which only lines up on a positional index.
    work = df.reset_index(drop=True)

    # adding mol column (parsed once and reused by every stage below)
    PandasTools.AddMoleculeColumnToFrame(work, smilesCol=smiles_col, molCol='ROMol')

    # Fail early with a readable message instead of an opaque RDKit argument
    # error in the middle of the first fingerprint stage.
    unparsed = work.loc[work["ROMol"].isna(), smiles_col]
    if not unparsed.empty:
        raise ValueError(
            f"{len(unparsed)} SMILES in column '{smiles_col}' could not be parsed by RDKit, "
            f"e.g. {unparsed.head(5).tolist()}"
        )

    mols = work["ROMol"]
    # NOTE(review): the 'ROMol' object column is still written to every CSV, as
    # before; confirm whether downstream notebooks need it before dropping it.

    ##### RDKIT FINGERPRINTS #####

    print("Starting RDKit Fingerprints")

    rdkit_fingerprints = mols.map(lambda mol: Chem.RDKFingerprint(mol))
    df_rdkit_fingerprint = pd.concat([work, _fingerprint_bit_frame(rdkit_fingerprints)], axis=1)
    save_df(df_rdkit_fingerprint, name=f"{dataset_name}_fingerprint_rdkit")

    print("Finished RDKit Fingerprints")

    ##### MORGAN FINGERPRINT #####

    print("Starting Morgan Fingerprints")

    # Radius 2; change the radius as per your requirement.
    morgan_fingerprints = mols.map(lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2))
    df_morgan_fingerprint = pd.concat([work, _fingerprint_bit_frame(morgan_fingerprints)], axis=1)
    save_df(df_morgan_fingerprint, name=f"{dataset_name}_fingerprint_morgan")

    print("Finished Morgan Fingerprints")

    ##### RDKIT DESCRIPTORS #####

    print("Starting RDKit Descriptors")

    rdkit_descriptors = mols.apply(_rdkit_descriptor_row)
    df_rdkit_descriptors = pd.concat([work, rdkit_descriptors], axis=1)
    save_df(df_rdkit_descriptors, name=f"{dataset_name}_rdkit")

    print("Finished RDKit Descriptors")

    ##### MORDRED #####

    print("Starting Mordred")

    # One calculator restricted to 2D descriptors, one including 3D descriptors.
    # NOTE(review): the molecules are built from SMILES only; confirm that the
    # 3D descriptors are meaningful without generated conformers.
    calc_2d = Calculator(descriptors, ignore_3D=True)
    calc_2d_3d = Calculator(descriptors, ignore_3D=False)

    mordred_descriptors_2d = calc_2d.pandas(mols)
    mordred_descriptors_2d_3d = calc_2d_3d.pandas(mols)

    df_mordred_2d = pd.concat([work, mordred_descriptors_2d], axis=1)
    df_mordred_2d_3d = pd.concat([work, mordred_descriptors_2d_3d], axis=1)

    save_df(df_mordred_2d, name=f"{dataset_name}_mordred_2d")
    save_df(df_mordred_2d_3d, name=f"{dataset_name}_mordred_2d_and_3d")

    print("Finished Mordred")

    ##### MOLD 2 #####

    print("Starting Mold 2")

    mold2 = Mold2()
    mold2_descriptors = mold2.calculate(mols)
    df_mold2 = pd.concat([work, mold2_descriptors], axis=1)
    save_df(df_mold2, name=f"{dataset_name}_mold2")

    print("Finished Mold 2")

    ##### PADEL #####

    print("Starting Padel")

    # One record per input row, in input order. A molecule whose descriptors
    # cannot be calculated gets an empty record (all-missing descriptor values),
    # so rows stay aligned with `work` even when SMILES are duplicated.
    padel_records = []
    failed_smiles = []
    for smi in tqdm(work[smiles_col].tolist()):
        try:
            padel_records.append(from_smiles(smi))
        except Exception:
            # `Exception` rather than a bare except, so a kernel interrupt can
            # still stop this very long loop.
            padel_records.append({})
            failed_smiles.append(smi)

    if failed_smiles:
        print(f"PaDEL failed for {len(failed_smiles)} molecule(s); their descriptor values are left empty")

    padel_descriptors = pd.DataFrame(padel_records, index=work.index)
    df_padel = pd.concat([work, padel_descriptors], axis=1)
    save_df(df_padel, name=f"{dataset_name}_padel")

    print("Finished Padel")
    

## Run Calculation

In [46]:
# Full descriptor run for the PDB set; the output CSVs are prefixed with "pdb".
# In the recorded output below the two Mordred passes over 12,246 molecules took
# about 36 min and 60 min, and the final progress bar (presumably the PaDEL
# loop) ran for more than 38 hours.
calculate_molecular_descriptors(
    df=pdb,
    smiles_col="clean_smiles",
    dataset_name="pdb"
)

Starting RDKit Fingerprints
Finished RDKit Fingerprints
Starting Morgan Fingerprints
Finished Morgan Fingerprints
Starting RDKit Descriptors
Finished RDKit Descriptors
Starting Mordred


  2%|▏         | 247/12246 [00:38<19:17, 10.36it/s] 

[18:17:45] Explicit valence for atom # 29 C greater than permitted


  5%|▍         | 563/12246 [01:23<43:43,  4.45it/s]  

[18:18:28] Explicit valence for atom # 20 C greater than permitted


 10%|▉         | 1207/12246 [03:09<23:29,  7.83it/s]  

[18:20:17] Explicit valence for atom # 30 C greater than permitted


 11%|█         | 1295/12246 [03:26<50:21,  3.62it/s]  

[18:20:31] Explicit valence for atom # 24 C greater than permitted


 12%|█▏        | 1463/12246 [03:55<18:47,  9.56it/s]  

[18:21:03] Explicit valence for atom # 3 C greater than permitted


 14%|█▍        | 1712/12246 [04:42<25:26,  6.90it/s]  

[18:21:48] Explicit valence for atom # 4 C greater than permitted


 18%|█▊        | 2212/12246 [06:14<27:39,  6.05it/s]  

[18:23:22] Explicit valence for atom # 1 C greater than permitted


 19%|█▉        | 2300/12246 [06:33<35:28,  4.67it/s]  

[18:23:41] Explicit valence for atom # 3 C greater than permitted


 19%|█▉        | 2377/12246 [06:51<34:51,  4.72it/s]  

[18:23:57] Explicit valence for atom # 1 C greater than permitted


 24%|██▍       | 2986/12246 [08:40<14:06, 10.94it/s]  

[18:25:48] Explicit valence for atom # 2 C greater than permitted


 26%|██▋       | 3229/12246 [09:25<32:37,  4.61it/s]  

[18:26:31] Explicit valence for atom # 4 C greater than permitted


 26%|██▋       | 3231/12246 [09:25<29:11,  5.15it/s]

[18:26:31] Explicit valence for atom # 3 C greater than permitted


 27%|██▋       | 3339/12246 [09:45<47:53,  3.10it/s]

[18:26:51] Explicit valence for atom # 1 C greater than permitted


 32%|███▏      | 3867/12246 [11:11<20:56,  6.67it/s]

[18:28:18] Explicit valence for atom # 1 C greater than permitted


 35%|███▍      | 4235/12246 [12:25<49:48,  2.68it/s]  

[18:29:31] Explicit valence for atom # 13 C greater than permitted


 38%|███▊      | 4708/12246 [13:51<28:01,  4.48it/s]

[18:30:57] Explicit valence for atom # 1 C greater than permitted


 39%|███▉      | 4762/12246 [13:59<11:19, 11.02it/s]

[18:31:07] Explicit valence for atom # 26 C greater than permitted


 42%|████▏     | 5104/12246 [14:57<27:24,  4.34it/s]

[18:32:04] Explicit valence for atom # 28 C greater than permitted


 49%|████▉     | 6031/12246 [17:37<12:24,  8.35it/s]  

[18:34:44] Explicit valence for atom # 1 C greater than permitted


 57%|█████▋    | 6974/12246 [20:34<07:49, 11.24it/s]

[18:37:41] Explicit valence for atom # 8 C greater than permitted


 60%|█████▉    | 7319/12246 [21:39<20:03,  4.09it/s]

[18:38:45] Explicit valence for atom # 23 C greater than permitted


 60%|██████    | 7403/12246 [21:52<20:18,  3.97it/s]

[18:39:00] Explicit valence for atom # 11 C greater than permitted


 61%|██████    | 7438/12246 [21:58<16:47,  4.77it/s]

[18:39:05] Explicit valence for atom # 11 C greater than permitted


 62%|██████▏   | 7598/12246 [22:27<14:17,  5.42it/s]

[18:39:34] Explicit valence for atom # 17 C greater than permitted


 65%|██████▌   | 8008/12246 [23:44<18:25,  3.83it/s]

[18:40:47] Explicit valence for atom # 3 C greater than permitted


 67%|██████▋   | 8252/12246 [24:22<23:23,  2.85it/s]

[18:41:30] Explicit valence for atom # 3 C greater than permitted


 68%|██████▊   | 8299/12246 [24:27<06:05, 10.81it/s]

[18:41:36] Explicit valence for atom # 1 C greater than permitted


 69%|██████▉   | 8420/12246 [24:48<10:40,  5.97it/s]

[18:41:56] Explicit valence for atom # 1 C greater than permitted


 70%|██████▉   | 8530/12246 [25:09<10:16,  6.03it/s]

[18:42:13] Explicit valence for atom # 16 C greater than permitted


 74%|███████▍  | 9120/12246 [26:57<10:42,  4.86it/s]

[18:44:00] Explicit valence for atom # 22 C greater than permitted


 82%|████████▏ | 9982/12246 [29:47<08:37,  4.37it/s]

[18:46:52] Explicit valence for atom # 3 C greater than permitted
[18:46:52] Explicit valence for atom # 1 C greater than permitted


 84%|████████▎ | 10254/12246 [30:30<03:29,  9.50it/s]

[18:47:38] Explicit valence for atom # 5 C greater than permitted


 85%|████████▌ | 10412/12246 [30:55<06:40,  4.58it/s]

[18:48:01] Explicit valence for atom # 1 C greater than permitted


 85%|████████▌ | 10458/12246 [31:04<04:38,  6.42it/s]

[18:48:12] Explicit valence for atom # 1 C greater than permitted


 86%|████████▌ | 10475/12246 [31:07<05:07,  5.75it/s]

[18:48:15] Explicit valence for atom # 7 C greater than permitted


 86%|████████▌ | 10514/12246 [31:18<09:14,  3.12it/s]

[18:48:22] Explicit valence for atom # 7 C greater than permitted


 87%|████████▋ | 10715/12246 [31:52<03:45,  6.80it/s]

[18:48:59] Explicit valence for atom # 3 C greater than permitted


 88%|████████▊ | 10766/12246 [32:01<02:54,  8.50it/s]

[18:49:09] Explicit valence for atom # 15 C greater than permitted


 88%|████████▊ | 10783/12246 [32:08<11:16,  2.16it/s]

[18:49:12] Explicit valence for atom # 25 C greater than permitted


 89%|████████▉ | 10949/12246 [32:36<04:01,  5.38it/s]

[18:49:42] Explicit valence for atom # 1 C greater than permitted


 94%|█████████▎| 11474/12246 [34:07<01:38,  7.82it/s]

[18:51:14] Explicit valence for atom # 20 C greater than permitted


 94%|█████████▎| 11480/12246 [34:09<02:36,  4.89it/s]

[18:51:16] Explicit valence for atom # 1 C greater than permitted


 94%|█████████▍| 11549/12246 [34:26<01:58,  5.88it/s]

[18:51:33] Explicit valence for atom # 26 C greater than permitted


100%|██████████| 12246/12246 [36:17<00:00,  5.62it/s]
  2%|▏         | 248/12246 [00:41<18:10, 11.01it/s]  

[18:54:16] Explicit valence for atom # 29 C greater than permitted


  5%|▍         | 563/12246 [01:32<50:54,  3.82it/s]  

[18:55:07] Explicit valence for atom # 20 C greater than permitted


 10%|▉         | 1207/12246 [03:25<17:48, 10.33it/s]  

[18:57:00] Explicit valence for atom # 30 C greater than permitted


 11%|█         | 1303/12246 [03:43<32:50,  5.55it/s]  

[18:57:15] Explicit valence for atom # 24 C greater than permitted


 12%|█▏        | 1462/12246 [04:15<20:52,  8.61it/s]  

[18:57:51] Explicit valence for atom # 3 C greater than permitted


 14%|█▍        | 1712/12246 [07:08<1:56:09,  1.51it/s] 

[18:59:34] Explicit valence for atom # 4 C greater than permitted


 18%|█▊        | 2213/12246 [10:23<22:04,  7.57it/s]   

[19:03:57] Explicit valence for atom # 1 C greater than permitted


 19%|█▉        | 2301/12246 [10:41<17:00,  9.74it/s]  

[19:04:16] Explicit valence for atom # 3 C greater than permitted


 19%|█▉        | 2377/12246 [10:56<36:05,  4.56it/s]  

[19:04:30] Explicit valence for atom # 1 C greater than permitted


 24%|██▍       | 2985/12246 [12:50<16:53,  9.14it/s]  

[19:06:26] Explicit valence for atom # 2 C greater than permitted


 26%|██▋       | 3229/12246 [13:36<27:44,  5.42it/s]  

[19:07:10] Explicit valence for atom # 4 C greater than permitted


 26%|██▋       | 3232/12246 [13:39<46:56,  3.20it/s]

[19:07:11] Explicit valence for atom # 3 C greater than permitted


 27%|██▋       | 3346/12246 [13:59<22:59,  6.45it/s]

[19:07:32] Explicit valence for atom # 1 C greater than permitted


 32%|███▏      | 3868/12246 [15:24<18:14,  7.66it/s]  

[19:08:58] Explicit valence for atom # 1 C greater than permitted


 35%|███▍      | 4237/12246 [16:36<29:41,  4.50it/s]  

[19:10:10] Explicit valence for atom # 13 C greater than permitted


 38%|███▊      | 4712/12246 [18:02<19:34,  6.42it/s]  

[19:11:36] Explicit valence for atom # 1 C greater than permitted


 39%|███▉      | 4763/12246 [18:10<14:11,  8.79it/s]

[19:11:45] Explicit valence for atom # 26 C greater than permitted


 42%|████▏     | 5105/12246 [19:08<20:29,  5.81it/s]

[19:12:43] Explicit valence for atom # 28 C greater than permitted


 49%|████▉     | 6031/12246 [25:31<05:34, 18.57it/s]  

[19:19:06] Explicit valence for atom # 1 C greater than permitted


 57%|█████▋    | 6966/12246 [30:45<09:25,  9.34it/s]   

[19:24:20] Explicit valence for atom # 8 C greater than permitted


 60%|█████▉    | 7327/12246 [31:55<12:07,  6.76it/s]  

[19:25:27] Explicit valence for atom # 23 C greater than permitted


 60%|██████    | 7403/12246 [32:07<08:45,  9.22it/s]

[19:25:42] Explicit valence for atom # 11 C greater than permitted


 61%|██████    | 7438/12246 [32:13<11:39,  6.88it/s]

[19:25:47] Explicit valence for atom # 11 C greater than permitted


 62%|██████▏   | 7598/12246 [32:45<12:22,  6.26it/s]

[19:26:20] Explicit valence for atom # 17 C greater than permitted


 65%|██████▌   | 8005/12246 [34:05<19:22,  3.65it/s]

[19:27:37] Explicit valence for atom # 3 C greater than permitted


 67%|██████▋   | 8254/12246 [34:46<06:14, 10.66it/s]

[19:28:21] Explicit valence for atom # 3 C greater than permitted


 68%|██████▊   | 8299/12246 [34:52<05:26, 12.10it/s]

[19:28:28] Explicit valence for atom # 1 C greater than permitted


 69%|██████▉   | 8420/12246 [35:14<09:43,  6.56it/s]

[19:28:49] Explicit valence for atom # 1 C greater than permitted


 70%|██████▉   | 8527/12246 [35:38<22:17,  2.78it/s]

[19:29:08] Explicit valence for atom # 16 C greater than permitted


 74%|███████▍  | 9117/12246 [37:32<15:48,  3.30it/s]

[19:31:01] Explicit valence for atom # 22 C greater than permitted


 82%|████████▏ | 9987/12246 [41:36<19:11,  1.96it/s]  

[19:34:09] Explicit valence for atom # 3 C greater than permitted
[19:34:09] Explicit valence for atom # 1 C greater than permitted


 84%|████████▎ | 10254/12246 [51:46<02:52, 11.54it/s]   

[19:45:22] Explicit valence for atom # 5 C greater than permitted


 85%|████████▌ | 10416/12246 [54:51<33:05,  1.08s/it]  

[19:45:52] Explicit valence for atom # 1 C greater than permitted


 85%|████████▌ | 10460/12246 [54:59<03:07,  9.52it/s]

[19:48:35] Explicit valence for atom # 1 C greater than permitted


 86%|████████▌ | 10481/12246 [55:04<02:53, 10.15it/s]

[19:48:37] Explicit valence for atom # 7 C greater than permitted


 86%|████████▌ | 10523/12246 [55:14<05:22,  5.34it/s]

[19:48:44] Explicit valence for atom # 7 C greater than permitted


 87%|████████▋ | 10714/12246 [55:48<02:48,  9.10it/s]

[19:49:24] Explicit valence for atom # 3 C greater than permitted


 88%|████████▊ | 10770/12246 [55:57<01:41, 14.56it/s]

[19:49:33] Explicit valence for atom # 15 C greater than permitted


 88%|████████▊ | 10787/12246 [56:04<05:20,  4.55it/s]

[19:49:36] Explicit valence for atom # 25 C greater than permitted


 89%|████████▉ | 10953/12246 [56:33<01:32, 14.04it/s]

[19:50:08] Explicit valence for atom # 1 C greater than permitted


 94%|█████████▎| 11476/12246 [58:10<01:17,  9.88it/s]

[19:51:44] Explicit valence for atom # 20 C greater than permitted


 94%|█████████▍| 11485/12246 [58:14<02:48,  4.52it/s]

[19:51:47] Explicit valence for atom # 1 C greater than permitted


 94%|█████████▍| 11559/12246 [58:31<00:52, 13.06it/s]

[19:52:06] Explicit valence for atom # 26 C greater than permitted


100%|██████████| 12246/12246 [1:00:28<00:00,  3.37it/s]


Finished Mordred
Starting Mold 2
Mold2 calculates a large and diverse set of molecular descriptors encoding two-
dimensional chemical structure information. Comparative analysis of Mold2 descriptors
with those calculated from commercial software on several published datasets
demonstrated that Mold2 descriptors convey sufficient structural information. In addition,
better models were generated using Mold2 descriptors than the compared commercial
software packages. This publicly available software is developed by the Center for
Bioinformatics, which is led by Dr. Weida Tong, at the National Center for Toxicological
Research (NCTR).
    
Mold2 is a product designed and produced by the National Center for Toxicological
Research (NCTR).  FDA and NCTR retain ownership of this product.

Please address any questions or suggestions to Dr. Huixiao Hong, National Center for Toxicological
Research, at 870-543-7296 or Huixiao.Hong@fda.hhs.gov.

###################################

Should you publis

100%|██████████| 12246/12246 [38:17:37<00:00, 11.26s/it]   


Finished Padel


In [10]:
# Full descriptor run for the Tox21 set, using the neutralized SMILES; the
# output CSVs are prefixed with "tox".
# In the recorded output below the two Mordred passes over 7,506 molecules took
# about 7 min and 9 min, and the final progress bar (presumably the PaDEL loop)
# ran for almost 14 hours.
calculate_molecular_descriptors(
    df=tox,
    smiles_col="neutralized_smiles",
    dataset_name="tox"
)

Starting RDKit Fingerprints
Finished RDKit Fingerprints
Starting Morgan Fingerprints
Finished Morgan Fingerprints
Starting RDKit Descriptors
Finished RDKit Descriptors
Starting Mordred


100%|██████████| 7506/7506 [06:48<00:00, 18.37it/s]
100%|██████████| 7506/7506 [08:33<00:00, 14.62it/s]  


Finished Mordred
Starting Mold 2
Mold2 calculates a large and diverse set of molecular descriptors encoding two-
dimensional chemical structure information. Comparative analysis of Mold2 descriptors
with those calculated from commercial software on several published datasets
demonstrated that Mold2 descriptors convey sufficient structural information. In addition,
better models were generated using Mold2 descriptors than the compared commercial
software packages. This publicly available software is developed by the Center for
Bioinformatics, which is led by Dr. Weida Tong, at the National Center for Toxicological
Research (NCTR).
    
Mold2 is a product designed and produced by the National Center for Toxicological
Research (NCTR).  FDA and NCTR retain ownership of this product.

Please address any questions or suggestions to Dr. Huixiao Hong, National Center for Toxicological
Research, at 870-543-7296 or Huixiao.Hong@fda.hhs.gov.

###################################

Should you publis

100%|██████████| 7506/7506 [13:54:10<00:00,  6.67s/it]   


Finished Padel


In [11]:
# Full descriptor run for the ZINC15 set; the output CSVs are prefixed with "zinc".
# In the recorded output below the two Mordred passes over 9,971 molecules took
# about 8 min and 14 min, and the final progress bar (presumably the PaDEL loop)
# ran for more than 15 hours.
calculate_molecular_descriptors(
    df=zinc,
    smiles_col="clean_smiles",
    dataset_name="zinc"
)

Starting RDKit Fingerprints
Finished RDKit Fingerprints
Starting Morgan Fingerprints
Finished Morgan Fingerprints
Starting RDKit Descriptors
Finished RDKit Descriptors
Starting Mordred


100%|██████████| 9971/9971 [08:28<00:00, 19.61it/s]
100%|██████████| 9971/9971 [13:40<00:00, 12.15it/s]   


Finished Mordred
Starting Mold 2
Mold2 calculates a large and diverse set of molecular descriptors encoding two-
dimensional chemical structure information. Comparative analysis of Mold2 descriptors
with those calculated from commercial software on several published datasets
demonstrated that Mold2 descriptors convey sufficient structural information. In addition,
better models were generated using Mold2 descriptors than the compared commercial
software packages. This publicly available software is developed by the Center for
Bioinformatics, which is led by Dr. Weida Tong, at the National Center for Toxicological
Research (NCTR).
    
Mold2 is a product designed and produced by the National Center for Toxicological
Research (NCTR).  FDA and NCTR retain ownership of this product.

Please address any questions or suggestions to Dr. Huixiao Hong, National Center for Toxicological
Research, at 870-543-7296 or Huixiao.Hong@fda.hhs.gov.

###################################

Should you publis

100%|██████████| 9971/9971 [15:40:24<00:00,  5.66s/it]   


Finished Padel
