In [None]:
import os
import pandas as pd

# Make sure the output directory exists before anything is written to it.
# makedirs(exist_ok=True) avoids the check-then-create race of exists() + mkdir().
os.makedirs("Datasets", exist_ok=True)

def process_dataset(url, name=None, rename_cols=None, save_cols=None):
    """
    Read a CSV dataset, optionally rename/select columns, and save it under ``Datasets/``.

    Parameters
    ----------
    url : str
        The url of the dataset of interest (anything ``pandas.read_csv`` can read).

    name : str, default=None
        The file name used to save the csv inside ``Datasets/``. Defaults to the last
        path component of ``url`` if no alternative is provided.

    rename_cols : dict, default=None
        A dictionary of column names to replace in the pandas dataframe. Uses the pandas
        default syntax of {old_name : new_name}.

    save_cols : list, default=None
        The columns to save from the dataframe, using the names *after* renaming.
        All columns are saved when this is omitted.

    Raises
    ------
    KeyError
        If ``save_cols`` names a column that is not present after renaming.
    """
    if name is None:
        name = url.split('/')[-1]  # Take the last part of the url as the default name

    # Create the output directory here so the function also works when it is
    # called without the surrounding notebook setup having run first.
    os.makedirs("Datasets", exist_ok=True)
    out_path = os.path.join("Datasets", name)

    df = pd.read_csv(url)
    if rename_cols is not None:
        df = df.rename(columns=rename_cols)
    if save_cols is not None:
        df = df[save_cols]
    df.to_csv(out_path, index=False)
    
    
# One entry per dataset to fetch. Each dict is expanded into keyword arguments
# for process_dataset(), so the keys must match its parameter names
# ("url", "name", "rename_cols", "save_cols").
datasets = [{
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv",
                "rename_cols":{"smiles" : "SMILES"},
                "save_cols":["SMILES","p_np"]
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm9.csv",
                "rename_cols":{"smiles":"SMILES",
                               "A" : "Rotational Constant A",
                               "B" : "Rotational Constant B",
                               "C" : "Rotational Constant C",
                               "mu" : "Dipole Moment",
                               "alpha" : "Isotropic Polarizability",
                               "homo" : "Energy of HOMO",
                               "lumo" : "Energy of LUMO",
                               "gap" : "HOMO-LUMO Gap",
                               "r2" : "Electronic Spatial Extent",
                               "zpve" : "ZPVE",
                               "u0" : "Internal Energy at 0K",
                               "u298" : "Internal Energy at 298.15K",
                               "h298" : "Enthalpy at 298.15K",
                               "g298" : "Free Energy at 298.15K",
                               "cv" : "Heat Capacity at 298.15K"},
                "save_cols":['SMILES',
                             'Rotational Constant A',
                             'Rotational Constant B',
                             'Rotational Constant C',
                             'Dipole Moment',
                             'Isotropic Polarizability',
                             'Energy of HOMO',
                             'Energy of LUMO',
                             'HOMO-LUMO Gap',
                             'Electronic Spatial Extent',
                             'ZPVE',
                             'Internal Energy at 0K',
                             'Internal Energy at 298.15K',
                             'Enthalpy at 298.15K',
                             'Free Energy at 298.15K',
                             'Heat Capacity at 298.15K']
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/SAMPL.csv",
                "name":"FreeSolv.csv",
                "rename_cols":{"smiles":"SMILES"},
                "save_cols":["SMILES","expt","calc"]
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv",
                "rename_cols":{"smiles":"SMILES"},
                "save_cols":["SMILES","exp"]
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv",
                "name":"BACE.csv",
                "rename_cols":{"mol":"SMILES"},
                "save_cols":["SMILES","pIC50"]
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv",
                "name":"QM8.csv",
                "rename_cols":{"smiles":"SMILES"}
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm7.csv",
                "name":"QM7.csv",
                "rename_cols":{"smiles":"SMILES"}
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz",
                "name":"TOX21.csv",
                "rename_cols":{"smiles":"SMILES"}
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/sider.csv.gz",
                "name":"SIDER.csv",
                "rename_cols":{"smiles":"SMILES"}
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/clintox.csv.gz",
                "name":"Clintox.csv",
                "rename_cols":{"smiles":"SMILES"}
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/thermosol.csv",
                "name":"Thermosol.csv",
                "rename_cols":{"smile":"SMILES",
                               "target":"Solubility"},
                "save_cols":["SMILES","Solubility"]
            },
            {
                # NOTE(review): the source is a tar archive and the column being
                # renamed is literally "hopv.csv" -- presumably the archive member
                # name ends up in the header row; confirm against the downloaded file.
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/hopv.tar.gz",
                "name":"Harvard_Organic_Photovoltaic.csv",
                "rename_cols":{"hopv.csv":"SMILES"},
                "save_cols":['SMILES',"HOMO","LUMO","electrochemical_gap","optical_gap","PCE","V_OC","J_SC","fill_factor"]
            },
            {
                "url":"https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/delaney-processed.csv",
                "name":"ESOL.csv",
                "rename_cols":{"smiles":"SMILES"}
            }
]

# Download, clean up and save every dataset described above.
for dataset_args in datasets:
    process_dataset(**dataset_args)

    


In [49]:
import requests
import tarfile

# (target directory, archive url) for each GDB tarball to download and unpack.
gdb_archives = [
    ("Datasets/QM8", "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz"),
    ("Datasets/QM7b", "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb7.tar.gz"),
    ("Datasets/QM9", "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb9.tar.gz"),
]

for target_dir, url in gdb_archives:
    # exist_ok=True makes the cell safe to re-run; the original QM9 stanza tested
    # for "Datasets/PQM9" and therefore crashed once "Datasets/QM9" existed.
    os.makedirs(target_dir, exist_ok=True)

    # Stream the download straight into tarfile ("r|gz" = non-seekable gzip stream)
    # so the archive is never held in memory or written to disk as a whole.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()  # fail on HTTP errors instead of on a confusing tar read error
        # NOTE(review): extractall() trusts the member paths stored in the archive;
        # only use it with archives from a source you trust.
        with tarfile.open(fileobj=response.raw, mode="r|gz") as archive:
            archive.extractall(path=target_dir)


ModuleNotFoundError: No module named 'requests'

In [48]:
import os 
import requests

import tarfile  # used below but not imported by this cell

# Create the target directory (and its parent) if needed; safe to re-run.
os.makedirs("Datasets/QM9", exist_ok=True)

url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb9.tar.gz"
# Stream the download straight into tarfile ("r|gz" = non-seekable gzip stream)
# and close both the HTTP response and the archive when done.
with requests.get(url, stream=True) as response:
    response.raise_for_status()  # fail on HTTP errors instead of on a confusing tar read error
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use it with archives from a source you trust.
    with tarfile.open(fileobj=response.raw, mode="r|gz") as archive:
        archive.extractall(path="Datasets/QM9")

ModuleNotFoundError: No module named 'requests'

In [27]:
import numpy as np
from scipy.io import loadmat  # this is the SciPy module that loads mat-files
import pandas as pd

# Load the QM7b MATLAB file from the extracted archive directory.
mat = loadmat(file_name='Datasets/QM7b/qm7b.mat')

In [3]:
import pandas as pd
# Read the qm9 csv straight from its url for comparison with the local copy.
test = pd.read_csv(
    filepath_or_buffer="https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm9.csv"
)

In [42]:
# Column labels of qm9_df with the second label (position 1) removed.
goods = list(qm9_df.columns)
goods.pop(1)
goods

['SMILES',
 'Rotational Constant A',
 'Rotational Constant B',
 'Rotational Constant C',
 'Dipole Moment',
 'Isotropic Polarizability',
 'Energy of HOMO',
 'Energy of LUMO',
 'HOMO-LUMO Gap',
 'Electronic Spatial Extent',
 'ZPVE',
 'Internal Energy at 0K',
 'Internal Energy at 298.15K',
 'Enthalpy at 298.15K',
 'Free Energy at 298.15K',
 'Heat Capacity at 298.15K']

In [17]:
# Every column label of `test` after the first two.
list(test.columns[2:])

['A',
 'B',
 'C',
 'mu',
 'alpha',
 'homo',
 'lumo',
 'gap',
 'r2',
 'zpve',
 'u0',
 'u298',
 'h298',
 'g298',
 'cv',
 'u0_atom',
 'u298_atom',
 'h298_atom',
 'g298_atom']

In [8]:
# Load the locally stored QM9 table.
qm9_df = pd.read_csv(filepath_or_buffer="Datasets/QM9/QM9_Data.csv")

In [21]:
# Print the column labels of both frames, one list per line, to compare them.
print(list(qm9_df.columns), list(test.columns), sep="\n")

['SMILES', 'Canonical SMILES', 'Rotational Constant A', 'Rotational Constant B', 'Rotational Constant C', 'Dipole Moment', 'Isotropic Polarizability', 'Energy of HOMO', 'Energy of LUMO', 'HOMO-LUMO Gap', 'Electronic Spatial Extent', 'ZPVE', 'Internal Energy at 0K', 'Internal Energy at 298.15K', 'Enthalpy at 298.15K', 'Free Energy at 298.15K', 'Heat Capacity at 298.15K']
['mol_id', 'smiles', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'u0', 'u298', 'h298', 'g298', 'cv', 'u0_atom', 'u298_atom', 'h298_atom', 'g298_atom']


In [18]:
# Bare expression: in a notebook cell this displays the qm9_df dataframe.
qm9_df

Unnamed: 0,SMILES,Canonical SMILES,Rotational Constant A,Rotational Constant B,Rotational Constant C,Dipole Moment,Isotropic Polarizability,Energy of HOMO,Energy of LUMO,HOMO-LUMO Gap,Electronic Spatial Extent,ZPVE,Internal Energy at 0K,Internal Energy at 298.15K,Enthalpy at 298.15K,Free Energy at 298.15K,Heat Capacity at 298.15K
0,C,C,157.71180,157.709970,157.706990,0.0000,13.21,-0.3877,0.1171,0.5048,35.3641,0.044749,-40.478930,-40.476062,-40.475117,-40.498597,6.469
1,N,N,293.60975,293.541110,191.393970,1.6256,9.46,-0.2570,0.0829,0.3399,26.1563,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316
2,O,O,799.58812,437.903860,282.945450,1.8511,6.31,-0.2928,0.0687,0.3615,19.0002,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002
3,C#C,C#C,0.00000,35.610036,35.610036,0.0000,16.28,-0.2845,0.0506,0.3351,59.5248,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574
4,C#N,C#N,0.00000,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,48.7476,0.016601,-93.411888,-93.409370,-93.408425,-93.431246,6.278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133880,C1C2C3C4C5OC14C5N23,C1[C@@H]2[C@@H]3[C@H]4[C@@H]5O[C@]14[C@@H]5N23,3.59483,2.198990,1.904230,1.6637,69.37,-0.2254,0.0588,0.2842,760.7472,0.127406,-400.633868,-400.628599,-400.627654,-400.663098,23.658
133881,C1N2C3C2C2C4OC12C34,C1N2[C@H]3[C@@H]2[C@@H]2[C@H]4O[C@@]12[C@@H]34,3.65648,2.142370,1.904390,1.2976,69.52,-0.2393,0.0608,0.3002,762.6354,0.127495,-400.629713,-400.624444,-400.623500,-400.658942,23.697
133882,C1N2C3C4C5C2C13CN45,C1N2[C@@H]3[C@@H]4[C@@H]5[C@H]2[C@]13CN45,3.67118,2.143140,1.895010,1.2480,73.60,-0.2233,0.0720,0.2953,780.3553,0.140458,-380.753918,-380.748619,-380.747675,-380.783148,23.972
133883,C1N2C3C4C5CC13C2C45,C1N2[C@H]3[C@@H]4[C@@H]5C[C@]13[C@H]2[C@H]45,3.52845,2.151310,1.865820,1.9576,77.40,-0.2122,0.0881,0.3003,803.1904,0.152222,-364.720374,-364.714974,-364.714030,-364.749650,24.796


In [19]:
# Bare expression: in a notebook cell this displays the `test` dataframe.
test

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,zpve,u0,u298,h298,g298,cv,u0_atom,u298_atom,h298_atom,g298_atom
0,gdb_1,C,157.71180,157.709970,157.706990,0.0000,13.21,-0.3877,0.1171,0.5048,...,0.044749,-40.478930,-40.476062,-40.475117,-40.498597,6.469,-395.999595,-398.643290,-401.014647,-372.471772
1,gdb_2,N,293.60975,293.541110,191.393970,1.6256,9.46,-0.2570,0.0829,0.3399,...,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316,-276.861363,-278.620271,-280.399259,-259.338802
2,gdb_3,O,799.58812,437.903860,282.945450,1.8511,6.31,-0.2928,0.0687,0.3615,...,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002,-213.087624,-213.974294,-215.159658,-201.407171
3,gdb_4,C#C,0.00000,35.610036,35.610036,0.0000,16.28,-0.2845,0.0506,0.3351,...,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574,-385.501997,-387.237686,-389.016047,-365.800724
4,gdb_5,C#N,0.00000,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,...,0.016601,-93.411888,-93.409370,-93.408425,-93.431246,6.278,-301.820534,-302.906752,-304.091489,-288.720028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133880,gdb_133881,C1C2C3C4C5OC14C5N23,3.59483,2.198990,1.904230,1.6637,69.37,-0.2254,0.0588,0.2842,...,0.127406,-400.633868,-400.628599,-400.627654,-400.663098,23.658,-1603.983913,-1614.898804,-1623.788097,-1492.819438
133881,gdb_133882,C1N2C3C2C2C4OC12C34,3.65648,2.142370,1.904390,1.2976,69.52,-0.2393,0.0608,0.3002,...,0.127495,-400.629713,-400.624444,-400.623500,-400.658942,23.697,-1601.376613,-1612.291504,-1621.181424,-1490.211511
133882,gdb_133883,C1N2C3C4C5C2C13CN45,3.67118,2.143140,1.895010,1.2480,73.60,-0.2233,0.0720,0.2953,...,0.140458,-380.753918,-380.748619,-380.747675,-380.783148,23.972,-1667.045429,-1678.830048,-1688.312964,-1549.143391
133883,gdb_133884,C1N2C3C4C5CC13C2C45,3.52845,2.151310,1.865820,1.9576,77.40,-0.2122,0.0881,0.3003,...,0.152222,-364.720374,-364.714974,-364.714030,-364.749650,24.796,-1794.600439,-1807.210860,-1817.286772,-1670.349892
