In [1]:
import requests
import os
from tqdm import tqdm

In [None]:
# Dataset:
# https://github.com/deepchem/deepchem/blob/master/deepchem/molnet/load_function/zinc15_datasets.py
#
# Streams one ZINC15 archive from the DeepChem S3 bucket into the current
# working directory, showing a byte-level progress bar while it downloads.

ZINC15_URL = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/"
filename = "zinc15_10M_2D.tar.gz"  # replace with the name of the file you want to download

# Build the URL with plain string handling: os.path.join is meant for
# filesystem paths and, on Windows, joins with a backslash whenever the base
# does not already end in a slash.
url = ZINC15_URL.rstrip("/") + "/" + filename

block_size = 8192  # number of bytes requested per streamed chunk

# The response is used as a context manager so the underlying connection is
# released even if the download is interrupted (a streamed response is not
# returned to the pool until it is fully consumed or explicitly closed).
with requests.get(url, stream=True) as response:
    response.raise_for_status()  # Raise an error for bad responses

    # Get the total size of the file for tqdm (0 when the header is absent)
    total_size = int(response.headers.get('content-length', 0))

    # `total_size or None` lets tqdm fall back to an "unknown length" display
    # instead of a bar with total=0 when no Content-Length was sent.
    # Both the file and the progress bar are closed on exit, error or not.
    with open(filename, 'wb') as file, tqdm(
        total=total_size or None, unit='B', unit_scale=True, desc=filename
    ) as progress_bar:
        for chunk in response.iter_content(block_size):
            file.write(chunk)
            progress_bar.update(len(chunk))

# NOTE(review): the next cell reads 'datasets/zinc15_10M_2D.csv', so the archive
# presumably has to be extracted into datasets/ first; that step is not part of
# this notebook. Confirm the archive layout before automating it.

In [2]:
import pandas as pd

# Load the ZINC15 table from its CSV file and preview the first five rows.
csv_path = 'datasets/zinc15_10M_2D.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,smiles,zinc_id,mwt,logp,reactive,purchasable,tranche_name
0,C[C@H]1CN(C(=O)/C=C/c2cn(C)c(=O)n(C)c2=O)CCN1,ZINC000237635743,292.339,-1.083,0,20,CAAD
1,C[C@H](CC(=O)Nc1ccc([N+](=O)[O-])cc1)NC/C=C/Cl,ZINC000898432048,297.742,2.654,5,20,CFBD
2,COc1ccc(CN2C[C@H]3CC[C@@]3(N)C2)c(OC)c1Cl,ZINC001235994924,296.798,2.28,0,20,CEAD
3,C=CCCOCC(=O)N1CCC([C@@H](C)NC(=O)OC(C)(C)C)CC1,ZINC000730242738,340.464,2.731,30,20,EFED
4,CCn1cc(NC(=O)NOCCSC)ccc1=O,ZINC000279935893,271.342,1.284,0,20,CDAD


In [3]:
path = "data/smiles/"

# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when it already exists and avoids the check-then-create race.
os.makedirs(path, exist_ok=True)

# Export as txt: write only the 'smiles' column, one value per line, with no
# index column and no header row. The file path is derived from `path` so the
# directory that is created and the directory that is written to cannot drift.
smiles_file = os.path.join(path, "input.txt")
df['smiles'].to_csv(smiles_file, index=False, header=False)