# load surechembl dataset from https://ftp.ebi.ac.
# path = /pub/databases/chembl/SureChEMBL/data/map

In [1]:
import ftplib
from urllib.parse import urlparse
from tqdm import tqdm
import tarfile
import os
import shutil
import glob
import gzip
import os
import sys
import gc

current_dir = os.getcwd()
parent_parent_dir = os.path.dirname(os.path.dirname(current_dir))
src_dir = os.path.join(parent_parent_dir, 'src')
sys.path.append(src_dir)

from util import *

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
import warnings
warnings.simplefilter('ignore')
import pandas as pd
from datetime import datetime
from tqdm import tqdm

In [None]:
parsed = urlparse('https://ftp.ebi.ac.uk')
ftp = ftplib.FTP(parsed.netloc)
ftp.set_pasv('true')
ftp.login("anonymous", "aaa")

ftp.cwd('/pub/databases/chembl/SureChEMBL/data/map')
file_list = ftp.nlst(".")
files = [file for file in file_list if "txt" in file]

for i in tqdm(range(len(files))):
    parsed = urlparse('https://ftp.ebi.ac.uk')
    ftp = ftplib.FTP(parsed.netloc)
    ftp.set_pasv('true')
    ftp.login("anonymous", "aaa")
    ftp.cwd('/pub/databases/chembl/SureChEMBL/data/map')
    path = files[i]
    with open(f'../../data/raw/surechembl/{path}', 'wb') as f:
        ftp.retrbinary(f'RETR {path}', f.write)

In [None]:
source_directory = "../../data/raw/surechembl"
targz_files = glob.glob(os.path.join(source_directory, "*.txt.gz"))

for targz_file in tqdm(targz_files):
    with gzip.open(targz_file, 'rb') as f_in:
        output_file = targz_file.replace("raw", "defreezed").replace(".gz", "")
        with open(output_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

In [3]:
source_directory = "../../data/defreezed/surechembl/"
targz_files = glob.glob(os.path.join(source_directory, "*.txt"))
targz_files = sorted(targz_files)
tsv = []
output_file = "../../data/processed/surechembl/250106_surechembl_v0.tsv"

for file in targz_files:
    with open(file, "r") as f:
        for line in tqdm(f, desc=f"Processing {file}", leave=False):
            try:
                parts = line.split("\t")
                smiles = parts[1]
                date = parts[5]
                with open(output_file, "a") as out_file:
                    out_file.write(f"{smiles}\t{date}\n")
            except IndexError:
                continue

                                                                                                           

In [None]:
tsv = pd.read_csv("../../data/processed/surechembl/250106_surechembl_v0.tsv", sep="\t", header=None)

In [5]:
print(len(tsv))

373616491


In [6]:
sorted_tsv = tsv.sort_values([1]).reset_index(drop=True)

In [7]:
tsv_unique = sorted_tsv.drop_duplicates(subset=0, keep="first")
print(len(tsv_unique))
print(len(tsv)-len(tsv_unique), "DELETED!!")

23465171
350151320 DELETED!!


In [8]:
tsv_unique.head()

Unnamed: 0,0,1
0,OC1=C2C=CC=CC2=CC2=CC3=C(CC=CC3)C=C12,1962-01-30
1,N1C=CN=C(C2=NC=CC=C2)C2=C1C=CC=C2,1963-08-13
2,N1C2=CC=CC=C2C=NC=C1C1=CN=CC=C1,1963-08-13
3,C[C@H]1[C@H]2[C@H](C[C@H]3[C@@H]4CCC5C[C@@H](O...,1965-02-16
4,C[C@H]1[C@H]2[C@H](C[C@H]3[C@@H]4CC[C@@H]5C[C@...,1965-02-16


In [None]:
tsv_unique.columns = ["SMILES", "DATE"]
tsv_unique.to_csv("../../data/processed/surechembl/250106_sorted.tsv", sep="\t", index=False)

In [2]:
tsv_unique = pd.read_csv("../../data/processed/surechembl/250106_sorted.tsv", sep="\t")

In [3]:
tsv_unique.head()

Unnamed: 0,SMILES,DATE
0,OC1=C2C=CC=CC2=CC2=CC3=C(CC=CC3)C=C12,1962-01-30
1,N1C=CN=C(C2=NC=CC=C2)C2=C1C=CC=C2,1963-08-13
2,N1C2=CC=CC=C2C=NC=C1C1=CN=CC=C1,1963-08-13
3,C[C@H]1[C@H]2[C@H](C[C@H]3[C@@H]4CCC5C[C@@H](O...,1965-02-16
4,C[C@H]1[C@H]2[C@H](C[C@H]3[C@@H]4CC[C@@H]5C[C@...,1965-02-16


In [4]:
def process_smiles_date(args):
    old, date_raw = args
    try:
        date = date_raw.replace("-", "")
        date_decimal = convert_date_to_decimal(date)
        mol = Chem.MolFromSmiles(old)
        if mol is None:
            return old, date_decimal
        new = Chem.MolToSmiles(mol)
        return new, date_decimal
    except:
        return None

In [5]:
import csv
from tqdm import tqdm

In [6]:
def process_smiles_single(tsv_unique, output_csv):
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["SMILES", "Date"])

        for i in tqdm(range(len(tsv_unique)), desc="Processing"):
            result = process_smiles_date((tsv_unique.iloc[i, 0], tsv_unique.iloc[i, 1]))
            if result is not None:
                writer.writerow(result)

In [3]:
output_csv = "../../data/processed/surechembl/250106_sorted_rdkit.csv"
# process_smiles_single(tsv_unique, output_csv)

In [3]:
df_processed = pd.read_csv("../../data/processed/surechembl/250106_sorted_pcp.csv") 

In [4]:
df_processed

Unnamed: 0,SMILES,Date
0,Oc1c2ccccc2cc2cc3c(cc12)CC=CC3,1962.082
1,C1=CNc2ccccc2C(c2ccccn2)=N1,1963.616
2,C1=NC=C(c2cccnc2)Nc2ccccc21,1963.616
3,C[C@@H]1CC[C@@]2(OC1)O[C@H]1C[C@H]3[C@@H]4CCC5...,1965.129
4,C[C@@H]1CC[C@@]2(OC1)O[C@H]1C[C@H]3[C@@H]4CC[C...,1965.129
...,...,...
23465166,COCCCOc1ccc(C(=O)Nc2cc(-c3nc4ccccc4[nH]3)[nH]n...,2023.992
23465167,O=C(c1cccnc1-c1ccc(-c2n[nH]c3ncc(-c4ccc5c(c4)C...,2023.992
23465168,O=C(NCCN1CCCCC1C(=O)O)OCC1c2ccccc2-c2ccccc21,2023.992
23465169,CCN(CC)C(=O)[C@@H]1C=C2c3cccc4[nH]cc(c34)C[C@H...,2023.992


In [5]:
smiles_date = dict()
for i in tqdm(range(len(df_processed))):
    smiles = df_processed.iloc[i,0]
    date = df_processed.iloc[i,1]
    smiles_date[smiles] = date

100%|██████████| 23465171/23465171 [50:20<00:00, 7769.14it/s]  


In [6]:
pickle_dump(smiles_date, "../../data/processed/surechembl/250106_surechembl_smiles_date.pickle")