# Dataset Preparation Notebook
This notebook describes how to prepare the datasets used for both pre-training and fine-tuning of the chemical language models.  


## 1. Download Datasets

For reproducibility, the curated compound ID lists (i.e., **ChEMBL_ID**, **PubChem_CID**, or **ZINC_ID**) are provided in CSV format.  
Please retrieve the corresponding SMILES directly from the original databases using these IDs:

- **ChEMBL**: https://www.ebi.ac.uk/chembl/  
- **PubChem**: https://pubchem.ncbi.nlm.nih.gov/  
- **ZINC**: https://zinc15.docking.org/  
  
After downloading the SMILES data, standardize the molecular representations using RDKit canonicalization before saving.  
An example function for canonicalization is shown below (it requires `from rdkit import Chem`):
```python
def canonicalize_smiles(smi: str):
    """Return the RDKit canonical SMILES for ``smi``, or None if RDKit cannot parse it."""
    parsed = Chem.MolFromSmiles(smi)
    # MolFromSmiles yields None for input it cannot parse; pass that through.
    return None if parsed is None else Chem.MolToSmiles(parsed)
```
Save the canonicalized SMILES in **TSV** format with a column named `rdkit_smiles`, and place the data in the following directories:

- **Pre-training datasets** → `data/pretrain/sampled_datasets/`  
- **Fine-tuning datasets** → `data/finetune/target_actives/`  

## 2. Pre-training Datasets

In [None]:
import sys
sys.path.append('../')
from src.paths import DATA_DIR
# Base names (without the .tsv extension) of the sampled pre-training datasets.
dataset_list = [
    'pubchem_filtered_ac',
    'pubchem_unfiltered_ac',
    'pubchem_inac',
    'chembl_filtered',
    'chembl_unfiltered',
    'zinc',
]

In [None]:
# Average SMILES string length (in characters) for each pre-training dataset
import numpy as np
import pandas as pd

rows = []
for dataset in dataset_list:
    df = pd.read_table(f'{DATA_DIR}/pretrain/sampled_datasets/{dataset}.tsv')
    # Drop missing entries first: an empty cell is read back as NaN (a float),
    # and len() on it would raise TypeError.
    smiles_list = df['rdkit_smiles'].dropna()
    # Guard the empty case: np.mean([]) returns NaN and emits a RuntimeWarning.
    avg_length = np.mean([len(smi) for smi in smiles_list]) if len(smiles_list) > 0 else 0.0
    rows.append(avg_length)

# One row per dataset; transposed so the datasets become the columns of the displayed table.
avg_len_df = pd.DataFrame(rows, index=dataset_list, columns=['avg_length'])
avg_len_df.T.round(1)

### 2-1. Dataset SMILES Randomization

In [None]:
import sys
sys.path.append('../')
import pandas as pd
from src.dataset_curation import SmilesToRandomSmiles
from src.paths import DATA_DIR

# Base names (without the .tsv extension) of the sampled pre-training datasets to randomize.
filename_list = ['pubchem_filtered_ac',
                 'pubchem_unfiltered_ac', 
                 'pubchem_inac', 
                 'chembl_filtered', 
                 'chembl_unfiltered', 
                 'zinc']

# For each dataset: load the canonical SMILES, generate randomized SMILES,
# and write the result to <DATA_DIR>/pretrain/<name>_rdsmi3.tsv.
for filename in filename_list:
    original_smiles = pd.read_table(f'{DATA_DIR}/pretrain/sampled_datasets/{filename}.tsv')
    print(f'{filename}')

    # num=3 / seed=42 are forwarded to SmilesToRandomSmiles (src.dataset_curation);
    # presumably 3 randomized variants per input with a fixed seed — TODO confirm.
    uniq_rdsmiles_list = SmilesToRandomSmiles(original_smiles['rdkit_smiles'], num=3, seed=42)
    uniq_rdsmiles_df   = pd.DataFrame(uniq_rdsmiles_list, columns=['rdkit_smiles'])
    # NOTE(review): the DataFrame index is written as an extra leading column here,
    # whereas the fine-tuning exports in this notebook pass index=False. Left as-is
    # because downstream readers of these files are not visible — confirm before changing.
    uniq_rdsmiles_df.to_csv(f'{DATA_DIR}/pretrain/{filename}_rdsmi3.tsv', sep='\t')

### 2-2. Overlaps Among Pre-training Datasets

In [None]:
import os
import pandas as pd
import numpy as np
from itertools import combinations
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
# DATA_DIR is imported here as well so the cell does not rely on an earlier cell for it.
from src.paths import DATA_DIR, FIGURES_DIR

# Human-readable labels used on the heatmap axes, keyed by dataset base name.
display_names = {
    'pubchem_filtered_ac': 'PubChem Filtered Active',
    'pubchem_unfiltered_ac': 'PubChem Unfiltered Active',
    'pubchem_inac': 'PubChem Inactive',
    'chembl_filtered': 'ChEMBL Filtered',
    'chembl_unfiltered': 'ChEMBL Unfiltered',
    'zinc': 'ZINC',
}

# Load each dataset as a set of unique, non-missing canonical SMILES.
# NOTE: `dataset_list` is expected to be defined by an earlier cell of this notebook.
smiles_dict = {}
for dataset in dataset_list:
    df = pd.read_table(f'{DATA_DIR}/pretrain/sampled_datasets/{dataset}.tsv')
    smiles_dict[dataset] = set(df['rdkit_smiles'].dropna())
    print(f"{dataset}: {len(smiles_dict[dataset])} unique SMILES")

# Initialise with zeros: constructing the frame with only index/columns and
# dtype=int produces a float64 frame filled with NaN instead of integers.
overlap_matrix = pd.DataFrame(0, index=dataset_list, columns=dataset_list, dtype=int)

# Diagonal elements (self overlap) = number of unique SMILES in the dataset
for dataset in dataset_list:
    overlap_matrix.loc[dataset, dataset] = len(smiles_dict[dataset])

# Pairwise overlaps; the pairs are materialised so tqdm knows the total and can show progress.
dataset_pairs = list(combinations(dataset_list, 2))
for dataset1, dataset2 in tqdm(dataset_pairs, desc="Computing overlaps"):
    overlap_count = len(smiles_dict[dataset1] & smiles_dict[dataset2])
    # The overlap is symmetric, so fill both triangles of the matrix.
    overlap_matrix.loc[dataset1, dataset2] = overlap_count
    overlap_matrix.loc[dataset2, dataset1] = overlap_count

# Create heatmap
plt.figure(figsize=(12, 10))
overlap_matrix_labeled = overlap_matrix.copy()
overlap_matrix_labeled.index = [display_names[d] for d in overlap_matrix.index]
overlap_matrix_labeled.columns = [display_names[d] for d in overlap_matrix.columns]

sns.heatmap(overlap_matrix_labeled, annot=True, fmt='d', cmap='Blues', 
            square=True, cbar_kws={'label': 'Number of overlapping SMILES'})
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
# Make sure the output directory exists so savefig does not fail on a fresh checkout.
os.makedirs(FIGURES_DIR, exist_ok=True)
plt.savefig(f'{FIGURES_DIR}/datasets_overlap_heatmap.png', dpi=300, bbox_inches='tight')

## 3. Fine-tuning Datasets


In [None]:
import os
import sys
sys.path.append('../')
import pandas as pd
from sklearn.model_selection import train_test_split
from src.paths import DATA_DIR
from src.dataset_curation import filter_structural_alerts, SmilesToRandomSmiles

# Output directories for the two variants of every fine-tuning dataset.
filtered_dir = f'{DATA_DIR}/finetune/filtered/'
unfiltered_dir = f'{DATA_DIR}/finetune/unfiltered/'

os.makedirs(filtered_dir, exist_ok=True)
os.makedirs(unfiltered_dir, exist_ok=True)

# ChEMBL target IDs; each has a `<ID>_actives.tsv` file under finetune/target_actives/.
f_data_list = ['CHEMBL4005', 'CHEMBL1908389', 'CHEMBL284', 'CHEMBL214', 'CHEMBL253']


def _split_and_save(smiles, out_dir, prefix, target_id):
    """Split `smiles` into train/test, write both, and write randomized train SMILES.

    Files written into `out_dir`:
      - `<prefix>-<target_id>_train.tsv`
      - `<prefix>-<target_id>_test.tsv`
      - `<prefix>-<target_id>_train_rdsmi3.tsv` (column `rd3_smiles`)

    Parameters:
        smiles: pandas Series of SMILES strings to split.
        out_dir: destination directory (must already exist).
        prefix: file-name prefix, e.g. 'filtered' or 'unfiltered'.
        target_id: target identifier used in the file names.
    """
    # NOTE(review): train_size=0.2 puts 20% of the compounds in the training split
    # and 80% in the test split — confirm this ratio is intended.
    train, test = train_test_split(smiles, train_size=0.2, random_state=42)

    # os.path.join avoids the doubled slash that plain concatenation produces
    # when `out_dir` already ends with '/'.
    stem = os.path.join(out_dir, f'{prefix}-{target_id}')
    train.to_csv(f'{stem}_train.tsv', sep='\t', index=False)
    test.to_csv(f'{stem}_test.tsv', sep='\t', index=False)

    # Randomized SMILES are generated from the training split only.
    rd3_train = SmilesToRandomSmiles(train, num=3, seed=42)
    rd3_train_df = pd.DataFrame(rd3_train, columns=['rd3_smiles'])
    rd3_train_df.to_csv(f'{stem}_train_rdsmi3.tsv', sep='\t', index=False)


for f_data in f_data_list:
    data = pd.read_table(f'{DATA_DIR}/finetune/target_actives/{f_data}_actives.tsv')

    # unfiltered fine-tuning datasets
    _split_and_save(data['rdkit_smiles'], unfiltered_dir, 'unfiltered', f_data)

    # filtered fine-tuning datasets (third return value of the filter is not used here)
    rdkit_smi_filtered, pass_rate, _ = filter_structural_alerts(data, 'rdkit_smiles')
    print(f'{f_data} pass rate after RDKit filters: {pass_rate:.3f}')

    _split_and_save(rdkit_smi_filtered['rdkit_smiles'], filtered_dir, 'filtered', f_data)
