In [10]:
from rdkit import Chem
from rdkit import DataStructs
from typing import List, Tuple
from sklearn.model_selection import train_test_split

import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import pandas as pd

In [5]:
# Tell RDKit to include every kind of property when pickling, so that
# properties attached to molecules/conformers are saved along with them.
Chem.SetDefaultPickleProperties(Chem.PropertyPickleOptions.AllProps) 

In [6]:
# Seed Python's built-in RNG for reproducibility. Note that the splits generated
# further down pass their own explicit random_state to train_test_split.
random.seed(42)

In [7]:
# Root data directory (relative to the working directory); inputs are read from
# it and the generated split files are written beneath it.
data_dir_path = 'data/'

In [8]:
# Load the pickled conformer ensemble library from the raw data folder.
# NOTE: unpickling executes arbitrary code, so only load files from a trusted source.
cel_pickle_path = os.path.join(data_dir_path, 'raw', 'ccdc_generated_conf_ensemble_library.p')
with open(cel_pickle_path, 'rb') as cel_file:
    all_CEL = pickle.load(cel_file)

In [11]:
# Table of SMILES strings with their per-molecule flags (used below for filtering).
smiles_csv_path = os.path.join(data_dir_path, 'smiles_df.csv')
smiles_df = pd.read_csv(smiles_csv_path)

In [12]:
# Keep the SMILES of rows flagged as 'included' and not flagged as 'platinum'.
is_not_platinum = ~smiles_df['platinum']
is_included = smiles_df['included']
included_non_platinum_smiles = smiles_df.loc[is_not_platinum & is_included, 'smiles'].values

In [13]:
# Restrict the library's unique molecules to the included, non-platinum SMILES.
# Membership is tested against a set built once: `x in ndarray` rescans the whole
# array on every iteration, which made this filter quadratic.
included_smiles_lookup = set(included_non_platinum_smiles)
all_smiles = [
    smiles
    for smiles, _ce in all_CEL.get_unique_molecules()  # the ensemble itself is not needed here
    if smiles in included_smiles_lookup
]

In [14]:
# Number of molecules that will be distributed across the train/val/test splits.
len(all_smiles)

11095

In [28]:
# Directory (under the data directory) that receives the random split files.
random_splits_dir_name = 'random_splits'
random_splits_dir_path = os.path.join(data_dir_path, random_splits_dir_name)
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
# of testing os.path.exists() before calling os.mkdir().
os.makedirs(random_splits_dir_path, exist_ok=True)

In [30]:
# Generate several random train/val/test partitions of all_smiles and write each
# subset to its own text file (one SMILES per line) in random_splits_dir_path.
n_splits = 5
seed = 42
for i in range(n_splits):
    # Take 80% for training, then halve the remaining 20% into validation and
    # test. Both calls use the same seed for the current iteration.
    train_smiles, holdout_smiles = train_test_split(all_smiles, train_size=0.8, random_state=seed)
    val_smiles, test_smiles = train_test_split(holdout_smiles, train_size=0.5, random_state=seed)

    # One writer for all three subsets instead of three copy-pasted blocks.
    subsets = {'train': train_smiles, 'val': val_smiles, 'test': test_smiles}
    for subset_name, subset_smiles in subsets.items():
        file_name = f'{subset_name}_smiles_random_split_{i}.txt'
        with open(os.path.join(random_splits_dir_path, file_name), 'w') as f:
            f.writelines(f'{smiles}\n' for smiles in subset_smiles)

    # Each split gets its own seed (42, 43, ...) so the partitions differ.
    seed = seed + 1