In [2]:
import pandas as pd
import os
import sklearn
from sklearn.model_selection import KFold
import numpy as np
from pathlib import Path

## Datasets

In [15]:
#Fixed seed so the CV splits are reproducible
# Pinned to the value shown in this notebook's saved output. Drawing a fresh
# random seed on every run would make the generated split files differ each
# time the notebook is re-executed from a clean kernel.
SEED = 569490
print(f'The seed is {SEED}')

The seed is 569490


In [12]:
#Get file locations
# The project root is the parent of the current working directory, so the
# notebook must be started from a direct subfolder of the project.
PATH_ROOT = Path(os.getcwd()).absolute().parent
maccs_dir = os.path.join(PATH_ROOT, 'data', 'maccs')
path_output = os.path.join(PATH_ROOT, 'experiments', 'cv_indices')

#Used to decide the dataset to create cv's for
dataset_names = ["BBBP", "HIV", "MUV", "SIDER", "Tox21"]

#Get MACCS paths depending on which datasets you want the cv for
# A file belongs to a dataset when the part of its name before the first "_"
# equals the dataset name. os.listdir returns entries in arbitrary order, so
# the paths are emitted in dataset_names order (and alphabetically within a
# dataset) to keep the list deterministic and aligned with dataset_names.
maccs_files = sorted(os.listdir(maccs_dir))
maccs_path = [
    os.path.join(maccs_dir, fname)
    for name in dataset_names
    for fname in maccs_files
    if fname.split("_")[0] == name
]

# One output column per CV fold: cv1 ... cv5.
column_names = [f'cv{i}' for i in range(1, 6)]

In [16]:
#Create cv per dataset
# For every MACCS file two CSVs are written into path_output:
#   <dataset>_cv_train.csv and <dataset>_cv_test.csv
# Each has one column per fold (named by column_names) containing the row
# positions of the MACCS file that fall into that fold's train / test part.

def _fold_frame(index_arrays, names):
    """Place one index array per fold side by side, one column per name.

    Folds can differ in length by one row; shorter columns are padded with
    <NA>. The nullable Int64 dtype keeps the indices as integers (no float
    round-trip) and makes the padding an empty cell in the CSV.
    """
    return pd.concat(
        [pd.Series(idx, name=name, dtype=pd.Int64Dtype())
         for name, idx in zip(names, index_arrays)],
        axis=1,
    )


# Make sure the target folder exists before the first CSV is written.
os.makedirs(path_output, exist_ok=True)

for path in maccs_path:
    # Take the dataset name from the file that is actually being read (same
    # "<name>_..." rule used to select the files). The position of a path in
    # maccs_path is not guaranteed to match the position in dataset_names.
    dataset = os.path.basename(path).split("_")[0]

    #Read csv file
    df = pd.read_csv(path)

    #Create CV splits
    # KFold only needs the number of rows, so the frame is passed as is.
    # The fold count is tied to column_names so both always agree.
    kf = KFold(n_splits=len(column_names), shuffle=True, random_state=SEED)
    folds = list(kf.split(df))

    #Dataframe for each cv split (one column per fold)
    train = _fold_frame([train_idx for train_idx, _ in folds], column_names)
    test = _fold_frame([test_idx for _, test_idx in folds], column_names)

    #Create csv
    train.to_csv(os.path.join(path_output, f'{dataset}_cv_train.csv'), index=False)
    test.to_csv(os.path.join(path_output, f'{dataset}_cv_test.csv'), index=False)
    print(f"Created cv split for {dataset}.")