## collect splits

In [1]:
# Fetch the repository that provides the UCI_Datasets folders read by the loader below.
!git clone https://github.com/yaringal/DropoutUncertaintyExps.git

Cloning into 'DropoutUncertaintyExps'...
remote: Enumerating objects: 1222, done.[K
remote: Total 1222 (delta 0), reused 0 (delta 0), pack-reused 1222 (from 1)[K
Receiving objects: 100% (1222/1222), 5.03 MiB | 12.85 MiB/s, done.
Resolving deltas: 100% (382/382), done.
Updating files: 100% (487/487), done.


In [2]:
# The split-loading code below is based on https://github.com/yaringal/DropoutUncertaintyExps.git


import numpy as np
import pandas as pd

class SplitLoader:
    """Load a UCI dataset and iterate over its predefined train/test splits.

    Files are read from
    ``./DropoutUncertaintyExps/UCI_Datasets/<data_directory>/data/``:
    ``data.txt`` (the full table), ``index_features.txt`` and
    ``index_target.txt`` (column indices), ``n_splits.txt`` (number of splits)
    and one ``index_train_<k>.txt`` / ``index_test_<k>.txt`` pair per split.

    Iterating over an instance yields one tuple per split::

        (X_train, y_train, X_validation, y_validation,
         X_test, y_test, X_train_original, y_train_original)

    where ``*_train_original`` are the rows selected by the train index file,
    ``*_train`` are their leading 80% and ``*_validation`` the remaining rows.

    Attributes:
        X: feature columns of ``data.txt`` (2-D array).
        y: target column of ``data.txt`` (1-D array).
        n_splits: value read from ``n_splits.txt`` (as returned by
            ``np.loadtxt``).
    """

    # Fraction of each split's training rows that stays in the training set;
    # the trailing remainder becomes the validation set.
    _TRAIN_FRACTION = 0.8

    @staticmethod
    def _load_indices(path):
        """Read a text file of indices and return them as a 1-D integer array."""
        # np.loadtxt squeezes a single-value file to a 0-d array, which cannot
        # be iterated or used as an index list; atleast_1d restores the axis.
        # Values are loaded as floats first and truncated, matching int(i).
        return np.atleast_1d(np.loadtxt(path)).astype(int)

    def __init__(self, data_directory):
        """Load the data table and split count for the dataset folder `data_directory`."""
        _DATA_DIRECTORY_PATH = "./DropoutUncertaintyExps/UCI_Datasets/" + data_directory + "/data/"
        self._DATA_DIRECTORY_PATH = _DATA_DIRECTORY_PATH
        _DATA_FILE = _DATA_DIRECTORY_PATH + "data.txt"
        _INDEX_FEATURES_FILE = _DATA_DIRECTORY_PATH + "index_features.txt"
        _INDEX_TARGET_FILE = _DATA_DIRECTORY_PATH + "index_target.txt"
        _N_SPLITS_FILE = _DATA_DIRECTORY_PATH + "n_splits.txt"

        # ndmin=2 keeps a one-row table two-dimensional so column indexing works.
        data = np.loadtxt(_DATA_FILE, ndmin=2)
        index_features = self._load_indices(_INDEX_FEATURES_FILE)
        index_target = int(np.loadtxt(_INDEX_TARGET_FILE))

        self.X = data[:, index_features]
        self.y = data[:, index_target]

        self.n_splits = np.loadtxt(_N_SPLITS_FILE)
        # print name of the dataset and nsplits
        print('Dataset: ' + data_directory)
        print('Number of splits: ' + str(self.n_splits))

    def __iter__(self):
        """Iterate over the splits; see `splits_generator`."""
        yield from self.splits_generator()

    def splits_generator(self):
        """Yield the train / validation / test arrays of every split in order."""
        for split_num in range(int(self.n_splits)):
            # We load the indexes of the training and test sets
            index_train_path = self._DATA_DIRECTORY_PATH + "index_train_" + str(split_num) + ".txt"
            index_test_path = self._DATA_DIRECTORY_PATH + "index_test_" + str(split_num) + ".txt"
            print('Loading file: ' + index_train_path)
            print('Loading file: ' + index_test_path)
            index_train = self._load_indices(index_train_path)
            index_test = self._load_indices(index_test_path)

            X_train_original = self.X[index_train]
            y_train_original = self.y[index_train]
            X_test = self.X[index_test]
            y_test = self.y[index_test]

            # Carve the validation set off the tail of the training rows.
            num_training_examples = int(self._TRAIN_FRACTION * X_train_original.shape[0])
            X_train = X_train_original[:num_training_examples, :]
            y_train = y_train_original[:num_training_examples]
            X_validation = X_train_original[num_training_examples:, :]
            y_validation = y_train_original[num_training_examples:]

            # Printing the size of the training, validation and test sets
            print('Number of training examples: ' + str(X_train.shape[0]))
            print('Number of validation examples: ' + str(X_validation.shape[0]))
            print('Number of test examples: ' + str(X_test.shape[0]))
            print('Number of train_original examples: ' + str(X_train_original.shape[0]))
            yield (X_train, y_train, X_validation, y_validation,
                   X_test, y_test, X_train_original, y_train_original)
            

In [3]:
# Print the working directory; the relative paths used in this notebook resolve against it.
!pwd

/mnt/idms/home/jungadam/spring/experiment/data/uci_datasets


In [6]:
# Export every split of every listed dataset to CSV files laid out as
#   exported_splits/<dataset>/split_<k>/{X,y}_{train,validation,test}.csv

# Dataset folder names passed to SplitLoader.
UCI_NAMES = ["bostonHousing", "concrete", "energy", "kin8nm", 
             "naval-propulsion-plant", "power-plant", "protein-tertiary-structure", 
             "wine-quality-red", "yacht"]

# UCI_NAMES = ["bostonHousing"]

import os
from subprocess import call  # no longer used in this cell; kept in case other cells rely on it
import pandas as pd

EXPORTED_SPILTS_PATH = "exported_splits"
# os.makedirs replaces shelling out to `mkdir -p`: it is portable and raises
# on failure instead of returning an exit code that was never checked.
os.makedirs(EXPORTED_SPILTS_PATH, exist_ok=True)

for ds_name in UCI_NAMES:
    print("________________________________\n\n")
    DS_PATH = EXPORTED_SPILTS_PATH + "/" + ds_name
    os.makedirs(DS_PATH, exist_ok=True)

    ldr = SplitLoader(ds_name)
    for idx, split in enumerate(ldr):
        SPLIT_PATH = DS_PATH + "/split_" + str(idx)
        os.makedirs(SPLIT_PATH, exist_ok=True)

        X_train, y_train, X_validation, y_validation, \
            X_test, y_test, X_train_original, y_train_original = split

        # save to csv (the *_train_original arrays are not exported)
        arrays_to_export = {
            "X_train": X_train,
            "y_train": y_train,
            "X_validation": X_validation,
            "y_validation": y_validation,
            "X_test": X_test,
            "y_test": y_test,
        }
        for stem, values in arrays_to_export.items():
            pd.DataFrame(values).to_csv(SPLIT_PATH + "/" + stem + ".csv", index=False)

________________________________


Dataset: bostonHousing
Number of splits: 20.0
Loading file: ./DropoutUncertaintyExps/UCI_Datasets/bostonHousing/data/index_train_0.txt
Loading file: ./DropoutUncertaintyExps/UCI_Datasets/bostonHousing/data/index_test_0.txt
Number of training examples: 364
Number of validation examples: 91
Number of test examples: 51
Number of train_original examples: 455
Loading file: ./DropoutUncertaintyExps/UCI_Datasets/bostonHousing/data/index_train_1.txt
Loading file: ./DropoutUncertaintyExps/UCI_Datasets/bostonHousing/data/index_test_1.txt
Number of training examples: 364
Number of validation examples: 91
Number of test examples: 51
Number of train_original examples: 455
Loading file: ./DropoutUncertaintyExps/UCI_Datasets/bostonHousing/data/index_train_2.txt
Loading file: ./DropoutUncertaintyExps/UCI_Datasets/bostonHousing/data/index_test_2.txt
Number of training examples: 364
Number of validation examples: 91
Number of test examples: 51
Number of train_original 