# Preparation of train and test sets for TPZ

_Authors: Andreia Dourado, Bruno Moraes_

__Description: Splitting of the training set into training (70%) and test (30%) subsets, saved as .hdf5 files in a format compatible with the TPZ algorithm.__

### 1. Imports

In [None]:
import pandas as pd
import tables_io
import h5py
import numpy as np

### 2. Reading the data:

In [None]:
# Path to the Parquet file holding the full training set, relative to this notebook.
path_data = '../dados_tcc/training_sets/training_set.parquet'
print(path_data)

In [None]:
# Load the whole training set into a DataFrame and show its (rows, columns) shape.
data_convert=pd.read_parquet(path_data)
data_convert.shape

### 3. Train file

In [None]:
# Directory prefix (relative to this notebook) under which the HDF5 files are written.
# It is used as a plain string prefix below, so the trailing slash is required.
# There is nothing to interpolate here, hence a regular string literal.
path_run = '../dados_tcc/run_files/'
print(path_run)

#### 3.1 Selecting the fraction of objects for training:

In [None]:
# Number of rows to draw for the training subset (a row count, despite the
# variable's name); int() truncates the product to a whole number.
# NOTE(review): the factor is 0.701, while the notebook description states 70% —
# confirm this is intentional.
fraction = int(0.701* len(data_convert))
fraction

In [None]:
# Randomly draw `fraction` rows for training; the fixed seed makes the
# selection reproducible from one run to the next.
training_csv = data_convert.sample(n=fraction, random_state=40)
training_csv

#### 3.2 Hdf5 file:

In [None]:
# Full path of the training HDF5 file; path_run is used as a plain string prefix.
train_file_path =f'{path_run}train_file.hdf5'
print(train_file_path)

##### With group:

In [None]:
# Grouped layout: every column of the training frame becomes one dataset
# inside a single 'photometry' group.
# Mode 'w' creates the file and replaces any existing file at this path.
with h5py.File(train_file_path, 'w') as train_file:
    photometry_group = train_file.create_group('photometry')
    for name, series in training_csv.items():
        photometry_group.create_dataset(name, data=series.values)

##### Without Group:

In [None]:
# Flat layout: every column of the training frame becomes a dataset at the
# root of the file.
# NOTE: mode 'w' replaces the file, so running this cell after the grouped
# variant discards the grouped file written to the same path.
with h5py.File(train_file_path, 'w') as train_file:
    for name, series in training_csv.items():
        train_file.create_dataset(name, data=series.values)

#### 3.3 Checking:

In [None]:
# Read the training file back through tables_io to inspect what was written.
train_table = tables_io.read(train_file_path, fmt='hdf5')
train_table

In [None]:
# Number of entries read back for the 'mag_g' column.
# NOTE(review): a top-level 'mag_g' key presumably exists only for the
# "Without group" layout — confirm which write cell was run last.
len(train_table['mag_g'])

### 4. Test file:

#### 4.1 Selecting the remaining objects:

In [None]:
# The test subset is every row that was not sampled for training: drop the
# sampled rows from the full table by their index labels.
validation = data_convert.drop(index=training_csv.index)
validation

#### 4.2 Hdf5 file:

In [None]:
# Full path of the test HDF5 file; path_run is used as a plain string prefix.
test_file_path = f'{path_run}test_file.hdf5'
print(test_file_path)

##### With group:

In [None]:
# Grouped layout: every column of the test frame becomes one dataset inside a
# single 'photometry' group.
# Mode 'w' creates the file and replaces any existing file at this path.
with h5py.File(test_file_path, 'w') as test_file:
    photometry_group = test_file.create_group('photometry')
    # Take the column names from the frame being written, so this cell does
    # not depend on the training frame having the same columns.
    for column in validation.columns:
        photometry_group.create_dataset(column, data=validation[column].values)

##### Without group:

In [None]:
# Flat layout: every column of the test frame becomes a dataset at the root
# of the file.
# NOTE: mode 'w' replaces the file, so running this cell after the grouped
# variant discards the grouped file written to the same path.
with h5py.File(test_file_path, 'w') as test_file:
    for name, series in validation.items():
        test_file.create_dataset(name, data=series.values)

#### 4.3 Checking:

In [None]:
# Read the test file back through tables_io to inspect what was written.
test_table = tables_io.read(test_file_path, fmt='hdf5')
test_table

In [None]:
# Number of entries read back for the 'mag_g' column at the top level.
# NOTE(review): presumably valid only when the "Without group" cell wrote the
# file last — confirm.
len(test_table['mag_g'])

In [None]:
# Number of entries read back for 'mag_g' inside the 'photometry' group.
# NOTE(review): the "Without group" cell reopens the same file with mode 'w',
# so after a top-to-bottom run the 'photometry' key is presumably absent and
# this lookup fails — confirm which layout is intended before relying on it.
len(test_table['photometry']['mag_g'])