# Some processing of the data

In [8]:
import sys

# Add the parent directory to the import path exactly once
# (presumably so the project-local `rico_data_module` can be imported — confirm).
if '..' not in sys.path:
    sys.path.append('..')

In [9]:
import numpy as np
from torch.utils.data import DataLoader

## Loading the univariate CSV files

In [10]:
from rico_data_module import RICODataset, DatasetParams

In [11]:
# Describe each CSV source: a display name, the file location, and the
# identifier passed as `columns_identifier` (presumably a prefix selecting
# the time-series columns — confirm in DatasetParams).
real_dataset_params = DatasetParams(
    name='rico1',
    data_path='../datasets/rico1.csv',
    columns_identifier='ts_',
)

synth_dataset_params = DatasetParams(
    name='synth',
    data_path='../datasets/synth.csv',
    columns_identifier='ts_',
)

# Build one dataset object per source description.
real_dataset = RICODataset(real_dataset_params)
synth_dataset = RICODataset(synth_dataset_params)

## Merging datasets

In [12]:
# Combine the real and synthetic datasets with the `+` operator.
# NOTE(review): the merged name appears to be derived from both operands
# (the recorded output shows "rico1+synth") — confirm in RICODataset.
add_merged_dataset = real_dataset + synth_dataset
print(add_merged_dataset)

Name: rico1+synth 
Shape : torch.Size([358, 24, 1])


Alternatively, use the `merge` method, which also lets you name the result:

In [13]:
# Merge through the explicit `merge` method, passing a custom `name` for the
# resulting dataset (the recorded output shows the same shape as the `+` merge).
self_merged_dataset = real_dataset.merge(synth_dataset, name='new_name')
print(self_merged_dataset)

Name: new_name 
Shape : torch.Size([358, 24, 1])


## Converting dataset

To tensor :

In [14]:
# Export the merged dataset through the "torch" target, then report the
# resulting object's type and shape on separate lines.
torch_data = add_merged_dataset.to("torch")
print(type(torch_data), torch_data.shape, sep="\n")

<class 'torch.Tensor'>
torch.Size([358, 24, 1])


To numpy:

In [15]:
# Export the merged dataset through the "numpy" target with an explicit dtype.
# The result is kept under its own name so that the tensor held by
# `torch_data` is not overwritten by an object of a different type.
numpy_data = add_merged_dataset.to("numpy", _dtype="float32")
print(type(numpy_data))
print(numpy_data.shape)

<class 'numpy.ndarray'>
(358, 24, 1)


To pandas: (Only supported for univariate datasets)

In [16]:
# Export the merged dataset through the "pandas" target with an explicit
# dtype, then report the resulting object's type and shape on separate lines.
pd_data = add_merged_dataset.to("pandas", _dtype="float32")
print(type(pd_data), pd_data.shape, sep="\n")

<class 'pandas.core.frame.DataFrame'>
(358, 24)


## Trstr mix & Balanced batch sampling

In [17]:
from rico_data_module import trstr

In [18]:
# Build the train/test split from the real and synthetic datasets, r = 0.8.
train, test = trstr(
    real=real_dataset,
    synth=synth_dataset,
    r=0.8,
    ignore_warnings=False,
)

# NOTE(review): `ori_indices` is bound to the synthetic indices and
# `extra_indices` to the real ones. The recorded counts (51 real / 204 synth)
# are consistent with this mapping — confirm against the `trstr` implementation.
synth_indices, real_indices = train.ori_indices, train.extra_indices

print(f"N° real points in train: {len(real_indices)} \nN° synth points in train: {len(synth_indices)}")
print(f"N° points in test : {len(test)}")

N° real points in train: 51 
N° synth points in train: 204
N° points in test : 51


⚠️ If there are not enough synthetic samples to support the specified r/s ratio, the highest possible ratio is used instead (i.e., all synthetic samples are used). <br>
Try increasing r until the warning shows up.

In [19]:
# Same split with r = 0.9. In the recorded run this printed a
# "Defaulting to 1" notice, and the train length equals the real train
# samples plus every synthetic sample.
train_outbounded, test_outbounded = trstr(
    real=real_dataset,
    synth=synth_dataset,
    r=0.9,
    ignore_warnings=False,
)
print()  # blank line separating the notice from the summary below
print(f"Length of train: {len(train_outbounded)} || is equal to 51 = len(real)/2 + 256 = len(synth)")
print(f"Length of test: {len(test_outbounded)}")

	[92mDefaulting to 1[0m

Length of train: 307 || is equal to 51 = len(real)/2 + 256 = len(synth)
Length of test: 51


## Balanced batch sampling
`BalancedBatchSampler` creates batches in which the real samples are uniformly distributed, ensuring that no batch is purely synthetic.

In [20]:
from rico_data_module import BalancedBatchSampler

In [21]:
# Batch sampler built from the real and synthetic index lists of the train
# split, with 10 samples per batch.
sampler = BalancedBatchSampler(
    real_indices,
    synth_indices=synth_indices,
    batch_size=10,
)

# The sampler yields whole batches of indices, so it is passed as
# `batch_sampler` rather than `sampler`.
loader = DataLoader(train, batch_sampler=sampler)

The code below shows the distribution of synth samples (1.0 elements) across the batches.
Note how this should reflect the ratio specified earlier (0.8 in this case), except maybe for the last batch.

In [22]:
# For every batch, compute the fraction of entries whose value at index 0
# along the second axis equals 1.0, then print the fractions as one list.
synth_fractions = []
for batch in loader:
    n_flagged = np.count_nonzero(batch[:, 0] == 1.0)
    synth_fractions.append(n_flagged / len(batch))
print(synth_fractions)

[0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8]


You're all set 🚀