In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow.dataset as pa_dataset

import torch
from torch.utils.data import Dataset, Subset, DataLoader
from sklearn.model_selection import train_test_split

from typing import Any, Dict, List, Union, Tuple

try:
    from typing import Literal
except ImportError:
    from typing_extensions import Literal


class IndexedDatasetFromFiles(Dataset):
    """Map-style dataset over in-memory vectors keyed by example name.

    An integer index selects a name from the list of example names; that
    name is then used as the key into the features dictionary and, when
    present, the targets dictionary.
    """

    def __init__(
        self,
        example_names: list,
        features: Dict[str, np.array],
        targets: Dict[str, np.array] = None,
    ):
        """
        Parameters
        ----------
            example_names: list
                Names of the examples; the position of a name in this
                sequence is its dataset index.

            features: Dict[str, np.array]
                Maps an example name to its feature vector.

            targets: Dict[str, np.array] = None
                Maps an example name to its true label. When it is
                `None`, items are returned without a label.
        """
        super().__init__()

        self._example_names = example_names
        self._features = features
        self._targets = targets

    def __len__(self):
        """Number of examples, i.e. the number of stored names."""
        return len(self._example_names)

    def get_file_name(self, idx: int):
        """Return the example name stored at position `idx`."""
        return self._example_names[idx]

    def __getitem__(self, idx: int):
        """
        Return the item at position `idx`.

        The result is `(x, y, idx)` when targets were supplied and
        `(x, idx)` otherwise, where `x` is a float tensor built from the
        feature vector and `y` is an int tensor built from the label.
        """
        name = self._example_names[idx]
        x = torch.from_numpy(np.asarray(self._features[name])).float()

        # Unlabelled dataset: only the features and the index are returned.
        if self._targets is None:
            return x, idx

        y = torch.from_numpy(np.asarray(self._targets[name])).int()
        return x, y, idx


def split_array_into_twoparts_by_inds(
    ar: np.array,
    random_state: int,
    split_fraction: float,
):
    """
    Divide the positions of the input array into two parts and return
    the indices of each part.

    Args:
        ar: np.array
            Input array. Only its length is used.

        random_state: int
            Forwarded to `train_test_split`. The split is requested with
            `shuffle=False`, so the two parts are expected to be
            consecutive index ranges regardless of this value.

        split_fraction: float
            Relation between sizes of the first part and the input array.
            If it is equal to 1.0, the first part contains every index
            and the second part is empty.

    Returns:
        Tuple `(inds_pt1, inds_pt2)` of index arrays.
    """
    original_ids = np.arange(len(ar))

    # `train_test_split` rejects a float `train_size` of 1.0, so the
    # documented "everything goes to the first part" case is served here.
    # The float check keeps an integer 1 (an absolute sample count for
    # `train_test_split`) on the original code path.
    if isinstance(split_fraction, float) and split_fraction == 1.0:
        return original_ids, original_ids[:0]

    inds_pt1, inds_pt2 = train_test_split(
        original_ids,
        train_size=split_fraction,
        random_state=random_state,
        shuffle=False)

    return inds_pt1, inds_pt2


def create_datasets(
    features_path: str,
    random_state: int,
    features_dim: int,
    mode: Literal['predict', 'fit', 'forgetting', 'second-split-forgetting'],
    targets_path: str = None,
    path_to_file_names_to_be_excluded: str = None,
    split_fraction: float = None
):
    """
    Create torch dataset(s) from data readable by `pyarrow.dataset`.

    Args:
        features_path: str
            Path handed to `pyarrow.dataset.dataset` for the features.
            The columns named '0' ... str(features_dim - 1) are read as
            the feature vector and the column '__index_level_0__' as the
            example names.

        random_state: int
            To provide reproducibility. Forwarded to the split helper.

        features_dim: int
            Dimension (the number of components) of the feature vector

        mode: str
            It takes one of the values 'predict', 'fit', 'forgetting' or 'second-split-forgetting'.
            Depending on the value of the argument, datasets will be created to train
            the model, to get predictions or to find noisy examples by forgetting methods

        targets_path: str = None
            Path handed to `pyarrow.dataset.dataset` for the true labels.
            The first column is read as the labels and the second one as
            the example names they belong to. If it is `None`,
            target variable will not be returned.

        path_to_file_names_to_be_excluded: str
            Path to a `.txt` file which contains names of examples
            to be excluded from the original dataset.

        split_fraction: float
            Relation between sizes of the first part and the whole
            dataset. Defaults to 1.0 in 'fit' mode (no split) and to 0.5
            in 'second-split-forgetting' mode.

    Returns:
        'predict' and 'forgetting': the full dataset.
        'fit': `(dataset_pt1, dataset_pt2)`; both are the full dataset
        when `split_fraction` is not below 1.0.
        'second-split-forgetting': `(dataset_pt1, dataset_pt2)` subsets.

    Raises:
        ValueError: if `mode` is not one of the supported values.
    """
    index_column = '__index_level_0__'

    # Build the optional row filter once; `filter=None` scans every row.
    excluded_names = None
    row_filter = None
    if path_to_file_names_to_be_excluded is not None:
        # `loadtxt` returns a 0-d array for a file holding a single name;
        # `atleast_1d` keeps `len()` below working in that case.
        excluded_names = np.atleast_1d(
            np.loadtxt(
                path_to_file_names_to_be_excluded, delimiter=' ', dtype='str'
            )
        )
        row_filter = ~pa_dataset.field(index_column).isin(excluded_names)

    # Load features
    ds_features = pa_dataset.dataset(features_path)

    # NOTE(review): the `[0]` here and the `.T` further down rely on
    # `np.array(table)` yielding one row per column -- confirm this with
    # the installed pyarrow version.
    example_names = np.array(
        ds_features.scanner(
            columns=[index_column],
            filter=row_filter
        ).to_table()
    )[0]

    features = np.array(
        ds_features.scanner(
            columns=[str(item) for item in range(0, features_dim)],
            filter=row_filter
        ).to_table()
    )

    # Load targets only when a path is given, so that the documented
    # `targets_path=None` default produces an unlabelled dataset.
    targets = None
    if targets_path is not None:
        ds_targets = pa_dataset.dataset(targets_path)
        targets_table = ds_targets.scanner(filter=row_filter).to_table()
        labels = np.array(targets_table[0])
        target_names = np.array(targets_table[1])
        targets = {k: int(v) for k, v in zip(target_names, labels)}

    if excluded_names is not None:
        print(f"From the dataset {len(excluded_names)} files are excluded.")

    # create dictionary with features: example name -> feature vector
    features = features.T
    features = {k: v for k, v in zip(example_names, features)}

    # create torch dataset
    dataset = IndexedDatasetFromFiles(example_names, features, targets)

    # create subsets
    if mode in ('predict', 'forgetting'):
        return dataset

    if mode == 'fit':
        if split_fraction is None:
            split_fraction = 1.0

        if split_fraction < 1.0:
            inds_pt1, inds_pt2 = split_array_into_twoparts_by_inds(
                example_names,
                random_state,
                split_fraction
            )
            return Subset(dataset, inds_pt1), Subset(dataset, inds_pt2)

        # No split requested: both parts are the whole dataset.
        return dataset, dataset

    if mode == 'second-split-forgetting':
        if split_fraction is None:
            split_fraction = 0.5

        inds_pt1, inds_pt2 = split_array_into_twoparts_by_inds(
            example_names,
            random_state,
            split_fraction
        )
        return Subset(dataset, inds_pt1), Subset(dataset, inds_pt2)

    raise ValueError('That mode is unknown')


def create_dataloader(
    dataset: Dataset,
    random_state: int,
    batch_size: int,
    is_shuffle: bool,
    num_workers: int,
    is_pin_memory: bool
):
    """
    Wrap `dataset` into a torch `DataLoader`.

    A dedicated `torch.Generator` seeded with `random_state` is handed to
    the loader; the remaining arguments are passed through as the
    loader's `batch_size`, `shuffle`, `num_workers` and `pin_memory`.
    """
    # `manual_seed` returns the generator itself, so it can be chained.
    seeded_generator = torch.Generator().manual_seed(random_state)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=is_shuffle,
        num_workers=num_workers,
        pin_memory=is_pin_memory,
        generator=seeded_generator,
    )

ModuleNotFoundError: No module named 'pyarrow'

In [20]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa

In [32]:
# Targets parquet files of the train and test parts, and the destination
# the combined targets are written to by the cells below.
# NOTE(review): absolute machine-specific paths; consider deriving them from
# a single configurable data directory.
train_path = '/home/storage/priceseekers/data/rosbank/rosbank_dataset/train_part/rosbank_targets.parquet'
test_path = '/home/storage/priceseekers/data/rosbank/rosbank_dataset/test_part/rosbank_targets.parquet'
path_to_save = '/home/storage/priceseekers/data/rosbank/rosbank_dataset/full/rosbank_targets.parquet'

In [33]:
df_train = pq.read_table(train_path).to_pandas()
df_test = pq.read_table(test_path).to_pandas()

In [34]:
df_full = pd.concat([df_train, df_test])

In [35]:
df_full = pa.Table.from_pandas(df_full)

In [36]:
pq.write_table(df_full, path_to_save)

In [None]:
# NOTE: this never-executed cell held only the dangling fragment
# `.to_parquet(path_to_save)`, which is a syntax error (no receiver object).
# The combined targets are already written by the `pq.write_table` cell.

In [1]:
# Configuration for the train part: PCA-embeddings parquet, targets parquet,
# and the split settings used by the cell below.
# NOTE(review): presumably these mirror the arguments of `create_datasets`
# (features_path, features_dim, targets_path, ...) -- confirm.
features_path = '/home/storage/priceseekers/data/rosbank/rosbank_dataset/train_part/rosbank_pca_embeddings.parquet'
features_dim = 112
targets_path = '/home/storage/priceseekers/data/rosbank/rosbank_dataset/train_part/rosbank_targets.parquet'
# No exclusion list: every example is kept.
path_to_file_names_to_be_excluded  = None

random_state = 1
# Share of the examples that goes to the first part of the split.
split_fraction = 0.8

In [130]:
# Build both parts through `create_datasets` so this cell no longer depends
# on `example_names` / `dataset` left over in the kernel from an earlier,
# unsaved run. The 'second-split-forgetting' mode performs the same
# unconditional split into two `Subset`s.
dataset_pt1, dataset_pt2 = create_datasets(
    features_path=features_path,
    random_state=random_state,
    features_dim=features_dim,
    mode='second-split-forgetting',
    targets_path=targets_path,
    path_to_file_names_to_be_excluded=path_to_file_names_to_be_excluded,
    split_fraction=split_fraction,
)

# Keep the index arrays available under their previous names.
inds_pt1, inds_pt2 = dataset_pt1.indices, dataset_pt2.indices

In [137]:
# Display one item (position 250) of the second part as a sanity check.
dataset_pt2[250]

(tensor([-1.9074e+00, -8.2724e-01, -5.9442e-01, -9.3966e-01,  1.0408e-01,
          3.1565e-02, -2.6306e-01, -7.4423e-01,  4.8671e-01,  4.8730e-01,
         -4.8939e-01, -5.9958e-02, -2.9994e-01, -1.1231e-01,  1.1989e-01,
          7.5383e-02,  1.4384e-01,  1.7608e-01,  1.4159e-01, -4.1005e-02,
         -3.2129e-02, -2.3837e-01,  7.4586e-02, -1.1077e-02, -4.4204e-02,
         -1.2154e-02, -1.6696e-01,  1.4630e-02,  1.2890e-01, -1.3122e-01,
          2.1491e-02, -3.4994e-02, -3.0660e-02, -2.3411e-02, -2.8227e-02,
         -1.8308e-02,  9.7461e-04,  6.2755e-03, -2.6535e-02,  1.5229e-02,
          1.6535e-02, -6.9116e-03, -1.1298e-02,  1.2000e-02, -6.5124e-03,
         -3.6738e-03, -3.4251e-03, -1.6331e-04,  2.3690e-03,  1.4011e-02,
          2.2371e-02,  3.0301e-03, -1.3432e-02,  2.1075e-02,  9.9930e-03,
          1.0596e-02, -1.7536e-02, -1.0270e-03, -1.3394e-02, -1.6774e-03,
          2.2909e-03,  5.5026e-03, -6.6627e-03, -5.0395e-04, -5.1123e-03,
         -4.4063e-03,  1.0375e-03, -4.

In [134]:
# Number of examples that ended up in the second part.
len(dataset_pt2)

900