In [4]:
import data
from lightning.pytorch import LightningDataModule
from torch.utils.data import Dataset, DataLoader
from typing import Dict, List, Generator, Tuple
from torch.utils.data import Dataset
from torch import Tensor
import pandas as pd
import torch
import os

In [5]:
metadata: Dict[str, Dict[str,str]] = dict(
    lunar = dict(
        catalog = '../data/lunar/training/catalogs/apollo12_catalog_GradeA_final.csv',
        train_path = '../data/mars/training/data',
        test_path = '../data/mars/test/data',
    ),
    mars = dict(
        catalog = '../data/mars/training/catalogs/Mars_InSight_training_catalog_final.csv',
        train_path = '../data/lunar/training/data/S12_GradeA',
        test_path = '../data/lunar/test/data'
    )
)

def recursive_search(parent: str) -> Generator[str, None, None]:
    for child in os.listdir(parent):
        child_path = os.path.join(parent, child)
        if os.path.isdir(child_path):
            yield from recursive_search(child_path)
        elif child.endswith('.csv'):
            yield child_path

In [6]:
from datetime import timedelta
from msilib import sequence
from lightning.pytorch import LightningDataModule
from torch.utils.data import Dataset, DataLoader
from typing import Dict, List, Generator, Tuple
from torch.utils.data import Dataset
from torch import Tensor
import pandas as pd
import torch
import os


class TrainDataset(Dataset):
    def __init__(self, sequence_length: int, resample: timedelta) -> None:
        self.sequence_length = sequence_length
        self.resample = pd.Timedelta(resample)
        self.filepaths = [filename for filename in recursive_search('../data/mars/training/data')] + \
                        [filename for filename in recursive_search('../data/lunar/training/data')]
        self.meta_lunar: pd.DataFrame = pd.read_csv(metadata['lunar']['catalog'], index_col = ['filename'])
        self.meta_mars: pd.DataFrame = pd.read_csv(metadata['mars']['catalog'], index_col = ['filename'])
        self.metadata: pd.DataFrame = pd.concat(
            [
                self.meta_lunar,
                self.meta_mars,
            ], axis = 0
        )

    def zero_padding(self, x: torch.Tensor, length: int = 60) -> torch.Tensor:
        pad_size = length - x.size(-1)
        if pad_size > 0:
            return torch.nn.functional.pad(x, (0, pad_size))
        return x
    
    def preprocessing(self) -> None:
        self.data: List[Tuple[Tensor, Tensor]] = []
        for file in self.filepaths:
            try:
                arrive = self.metadata.loc[['time_rel(sec)'], file]
                out: Tensor = self.get_data(file)
                self.data.extend((input, target)) ### mirar
            except IndexError:
                continue

    def __len__(self) -> int:
        return len(self.metadata)

    def get_data(self, file: str) -> List[Tuple[Tensor, Tensor]]: ## atencion
        ## get target (tensor de 0 y 1s tal que el idx del arrival este coincidiendo con el arrival real)
        print(round(timedelta(seconds=self.metadata["time_rel(sec)"].loc[os.path.basename(file)])/self.resample))
        ## creas el tensor
        
        df: pd.DataFrame = pd.read_csv(file, parse_dates =['time_abs(%Y-%m-%dT%H:%M:%S.%f)'] ,index_col = ['time_abs(%Y-%m-%dT%H:%M:%S.%f)'])
        print(df.shape)
        velocity: Tensor = torch.from_numpy(df["velocity(c/s)"].resample(self.resample).mean().values)
        print(velocity.shape)
        print(self.resample)
        target = torch.zeros((velocity.shape))
        # target = 
        max: Tensor = torch.from_numpy(df.resample(self.resample).max().values)
        min: Tensor = torch.from_numpy(df.resample(self.resample).min().values)
        ### add the wavelet / fourier transform is needed

        ### separar de a sequence_length
        ### input(sequence_length, input_size) -> target(sequence_length) (0, 0, 1, 0)
        return [self.zero_padding(i) for i in velocity.split(self.sequence_length)] #target)

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        filename: str = self.metadata[idx]
        return self.data[filename]

In [9]:
dataset = TrainDataset(60, timedelta(milliseconds=1))
len(dataset.get_data(dataset.filepaths[0]))

2130000
(72000, 2)
torch.Size([3599951])
0 days 00:00:00.001000


60000

In [8]:
td = timedelta(10, 30)
pd.Timedelta(td)

Timedelta('10 days 00:00:30')