In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are reloaded
# before each cell executes and edits to the local package show up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [4]:
import sys
from pathlib import Path

# Add the parent of the current working directory to the import path; presumably that is
# where the `time_series_loader` package lives — confirm against the repository layout.
sys.path.append(str(Path.cwd().parent))
from time_series_loader.load_file import FileDataFrame
from time_series_loader.file_metadata_parser import TimeMetadataExtractor

In [None]:
# Root folder that contains all the data folders (e.g. "folder1", "another_folder", ...)
PATH = '~/data/'

# Expand "~" to the user's home directory, then echo the resolved path as the cell output
full_path = Path(PATH).expanduser()
full_path

## Initialize the dataframe

In [24]:
# Bare file names for each data source; directories are resolved further down.
files_1B = [
    'D2 - UPS Supply 1B - Data_Tot - 01-01-2024 00_00_00 - 10-14-2024 23_59_59.csv',
    'D2 - UPS Supply 1B - Data_Tot - 09-01-2022 00_00_00 - 12-31-2023 23_59_59.csv',
]
files_new = [
    '565Q4 - Data_All - 01-01-2022 00_00_00 - 12-31-2022 23_59_59.csv',
    '565Q4 - Data_All - 01-01-2023 00_00_00 - 12-31-2023 23_59_59.csv',
    '565Q4 - Data_All - 01-01-2024 00_00_00 - 12-31-2024 23_59_59.csv',
]
files_1A = [
    'D2 - UPS Supply 1A - DataTot - 09-01-2022 00_00_00 - 10-14-2024 23_59_59.csv',
]

# Turn the 1B names into full paths inside their subfolder.
# Note: `files_1B` is rebound from a list of names to a list of Path objects.
files_1B = [full_path / 'Main_LV_1B_Tot' / filename for filename in files_1B]

# These file names carry their start and end dates, so a dedicated extractor
# is supplied to read the time metadata from them.
dataframe1B_D2 = FileDataFrame(
    files=files_1B,
    metadata_extractor=TimeMetadataExtractor(),
)


## Steps to charge
### Discovering all files

We make the following assumptions:
* The files should be sorted into one subfolder per armoire, and the subfolder name should contain the armoire's identifier (e.g. '1A', '2C', etc.)
* The data should be in CSV format
* The data for one type of charge may be in a single file or split across several files
* The format of the filename is 'armoire charge - Data - MM-DD-YYYY HH_MM_SS - MM-DD-YYYY HH_MM_SS.csv'

Product: file list

In [25]:
# Keep the discovery result: it is passed to process_files() in the metadata step.
files = dataframe1B_D2.discover_files()

In [26]:
# Show the discovery summary (counts of found, valid and invalid files).
dataframe1B_D2.get_discovery_stats()

{'total_files_found': 2,
 'valid_files': 2,
 'invalid_files': 0,
 'invalid_files_details': []}

### Process metadata

In this step we construct the metadata that will be used to load the files directly and to concatenate them if necessary

Product: self.metadata

In [27]:
# Extract metadata for the discovered files; the returned list is shown as the cell
# output (one FileMetadata entry per file in the recorded run).
dataframe1B_D2.process_files(files)

[FileMetadata(filepath=WindowsPath('C:/Users/dbryzgal/data/_clients/load_monitoring_se/full/Main_LV_1B_Tot/D2 - UPS Supply 1B - Data_Tot - 09-01-2022 00_00_00 - 12-31-2023 23_59_59.csv'), start_time=datetime.datetime(2022, 9, 1, 0, 0), end_time=datetime.datetime(2023, 12, 31, 23, 59, 59), additional_metadata={'has_timestamp_metadata': True}),
 FileMetadata(filepath=WindowsPath('C:/Users/dbryzgal/data/_clients/load_monitoring_se/full/Main_LV_1B_Tot/D2 - UPS Supply 1B - Data_Tot - 01-01-2024 00_00_00 - 10-14-2024 23_59_59.csv'), start_time=datetime.datetime(2024, 1, 1, 0, 0), end_time=datetime.datetime(2024, 10, 14, 23, 59, 59), additional_metadata={'has_timestamp_metadata': True})]

In [28]:
# Show the processing status; at this stage the metadata is extracted but no data is loaded.
dataframe1B_D2.get_processing_summary()

{'status': 'metadata_extracted',
 'errors': {'total': 0,
 'files': {'discovered': 2, 'valid': 2, 'invalid': 0, 'processed': 2},
 'data': {'loaded': False, 'rows': 0, 'columns': 0}}

### Load files

Load the CSV files into a dataframe, concatenating them if needed. This step validates that the different files
have the same structure and creates concatenated metadata with the following fields:
* number of lines
* number of files
* memory usage
* start time
* end time

Products: self.dataframe and self.concat_metadata

In [None]:
# Load the files and concatenate them, then display the resulting dataframe as the cell output.
dataframe1B_D2.load_and_concatenate()
dataframe1B_D2.dataframe

In [30]:
# Show the metadata of the concatenated data (rows, files, memory usage, start/end time).
# NOTE(review): in the last recorded run, end_time (2023-12-31) matches only the first
# file's end, while the second file runs to 2024-10-14 — check how the concatenated
# end_time is derived.
dataframe1B_D2.get_concat_metadata()

{'total_rows': 73205,
 'total_files': 2,
 'memory_usage': np.int64(73790768),
 'start_time': datetime.datetime(2022, 9, 1, 0, 0),
 'end_time': datetime.datetime(2023, 12, 31, 23, 59, 59)}

### (Optional) Check for time continuity problems in the dataframe. Currently it checks only for gaps

It can find the datetime column and detect the frequency of the time series; 
there is also a minimum gap size (`min_gap_size`, 1 min by default)

Product: self.time_series_analysis

In [31]:
# Run the gap analysis with 'Date' as the time column; the report is shown as the cell output.
dataframe1B_D2.analyze_time_series_continuity(time_column='Date')


{'time_column': 'Date',
 'min_gap_size': '1min',
 'inferred_frequency': '900s',
 'total_points': 73205,
 'start_time': Timestamp('2022-09-07 17:00:00'),
 'end_time': Timestamp('2024-10-14 23:59:00'),
 'total_duration': Timedelta('768 days 06:59:00'),
 'total_gaps': 158,
 'total_gap_duration': Timedelta('7 days 10:00:00'),
 'coverage_percentage': 99.03465393518623,
 'gaps': [{'start_time': Timestamp('2022-09-08 01:45:00'),
   'end_time': Timestamp('2022-09-08 02:15:00'),
   'duration': Timedelta('0 days 00:30:00'),
   'expected_points': 1},
  {'start_time': Timestamp('2022-09-08 15:45:00'),
   'end_time': Timestamp('2022-09-08 16:30:00'),
   'duration': Timedelta('0 days 00:45:00'),
   'expected_points': 2},
  {'start_time': Timestamp('2022-09-15 09:45:00'),
   'end_time': Timestamp('2022-09-15 10:30:00'),
   'duration': Timedelta('0 days 00:45:00'),
   'expected_points': 2},
  {'start_time': Timestamp('2022-09-17 12:00:00'),
   'end_time': Timestamp('2022-09-17 13:15:00'),
   'duration

### (Optional) Resampling 

One can specify the frequency to resample to; by default, the frequency detected in the time series report is used.

There are also methods to fill the resulting NaNs, intended for future use; they are not used yet, because we need to differentiate between NaNs that were already in the data and NaNs introduced by resampling

Also, there is a way to exclude very large gaps

Product: resampled dataframe

In [None]:
# Resample on the 'Date' column to an explicit 15-minute frequency (the same interval as
# the 900s frequency inferred in the recorded continuity report), then display the result.
resampled = dataframe1B_D2.resample_time_series(time_column='Date', frequency='15min')
resampled


In [33]:
# Row counts, one per line: resampled frame first, original frame second.
print(len(resampled), len(dataframe1B_D2.dataframe), sep='\n')

73756
73205


### To save time, one can use the initialize_processing() method to run the first three mandatory steps at once

In [34]:
# Turn the 1A names into full paths inside their subfolder.
# Note: `files_1A` is rebound from a list of names to a list of Path objects.
files_1A = [full_path / 'Main_LV_1A_Tot' / filename for filename in files_1A]

# NOTE(review): no metadata_extractor is passed here, which is presumably why the recorded
# concat metadata shows start_time/end_time as None even though the file name carries dates.
dataframe1A_D2 = FileDataFrame(files=files_1A)

# Shortcut: run the three mandatory steps in a single call, then show the concat metadata.
dataframe1A_D2.initialize_processing()
dataframe1A_D2.get_concat_metadata()

{'total_rows': 39806,
 'total_files': 1,
 'memory_usage': np.int64(32680854),
 'start_time': None,
 'end_time': None}

## Get file without time in the filename

In [None]:
# Folder, not files
# This time we point at a folder rather than at individual files
second_full_path = Path('~/data/examples_raw/02_Alamo').expanduser()

# FileDataFrame can also be given a folder through `base_path` instead of a file list
alamo_ts = FileDataFrame(base_path=second_full_path)

In [36]:
# Process files
# Discover the files in the folder and show the discovery summary.
# NOTE: this rebinds `files`, which earlier held the discovery result for the 1B data.
files = alamo_ts.discover_files()
alamo_ts.get_discovery_stats()

{'total_files_found': 2,
 'valid_files': 2,
 'invalid_files': 0,
 'invalid_files_details': [],
 'glob_pattern': '*',
 'recursive': False}

In [37]:
# Extract metadata for the discovered files, then show the processing status.
alamo_ts.process_files(files)
alamo_ts.get_processing_summary()

{'status': 'metadata_extracted',
 'errors': {'total': 0,
 'files': {'discovered': 2, 'valid': 2, 'invalid': 0, 'processed': 2},
 'data': {'loaded': False, 'rows': 0, 'columns': 0}}

In [38]:
# Load and concatenate the files, then show the concat metadata. In the recorded run,
# start_time and end_time are None for these files.
alamo_ts.load_and_concatenate()
alamo_ts.get_concat_metadata()

{'total_rows': 518644,
 'total_files': 2,
 'memory_usage': np.int64(462758208),
 'start_time': None,
 'end_time': None}

In [None]:
# Display the loaded dataframe through the accessor method.
alamo_ts.get_dataframe()