In [1]:
import pandas as pd
import torch
import numpy as np
import networkx as nx
from scipy.spatial.distance import euclidean
import torch
from sklearn.model_selection import train_test_split
import hyperparameters as hp
import random

pd.set_option('display.max_rows', 500)

path = hp.FLOWDATA_PATH

In [2]:
def find_long_nan_sections(df, max_gap):
    # create a mask for all of the rows with missing values
    missing_vals = df.isna()
    
    prev_row_missing_vals = missing_vals.shift()

    # find the rows where the value of a sensor changes from nan > value, or value > nan
    transition_rows = missing_vals != prev_row_missing_vals

    # assign an id number to each block of vals
    block_ids = transition_rows.cumsum()

    # find the length of each gap
    gap_lengths = missing_vals.groupby(block_ids).transform('sum')

    # identify all gaps which are longer than 4 hours
    long_gaps = missing_vals & (gap_lengths > max_gap)

    return long_gaps

In [3]:
# Reading in the flow data file
flowdata_df = pd.read_csv(path, index_col=0)
flowdata_df.index = pd.to_datetime(flowdata_df.index, format='%d/%m/%Y %H:%M')

# Removing a sensor with a large number of missing values
flowdata_df = flowdata_df.drop('1615', axis=1)

flowdata_df = flowdata_df.rename(columns=hp.SENSOR_DMA_TO_ID)
flowdata_df = flowdata_df.sort_index(axis=1)

# Removing rows which have outliers or are part of long sections of missing values
rows_to_remove = pd.Series(False, index=flowdata_df.index) 
for col in flowdata_df.columns:
    rows_to_remove |= find_long_nan_sections(flowdata_df[col], hp.MAX_GAP)
    #rows_to_remove |= find_outlier_values(flowdata_df[col])
flowdata_df = flowdata_df[rows_to_remove == False]

# Imputing short ranges of missing values
flowdata_df = flowdata_df.interpolate(method='spline', order = 3)

In [19]:
def month_based_train_val_test_split(flowdata_df, train_val_test_ratios):
    # get all the months
    df_month_strata = pd.DataFrame(index=flowdata_df.index)
    # Use the index instead of a 'timestamp' column
    df_month_strata['year_month'] = flowdata_df.index.to_period('M')
    print(f"Rows per month: {df_month_strata['year_month'].value_counts().sort_index()}")

    months = df_month_strata['year_month'].unique()
    random.shuffle(months)

    # assign each month to a set
    num_months = df_month_strata['year_month'].nunique()
    train_ratio, val_ratio, test_ratio = train_val_test_ratios

    num_months_train = int(num_months * train_ratio)
    num_months_val = int(num_months * val_ratio)
    num_months_test = num_months - num_months_train - num_months_val

    train_months = months[:num_months_train]
    val_months = months[num_months_train:num_months_train + num_months_val]
    test_months = months[num_months_train + num_months_val:]

    # split the data into the sets
    train_df = flowdata_df[df_month_strata['year_month'].isin(train_months)]
    val_df = flowdata_df[df_month_strata['year_month'].isin(val_months)]
    test_df = flowdata_df[df_month_strata['year_month'].isin(test_months)]

    return train_df, val_df, test_df


train_df, val_df, test_df = month_based_train_val_test_split(flowdata_df, hp.TRAIN_VAL_TEST_SPLIT)

for name, df in zip(["train", "val", "test"], [train_df, val_df, test_df]):
     print(f'Rows in {name} set:', len(df))


Rows per month: year_month
2020-01    2976
2020-02    2699
2020-03    2976
2020-04    2756
2020-05    1581
2020-06    2067
2020-07    2147
2020-08    2098
2020-09    1522
2020-10     200
2020-11     985
2020-12    1157
2021-01    2361
2021-02    2688
2021-03    2976
2021-04    2880
2021-05     457
2022-02     584
2022-03    2029
2023-05    2195
2023-06    2817
2023-07    2221
2023-08    2884
2023-09    2796
2023-10    2757
2023-11    2354
2023-12    2549
2024-01    1969
2024-02    2519
2024-03    2698
2024-04    1885
2024-05    2804
2024-06    2317
2024-07    2495
2024-08    1220
2024-09     459
Freq: M, Name: count, dtype: int64
Rows in train set: 52666
Rows in val set: 8875
Rows in test set: 14537


In [5]:
def assign_strata(df):
    """
    Takes a timeseries dataframe, and creates a corresponding series specificying each row's strata.
    """

    strata_dict = {
        'time_of_day': {
            'feature_origin': df.index.hour,
            'bins': [0, 6, 12, 14, 18, 22, 24],  
            'labels': ['night', 'morning', 'midday', 'afternoon', 'evening', 'night']
        },
        'part_of_week': {
            'feature_origin': df.index.dayofweek,
            'bins': [0, 5, 7], 
            'labels': ['weekday', 'weekend']
        },
        'season': {
            'feature_origin': df.index.month,
            'bins': [0, 3, 6, 9, 12, 13], 
            'labels': ['winter', 'spring', 'summer', 'autumn', 'winter']
        }
    }

    strata_df = pd.DataFrame(index=df.index)

    for strata_name, strata_info in strata_dict.items():
        strata_df[strata_name] = pd.cut(
            strata_info['feature_origin'],
            bins=strata_info['bins'],
            labels=strata_info['labels'],
            right=False,  
            include_lowest=True,
            ordered=False
        )

    strata_df['strata'] = strata_df['part_of_week'].astype(str) + '_' + strata_df['season'].astype(str)

    return strata_df

dfs = [train_df, val_df, test_df]
overlap = [True, True, False]

for i, df in enumerate(dfs):
    # Actual code
    strata_df = assign_strata(df)

In [10]:
def create_samples(df, df_strata, sample_length, set_name, overlap):
    """
    Creates samples of the data with a given length and overlap.

    Note: 
    - Currently all samples will start at the same time of day, as each one is 5 days long
    - This would make the model struggle with differently timed inputs
    - I will need to eventually add random starting points for the samples, but for now I will just create the samples with a fixed starting point to test the model
    """

    samples_df = pd.DataFrame(columns=hp.SENSOR_COLS)
    samples_strata_df = pd.Series(dtype=str, index=samples_df.index)

    # Find all the gaps in the data (where there are missing time steps / it is not continuous)
    gap_mask = df.index.to_series().diff() > pd.Timedelta(minutes=15)

    time_diffs = df.index.to_series().diff()

    # Identify actual gaps (where diff > 15 minutes)
    gap_durations = time_diffs[time_diffs > pd.Timedelta(minutes=15)]
    print(f'\nGap summary for {set_name} set:')
    for idx, gap in gap_durations.items():
        prev_time = df.index[df.index.get_loc(idx) - 1]
        print(f"Gap before {idx}: {gap} (from {prev_time} to {idx})")

    # Split the data into all of the continous segments
    df = df.copy()
    df['segment_id'] = gap_mask.cumsum()
    segments_df = df.groupby('segment_id')

    print(f'Segments in {set_name} set: {len(segments_df)}')

    if overlap:
            step = sample_length // 2
    else:
        step = sample_length

    # Split the data into all of the continuous segments and iterate through each segment
    for _, segment in segments_df:

        print(f'Processing segment {segment["segment_id"].iloc[0]} in {set_name} set, length: {len(segment)}, from {segment.index[0]} to {segment.index[-1]}')

        # Get rid of the segment_id column
        segment = segment.drop(columns='segment_id')

        strata_segment = df_strata.loc[segment.index]

        num_samples = 0
        i = 0
        while i + sample_length <= len(segment):
            index = len(samples_df)

            sample_row = {}
            strata_row = {}
            sample = segment[i:i + sample_length]
            strata_sample = strata_segment[i:i + sample_length]

            for col in hp.SENSOR_COLS:
                sample_row[col] = sample[col].values
            strata_row = strata_sample.values

            samples_df.loc[index] = sample_row
            samples_strata_df.loc[index] = strata_row

            i += step
            num_samples += 1
        
        print(f'Created {num_samples}')
        

    
    return samples_df, samples_strata_df

window_size = hp.TOTAL_WINDOW
train_df, val_df, test_df = month_based_train_val_test_split(flowdata_df, hp.TRAIN_VAL_TEST_SPLIT)
dfs = [train_df, val_df, test_df]
overlap = [True, True, False]

set_names = ['train', 'val', 'test']
results = {name: {} for name in set_names}

for i, df in enumerate(dfs):
    # Actual code
    set_name = set_names[i]
    strata_df = assign_strata(df)
    samples_df, samples_strata_df = create_samples(df, strata_df, window_size, set_name, overlap=overlap[i])

    print(f'Samples created from {set_name} set:', len(samples_df))
    #if len(samples_df) > 0:
        #print(f'Sample start times in {set_name} set (first 10):', samples_df.index[:10])

    #print(f'Samples per strata in {set_name} set: {samples_strata_df["strata"].value_counts()}')'train', 'val', 'test']
results = {name: {} for name in set_names}

for i, df in enumerate(dfs):
    # Actual code
    set_name = set_names[i]
    strata_df = assign_strata(df)
    samples_df, samples_strata_df = create_samples(df, strata_df, window_size, set_name, overlap=overlap[i])

    print(f'Samples created from {set_name} set:', len(samples_df))
    #if len(samples_df) > 0:
        #print(f'Sample start times in {set_name} set (first 10):', samples_df.index[:10])

    #print(f'Samples per strata in {set_name} set: {samples_strata_df["strata"].value_counts()}')

Rows per month: year_month
2020-01    2976
2020-02    2699
2020-03    2976
2020-04    2756
2020-05    1581
2020-06    2067
2020-07    2147
2020-08    2098
2020-09    1522
2020-10     200
2020-11     985
2020-12    1157
2021-01    2361
2021-02    2688
2021-03    2976
2021-04    2880
2021-05     457
2022-02     584
2022-03    2029
2023-05    2195
2023-06    2817
2023-07    2221
2023-08    2884
2023-09    2796
2023-10    2757
2023-11    2354
2023-12    2549
2024-01    1969
2024-02    2519
2024-03    2698
2024-04    1885
2024-05    2804
2024-06    2317
2024-07    2495
2024-08    1220
2024-09     459
Freq: M, Name: count, dtype: int64


Rows per month: year_month
2020-01    2976
2020-02    2699
2020-03    2976
2020-04    2756
2020-05    1581
2020-06    2067
2020-07    2147
2020-08    2098
2020-09    1522
2020-10     200
2020-11     985
2020-12    1157
2021-01    2361
2021-02    2688
2021-03    2976
2021-04    2880
2021-05     457
2022-02     584
2022-03    2029
2023-05    2195
2023-06    2817
2023-07    2221
2023-08    2884
2023-09    2796
2023-10    2757
2023-11    2354
2023-12    2549
2024-01    1969
2024-02    2519
2024-03    2698
2024-04    1885
2024-05    2804
2024-06    2317
2024-07    2495
2024-08    1220
2024-09     459
Freq: M, Name: count, dtype: int64


KeyError: 'segment_id'

In [25]:
def strat_random_sample(samples_df, strata_df):
    # Merge the samples_df with the strata_df to get the strata labels for each sample
    merged_df = samples_df.merge(strata_df, left_index=True, right_index=True)

    # Perform stratified random sampling
    sampled_df = merged_df.groupby('strata_label', group_keys=False).apply(lambda x: x.sample(hp.SAMPLE_SIZE, replace=True))

    # Drop the strata_label column from the sampled dataframe
    sampled_df = sampled_df.drop(columns='strata_label')

    return sampled_df

Rows per month: year_month
2020-01    2976
2020-02    2699
2020-03    2976
2020-04    2756
2020-05    1581
2020-06    2067
2020-07    2147
2020-08    2098
2020-09    1522
2020-10     200
2020-11     985
2020-12    1157
2021-01    2361
2021-02    2688
2021-03    2976
2021-04    2880
2021-05     457
2022-02     584
2022-03    2029
2023-05    2195
2023-06    2817
2023-07    2221
2023-08    2884
2023-09    2796
2023-10    2757
2023-11    2354
2023-12    2549
2024-01    1969
2024-02    2519
2024-03    2698
2024-04    1885
2024-05    2804
2024-06    2317
2024-07    2495
2024-08    1220
2024-09     459
Freq: M, Name: count, dtype: int64

Gap summary for train set:
Gap before 2020-03-01 00:00:00: 29 days 00:15:00 (from 2020-01-31 23:45:00 to 2020-03-01 00:00:00)
Gap before 2020-04-29 23:00:00: 0 days 08:15:00 (from 2020-04-29 14:45:00 to 2020-04-29 23:00:00)
Gap before 2020-04-30 23:00:00: 0 days 23:15:00 (from 2020-04-29 23:45:00 to 2020-04-30 23:00:00)
Gap before 2020-05-01 23:00:00: 0 days 

In [26]:
def preprocess_data(flowdata_df):
    window_size = hp.TOTAL_WINDOW

    datasets = []
    strata = []


    train_df, val_df, test_df = month_based_train_val_test_split(flowdata_df, hp.TRAIN_VAL_TEST_SPLIT)
    dfs = [train_df, val_df, test_df]
    overlap = [True, True, False]

    set_names = ['train', 'val', 'test']
    results = {name: {} for name in set_names}

    for i, df in enumerate(dfs):
        # Actual code
        strata_df = assign_strata(df)
        samples_df, samples_strata_df = create_samples(df, strata_df, window_size, set_name, overlap=overlap[i])
        #strat_sampled_df = strat_random_sample(samples_df, samples_strata_df)

        datasets.append(samples_df)
        strata.append(samples_strata_df)

        # Stuff for printing output
        #gap_mask = df.index.to_series().diff() > pd.Timedelta(minutes=15)
        #segment_ids = gap_mask.cumsum()
        #num_segments = segment_ids.nunique()
        #results[set_names[i]]['num_segments'] = num_segments
        #results[set_names[i]]['samples_df'] = samples_df
        #results[set_names[i]]['strata_df'] = strata_df
        #results[set_names[i]]['samples_strata_df'] = samples_strata_df
        #results[set_names[i]]['strat_sampled_df'] = strat_sampled_df


    """
    # --- Reporting ---
    print('--- DATASET CHECKS ---')
    # How many rows are in each month
    df_month_strata = pd.DataFrame(index=flowdata_df.index)
    df_month_strata['year_month'] = flowdata_df['timestamp'].dt.to_period('M')
    print('Rows per month:')
    print(df_month_strata['year_month'].value_counts().sort_index())

    # How many rows are in each train/val/test set
    for name, df in zip(set_names, dfs):
        print(f'Rows in {name} set:', len(df))

    for name in set_names:
        # How many segments are in each train/val/test set
        print(f'Segments in {name} set:', results[name]['num_segments'])

        # How many samples are created from each train/val/test set
        samples_df = results[name]['samples_df']
        print(f'Samples created from {name} set:', len(samples_df))

        # What are the start times of the samples (first 10)
        if len(samples_df) > 0:
            print(f'Sample start times in {name} set (first 10):', samples_df.index[:10])

        # How many samples are in each strata
        samples_strata_df = results[name]['samples_strata_df']
        if 'strata' in samples_strata_df.columns:
            print(f'Samples per strata in {name} set:')
            print(strata_df['strata'].value_counts())

        # How many samples are in each subset after stratified sampling
        strat_sampled_df = results[name]['strat_sampled_df']
        print(f'Samples after stratified sampling in {name} set:', len(strat_sampled_df))

        # How many unique samples are in each subset after stratified sampling (to check for duplicates)
        unique_samples = strat_sampled_df.drop_duplicates()
        print(f'Unique samples after stratified sampling in {name} set:', len(unique_samples))

        # How many samples are in each strata after stratified sampling (to check for balance)
        if 'strata' in strat_sampled_df.columns:
            print(f'Samples per strata after stratified sampling in {name} set:')
            print(strat_sampled_df['strata'].value_counts())
            """

    return datasets, strata

datasets, strata = preprocess_data(flowdata_df)