In [1]:
import pandas as pd
import torch
import numpy as np
import networkx as nx
from scipy.spatial.distance import euclidean
import torch
from sklearn.model_selection import train_test_split
import hyperparameters as hp
import random

# Show up to 500 rows when a pandas object is displayed
pd.set_option('display.max_rows', 500)

# Location of the flow data file, taken from the project's hyperparameters module
path = hp.FLOWDATA_PATH

In [2]:
def find_long_nan_sections(df, max_gap):
    """
    Flags the missing values that belong to long runs of consecutive NaNs.

    A "run" is a maximal stretch of consecutive rows that are either all missing
    or all present. A missing value is flagged only when its run contains more
    than `max_gap` rows.

    Args:
        df: pandas object to scan; in this notebook it is called with a single
            sensor column.
        max_gap: longest run of consecutive missing rows that is still tolerated
            (counted in rows, not in time).

    Returns:
        Boolean mask aligned with `df`: True for every missing value that sits
        inside a run longer than `max_gap`, False everywhere else.
    """
    is_missing = df.isna()

    # A new run starts wherever the missing/present state differs from the row above
    # (the first row always differs from its shifted, undefined predecessor)
    run_ids = (is_missing != is_missing.shift()).cumsum()

    # Number of missing values in each run, broadcast back onto every row of that run
    missing_per_run = is_missing.groupby(run_ids).transform('sum')

    return is_missing & (missing_per_run > max_gap)

In [4]:
# Load the flow data; the first CSV column holds the timestamps and becomes the index
flowdata_df = pd.read_csv(path, index_col=0)
flowdata_df.index = pd.to_datetime(flowdata_df.index, format='%d/%m/%Y %H:%M')

# Drop the sensor with a large number of missing values, relabel the remaining
# columns via hp.SENSOR_DMA_TO_ID and order the columns by their new labels
flowdata_df = (
    flowdata_df
    .drop(columns='1615')
    .rename(columns=hp.SENSOR_DMA_TO_ID)
    .sort_index(axis=1)
)

# Flag every row that falls inside a long run of missing values for at least one sensor
# NOTE: outlier removal (find_outlier_values) is currently disabled, so only long gaps are removed
rows_to_remove = pd.Series(False, index=flowdata_df.index)
for col in flowdata_df.columns:
    rows_to_remove |= find_long_nan_sections(flowdata_df[col], hp.MAX_GAP)
flowdata_df = flowdata_df.loc[~rows_to_remove]

# Fill the remaining (short) runs of missing values with a cubic spline
flowdata_df = flowdata_df.interpolate(method='spline', order=3)

In [None]:
def month_based_train_val_test_split(flowdata_df, train_val_test_ratios, seed=None):
    """
    Splits a time-indexed dataframe into train / validation / test sets by calendar month.

    Whole (year, month) blocks are shuffled and dealt out to the three sets, so
    every row of a given month ends up in exactly one set.

    Args:
        flowdata_df: dataframe whose timestamps are in a 'timestamp' column if one
            exists, otherwise in its index (as produced by the loading cell).
        train_val_test_ratios: (train, val, test) fractions of the months. The train
            and val month counts are truncated to whole months; the test set receives
            every remaining month, so the test ratio is not used in the arithmetic.
        seed: optional seed for the month shuffle, for reproducible splits. When
            None, the module-level `random` state is used.

    Returns:
        Tuple (train_df, val_df, test_df) of row subsets of `flowdata_df`.
    """
    # The timestamps live in the index of the flow data; reading a 'timestamp'
    # column unconditionally raised KeyError. A column is still honoured if present.
    if 'timestamp' in flowdata_df.columns:
        timestamps = pd.DatetimeIndex(flowdata_df['timestamp'])
    else:
        timestamps = pd.DatetimeIndex(flowdata_df.index)

    # One (year, month) period per row, aligned with the rows of flowdata_df
    year_month = pd.Series(timestamps.to_period('M'), index=flowdata_df.index)

    # Shuffle a plain list of the distinct months (missing timestamps are ignored)
    months = list(year_month.dropna().unique())
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(months)

    # Work out how many months go to each set
    num_months = len(months)
    train_ratio, val_ratio, _ = train_val_test_ratios
    num_months_train = int(num_months * train_ratio)
    num_months_val = int(num_months * val_ratio)
    val_end = num_months_train + num_months_val

    train_months = months[:num_months_train]
    val_months = months[num_months_train:val_end]
    test_months = months[val_end:]

    # Select the rows belonging to each set's months
    train_df = flowdata_df[year_month.isin(train_months)]
    val_df = flowdata_df[year_month.isin(val_months)]
    test_df = flowdata_df[year_month.isin(test_months)]

    return train_df, val_df, test_df

In [None]:
def assign_strata(df):
    """
    Builds a dataframe of stratum labels for every row of a time-indexed dataframe.

    Three categorical columns are derived from the index timestamps:
    - 'time_of_day' from the hour (night / morning / midday / afternoon / evening)
    - 'part_of_week' from the day of week (weekday / weekend)
    - 'season' from the month (winter / spring / summer / autumn)

    A fourth column, 'strata', joins 'part_of_week' and 'season' with an underscore
    (e.g. 'weekday_winter'); 'time_of_day' is not part of the combined label.

    Returns:
        Dataframe sharing the index of `df` with the four columns described above.
    """
    timestamps = df.index

    # (column name, values to bin, half-open bin edges [lo, hi), label per bin)
    # Labels repeat where a category wraps around (late night, December), which is
    # why the bins are cut as unordered categories.
    binning_rules = (
        (
            'time_of_day',
            timestamps.hour,
            [0, 6, 12, 14, 18, 22, 24],
            ['night', 'morning', 'midday', 'afternoon', 'evening', 'night'],
        ),
        (
            'part_of_week',
            timestamps.dayofweek,
            [0, 5, 7],
            ['weekday', 'weekend'],
        ),
        (
            'season',
            timestamps.month,
            [0, 3, 6, 9, 12, 13],
            ['winter', 'spring', 'summer', 'autumn', 'winter'],
        ),
    )

    strata_df = pd.DataFrame(index=timestamps)
    for column, values, edges, names in binning_rules:
        strata_df[column] = pd.cut(
            values,
            bins=edges,
            labels=names,
            right=False,
            include_lowest=True,
            ordered=False
        )

    week_part = strata_df['part_of_week'].astype(str)
    season = strata_df['season'].astype(str)
    strata_df['strata'] = week_part + '_' + season

    return strata_df

In [None]:
def create_samples(df, df_strata, sample_length, overlap):
    """
    Cuts the time series into fixed-length windows ("samples") that never span a gap.

    The data is first divided into continuous segments (a new segment starts wherever
    two consecutive timestamps are more than 15 minutes apart); windows are then taken
    from each segment separately.

    Note:
    - Window starts are fixed multiples of the step within each segment, so samples
      tend to begin at the same times of day. Random start offsets are still to be
      added; the fixed starting points are only meant for testing the model.

    Args:
        df: dataframe with a datetime index that contains the hp.SENSOR_COLS columns.
            It is not modified.
        df_strata: strata for the rows of `df`; passed through unchanged.
        sample_length: number of rows per sample (positive integer).
        overlap: if truthy, consecutive windows overlap by half
            (step = sample_length // 2); otherwise windows are back to back.

    Returns:
        Tuple (samples_df, df_strata). samples_df has one row per window and one
        column per sensor; each cell holds that sensor's values over the window as a
        1-D array. Rows are indexed by the timestamp of the window's first reading so
        that they can be joined to `df_strata` by index.

    Raises:
        ValueError: if sample_length is not positive.
    """
    if sample_length <= 0:
        raise ValueError("sample_length must be a positive number of rows")

    sensor_cols = list(hp.SENSOR_COLS)

    # Integer step: a float step (sample_length / 2) cannot be used as a slice bound
    step = max(1, sample_length // 2) if overlap else sample_length

    # Find all the gaps in the data (where there are missing time steps / it is not continuous)
    gap_mask = df.index.to_series().diff() > pd.Timedelta(minutes=15)

    # Segment id per row; grouping by this array avoids adding a helper column to the caller's df
    segment_ids = gap_mask.cumsum().to_numpy()

    records = []
    start_times = []
    for _, segment in df.groupby(segment_ids):
        # Sensor values as a numpy array for easier positional slicing
        sensor_values = segment[sensor_cols].values

        # Every window start for which a full window still fits inside the segment
        for start in range(0, len(segment) - sample_length + 1, step):
            stop = start + sample_length
            records.append({
                col: sensor_values[start:stop, idx]
                for idx, col in enumerate(sensor_cols)
            })
            start_times.append(segment.index[start])

    # Build the frame once instead of appending row by row
    samples_df = pd.DataFrame(
        records,
        columns=sensor_cols,
        index=pd.Index(start_times, name=df.index.name),
    )

    return samples_df, df_strata

In [None]:
def strat_random_sample(samples_df, strata_df, random_state=None):
    """
    Draws hp.SAMPLE_SIZE samples (with replacement) from every stratum.

    Args:
        samples_df: samples to draw from, one per row.
        strata_df: dataframe sharing index values with `samples_df` that holds the
            stratum label of each sample in a 'strata_label' column or, as produced
            by assign_strata, a 'strata' column.
        random_state: optional seed / random state forwarded to pandas for
            reproducible draws.

    Returns:
        Dataframe with the columns of `samples_df` containing hp.SAMPLE_SIZE rows per
        stratum. Rows can repeat because sampling is done with replacement.

    Raises:
        KeyError: if `strata_df` has neither a 'strata_label' nor a 'strata' column.
    """
    # Locate the label column; assign_strata names it 'strata'
    for candidate in ('strata_label', 'strata'):
        if candidate in strata_df.columns:
            label_col = candidate
            break
    else:
        raise KeyError("strata_df must contain a 'strata_label' or 'strata' column")

    # Attach only the label to each sample so no other strata columns leak into the result
    merged_df = samples_df.merge(strata_df[[label_col]], left_index=True, right_index=True)

    # Perform stratified random sampling: the same number of draws from every stratum
    sampled_df = merged_df.groupby(label_col).sample(
        n=hp.SAMPLE_SIZE, replace=True, random_state=random_state
    )

    # Drop the label column again so only the sample data is returned
    return sampled_df.drop(columns=label_col, errors='ignore')

In [None]:
def preprocess_data(flowdata_df):
    """
    Runs the sample-preparation pipeline on the flow data.

    The data is split by month into train / validation / test sets. For each set the
    rows are labelled with strata, cut into windows of hp.TOTAL_WINDOW rows and then
    resampled per stratum.

    Returns:
        Tuple (datasets, strata): two lists ordered train, validation, test. `datasets`
        holds the stratified samples of each set and `strata` the row-level strata
        dataframe of each set.
    """
    window_size = hp.TOTAL_WINDOW

    train_df, val_df, test_df = month_based_train_val_test_split(flowdata_df, hp.TRAIN_VAL_TEST_SPLIT)

    # Train and validation windows overlap; test windows do not
    splits_with_overlap = zip((train_df, val_df, test_df), (True, True, False))

    datasets, strata = [], []
    for split_df, use_overlap in splits_with_overlap:
        split_strata = assign_strata(split_df)
        # The strata returned by create_samples are not used; the row-level strata are passed on
        windows_df, _ = create_samples(split_df, split_strata, window_size, overlap=use_overlap)
        datasets.append(strat_random_sample(windows_df, split_strata))
        strata.append(split_strata)

    return datasets, strata

# Build the train / validation / test sample sets and their row-level strata from the cleaned flow data
datasets, strata = preprocess_data(flowdata_df)