In [1]:
import numpy as np
import pandas as pd
from datetime import timedelta
import plotly.express as px

In [2]:
def preprocess_data(df: pd.DataFrame,
                    min_timestamp: str = '2023-04-28') -> pd.DataFrame:
    """
    Preprocess a DynamoDB CSV export into a chronologically sorted
    temperature frame indexed by timestamp.

    The input frame is not modified; a new frame is returned.

    Args:
        df: a dataframe of readings with the columns 'timestamp.S',
            'temperature.S' and (optionally) 'humidity.S' / 'Unnamed: 0'.
        min_timestamp: only rows with a timestamp strictly later than this
            value are kept.

    Returns:
        A dataframe indexed by 'timestamp' (rounded to the nearest minute),
        sorted in ascending time order, without the 'humidity' and
        'Unnamed: 0' columns and without rows containing missing values.
    """
    # rename() returns a new frame, so the caller's dataframe stays untouched.
    out = df.rename(columns={'humidity.S': 'humidity',
                             'temperature.S': 'temperature',
                             'timestamp.S': 'timestamp'})

    # Convert the timestamp column to datetime and round to the nearest minute
    out['timestamp'] = pd.to_datetime(out['timestamp'], format='mixed').dt.round('1min')

    # Remove unnecessary columns; tolerate exports that lack them
    # (e.g. a CSV written without the index column).
    out = out.drop(columns=['Unnamed: 0', 'humidity'], errors='ignore')

    # Keep only readings after the cutoff (rows with unparseable/NaT
    # timestamps compare False and are dropped here as well).
    out = out[out['timestamp'] > min_timestamp]

    # sort_values() is not in-place: the result must be kept, otherwise the
    # downstream steps that assume chronological order see unsorted data.
    out = out.sort_values(by='timestamp', kind='stable')

    return out.set_index('timestamp').dropna()

In [37]:
def augment_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill gaps in the readings with temperatures taken from earlier rows.

    Starting at row `one_day`, consecutive index timestamps are compared.
    Whenever the gap between a row and its successor exceeds
    `time_interval + 15 minutes` (i.e. more than 25 minutes), a new row is
    inserted `time_interval` (10 minutes) after the current row. Its
    temperature is copied from the row located `one_day` positions before
    the insertion point. The inserted row is itself examined on the next
    iteration, so a long gap is filled step by step until the remaining
    gap no longer exceeds the threshold.

    Gaps located in the first `one_day` rows are not filled, because there
    is no earlier row to copy from. Only the 'temperature' column is set on
    inserted rows; any other column is left missing.

    Args:
        df: a chronologically sorted dataframe of temperature readings,
            indexed by timestamp, with a 'temperature' column.

    Returns:
        df: a chronologically sorted dataframe of temperature readings,
        augmented with the inserted rows.
    """

    print(f'Dataframe size before augmentation: {df.shape}')

    original_rows = df.shape[0]
    # NOTE(review): the lookback is positional, not time-based. 240 rows is
    # 24 hours only if there are 10 readings per hour; with the 10-minute
    # spacing used below it would be 40 hours. Confirm the sampling rate.
    one_day = 10 * 24
    time_interval = timedelta(minutes=10)
    gap_threshold = time_interval + timedelta(minutes=15)
    i = one_day

    while i < df.shape[0] - 1:

        current_time = pd.Timestamp(df.index[i])
        next_time = pd.Timestamp(df.index[i + 1])

        if (next_time - current_time) > gap_threshold:

            previous_value_temp = df.iloc[i + 1 - one_day]['temperature']
            # Give the new row's index the same name as the frame's index;
            # otherwise pd.concat drops the index name on the first insert.
            new_index = pd.Index([pd.Timestamp(current_time + time_interval)],
                                 name=df.index.name)
            new_row = pd.DataFrame({'temperature': previous_value_temp}, index=new_index)
            df = pd.concat([df.iloc[:i + 1], new_row, df.iloc[i + 1:]])

        i += 1

    rows_added = df.shape[0] - original_rows
    # Guard against an empty frame, which would otherwise divide by zero.
    pct_added = rows_added * 100 / df.shape[0] if df.shape[0] else 0.0

    print(f'Dataframe size after empty rows added: {df.shape}')
    print(f'Rows added, {rows_added},'
          f'or {np.round(pct_added, 2)}% of the new total.')

    return df


In [38]:
def train_val_test_split(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """
    Split a dataframe by row position into train, validation and test parts.

    The first 60% of the rows form the training set, the next 20% the
    validation set and the final 20% the test set. The rows are taken in
    their existing order, so the frame is assumed to be sorted
    chronologically already.

    Args:
        df: a DataFrame

    Returns:
        df_train, df_test, df_val: a tuple of the train, test, and validation
        data. Note the order: the test set is returned BEFORE the validation
        set, even though the validation rows precede the test rows in `df`.
    """
    n_rows = df.shape[0]

    # Positional boundaries: rows [0, train_index) are train,
    # [train_index, val_index) are validation, the rest is test.
    train_index = int(np.round(n_rows * 0.6))
    val_index = int(np.round(n_rows * 0.8))

    df_train = df.iloc[:train_index]
    df_val = df.iloc[train_index:val_index]
    df_test = df.iloc[val_index:]

    return df_train, df_test, df_val

In [1]:
def run_preprocessing_pipeline(csv_input_path: str)-> list:
    """
    Run the full preprocessing pipeline on a CSV file of readings.

    Reads the CSV, then applies preprocess_data, augment_missing_data and
    train_val_test_split in that order.

    Args:
        csv_input_path: path of the CSV file to read.

    Returns:
        A list [df_train, df_test, df_val] holding the three splits in the
        same order train_val_test_split returns them.
    """
    df_raw = pd.read_csv(csv_input_path)

    df_preprocessed = preprocess_data(df_raw)
    df_augmented = augment_missing_data(df_preprocessed)
    df_train, df_test, df_val = train_val_test_split(df_augmented)

    # Previously the splits were computed and then discarded (the function
    # implicitly returned None despite its `-> list` annotation).
    return [df_train, df_test, df_val]



# run_preprocessing_pipeline('analysis/ddb_output.csv')