In [12]:
import numpy as np
import pandas as pd
from datetime import timedelta

In [6]:
def preprocess_data(df: pd.DataFrame, start_date: str = '2023-04-28') -> pd.DataFrame:
    """
    Takes a CSV from querying DynamoDB (already loaded as a dataframe) and
    preprocesses the data to the required format for input to training.

    The input frame is left untouched; a new frame is returned.

    Steps:
        1. Rename the 'humidity.S', 'temperature.S' and 'timestamp.S' columns
           to 'humidity', 'temperature' and 'timestamp'.
        2. Parse 'timestamp' as datetimes and round to the nearest minute.
        3. Drop the 'Unnamed: 0' and 'humidity' columns (if present).
        4. Sort chronologically.
        5. Keep only rows whose timestamp is strictly later than `start_date`.
        6. Use 'timestamp' as the index and drop rows with missing values.

    Args:
        df: a dataframe of readings, including temperature and humidity.
        start_date: rows with a timestamp at or before this value are
            discarded. Defaults to '2023-04-28'.

    Returns:
        A new dataframe indexed by 'timestamp', sorted chronologically, with
        no missing values.

    Raises:
        KeyError: if the frame has no 'timestamp.S' / 'timestamp' column.
    """
    # rename() without inplace returns a new frame, so the caller's dataframe
    # is never modified and the cell can be re-run safely.
    renamed = df.rename(columns={'humidity.S': 'humidity',
                                 'temperature.S': 'temperature',
                                 'timestamp.S': 'timestamp'})

    # Convert the timestamp column to datetime format, then round to the
    # nearest minute.
    timestamps = pd.to_datetime(renamed['timestamp'], format='mixed').dt.round('1min')

    cleaned = (
        renamed
        .assign(timestamp=timestamps)
        # errors='ignore': a CSV written without its index has no 'Unnamed: 0'.
        .drop(columns=['Unnamed: 0', 'humidity'], errors='ignore')
        # sort_values returns a new frame; the result must be kept. A stable
        # sort preserves the original order of readings that round to the
        # same minute.
        .sort_values(by='timestamp', kind='stable')
    )

    # Remove readings at or before the cut-off date.
    cleaned = cleaned[cleaned['timestamp'] > start_date]

    return cleaned.set_index('timestamp').dropna()

In [53]:
def augment_missing_data(
    df: pd.DataFrame,
    time_interval: timedelta = timedelta(minutes=10),
    tolerance: timedelta = timedelta(minutes=5),
) -> pd.DataFrame:
    """
    Takes a dataframe, with some missing values due to connection issues.
    Augments the data by inserting placeholder rows (temperature = NaN)
    wherever consecutive index timestamps are further apart than
    `time_interval + tolerance`.

    Placeholders are spaced `time_interval` apart, starting `time_interval`
    after the row that precedes the gap, and are added until the remaining
    gap is no longer than `time_interval + tolerance`. The input frame is
    not modified.

    Args:
        df: a dataframe indexed by timestamp, assumed to be in chronological
            order, with a 'temperature' column.
        time_interval: expected spacing between readings.
        tolerance: extra slack allowed on top of `time_interval` before a
            gap is treated as missing data.

    Returns:
        A new dataframe containing the original rows plus the inserted
        placeholder rows, in order.

    Raises:
        ValueError: if `time_interval` is not positive (the gap could never
            be closed).
    """
    if time_interval <= timedelta(0):
        raise ValueError('time_interval must be positive')

    print(f'Dataframe size before augmentation: {df.shape}')

    max_gap = time_interval + tolerance

    # Collect slices of the original frame interleaved with filler frames and
    # concatenate once at the end, instead of rebuilding the whole frame for
    # every inserted row.
    pieces = []
    segment_start = 0

    for pos in range(df.shape[0] - 1):
        current_time = pd.Timestamp(df.index[pos])
        next_time = pd.Timestamp(df.index[pos + 1])

        # Step forward from the current reading until the remaining gap to
        # the next real reading is within the allowed maximum.
        filler_times = []
        while (next_time - current_time) > max_gap:
            current_time = current_time + time_interval
            filler_times.append(current_time)

        if filler_times:
            pieces.append(df.iloc[segment_start:pos + 1])
            # Carry the index name over so concat does not drop it.
            filler_index = pd.DatetimeIndex(filler_times, name=df.index.name)
            pieces.append(pd.DataFrame({'temperature': np.nan}, index=filler_index))
            segment_start = pos + 1

    pieces.append(df.iloc[segment_start:])
    df = pd.concat(pieces)

    print(f'Dataframe size after empty rows added: {df.shape}')

    return df



In [40]:
def train_val_test_split(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]":
    """
    Takes a preprocessed dataframe, assumed to be sorted chronologically,
    and returns three dataframes split into train, validation, and test;
    done chronologically in a 60%/20%/20% split.

    The first 60% of rows form the training set, the next 20% the validation
    set, and the final 20% the test set.

    Args:
        df: a DataFrame

    Returns:
        df_train, df_test, df_val: a tuple of the train, test, and validation
        data. Note the order: the test set is returned BEFORE the validation
        set.
    """
    n_rows = df.shape[0]

    train_index = int(np.round(n_rows * 0.6))
    val_index = int(np.round(n_rows * 0.8))

    # Slice the function's own argument rather than a notebook-global frame,
    # so the result does not depend on hidden kernel state.
    df_train = df.iloc[:train_index]
    df_val = df.iloc[train_index:val_index]
    df_test = df.iloc[val_index:]

    return df_train, df_test, df_val

In [46]:
def test_while():
    """Scratch check of a counted while loop: prints 'Mouse' ten times."""
    remaining = 10

    # Count down instead of up; the body still executes exactly ten times.
    while remaining > 0:
        print('Mouse')
        remaining -= 1


test_while()

Mouse
Mouse
Mouse
Mouse
Mouse
Mouse
Mouse
Mouse
Mouse
Mouse


In [41]:
# Load the raw readings CSV; the path is relative to the notebook's working directory.
df_raw = pd.read_csv('analysis/ddb_output.csv')

# Clean the raw readings into a timestamp-indexed frame.
df_preprocessed = preprocess_data(df_raw)

# Preview the first five rows.
df_preprocessed.head(5)

Unnamed: 0_level_0,temperature
timestamp,Unnamed: 1_level_1
2023-04-28 08:25:00,17.83
2023-04-28 08:26:00,17.85
2023-04-28 08:28:00,17.85
2023-04-28 08:32:00,17.88
2023-04-28 09:18:00,17.88


In [50]:
# Scratch check: build a single row with a NaN temperature, indexed at 2023-05-05 ...
new_row = pd.DataFrame({'temperature': np.nan}, index=[pd.Timestamp('2023-05-05')])

# ... and confirm it concatenates onto the first five preprocessed rows.
pd.concat([df_preprocessed.iloc[:5], new_row])



Unnamed: 0,temperature
2023-04-28 08:25:00,17.83
2023-04-28 08:26:00,17.85
2023-04-28 08:28:00,17.85
2023-04-28 08:32:00,17.88
2023-04-28 09:18:00,17.88
2023-05-05 00:00:00,


In [47]:
# Number of rows in the preprocessed frame.
df_preprocessed.shape[0]

16134

In [54]:
# Insert NaN placeholder rows where consecutive readings are too far apart;
# the function prints the frame's shape before and after.
df_augmented = augment_missing_data(df_preprocessed)

Dataframe size before augmentation: (16134, 1)
Dataframe size after empty rows added: (18011, 1)


In [32]:
# Scratch check: the index label at position 2, wrapped in pd.Timestamp.
pd.Timestamp(df_preprocessed.index[2])

Timestamp('2023-04-28 08:28:00')

In [31]:
# Scratch check: the index label at position 3, without wrapping.
df_preprocessed.index[3]

Timestamp('2023-04-28 08:32:00')