# Feature extractor

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Callable

In [2]:
def compute_timedelta_ago_load(df: pd.DataFrame, timedelta: pd.Timedelta) -> pd.Series:
    """For each timestamp in the index, look up the load observed `timedelta` ago.

    Each row's index is treated as "now", and `24h_later_load` on that row is
    the load 24 hours after that timestamp. The load `timedelta` before a
    timestamp `t` is therefore read from the row indexed `t - 24h - timedelta`.

    Args:
        df: Dataframe with a `pd.DatetimeIndex` and a `24h_later_load` column.
        timedelta: How far back from each timestamp the load is wanted.

    Returns:
        A series aligned on `df.index` holding the looked-up load, or NaN
        where the source row does not exist in `df`.

    Raises:
        KeyError: If `df` has no `24h_later_load` column.
        TypeError: If `df.index` is not a `pd.DatetimeIndex`.
    """
    # Explicit raises instead of `assert`, which is stripped under `python -O`.
    if '24h_later_load' not in df.columns:
        raise KeyError("df must contain a '24h_later_load' column")
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError('df.index must be a pd.DatetimeIndex')

    ts_to_24h_later_load = df['24h_later_load'].to_dict()

    # The offset is the same for every row, so compute it once.
    lookback = pd.Timedelta(24, 'h') + timedelta

    # NaN (not None) marks a missing source row, matching `compute_stat`.
    return df.index.to_series().apply(
        lambda ts: ts_to_24h_later_load.get(ts - lookback, np.nan)
    )

In [3]:
def compute_stat(df: pd.DataFrame, timedelta: pd.Timedelta, stat: Callable) -> pd.Series:
    """For each timestamp in the index, compute `stat` over the loads of the preceding window.

    Each row's index is treated as "now", and `24h_later_load` on that row is
    the load 24 hours after that timestamp. The loads between `now - timedelta`
    and `now` are therefore read from the rows indexed between
    `now - 24h - timedelta` and `now - 24h`, both bounds included.

    Args:
        df: Dataframe with a `pd.DatetimeIndex` and a `24h_later_load` column.
        timedelta: Length of the window, i.e. how long before each timestamp
            the statistic calculation starts.
        stat: Function applied to the window's values (a numpy array),
            e.g. `np.min`.

    Returns:
        A series aligned on `df.index` holding the statistic, or NaN where
        the window contains no row of `df`.

    Raises:
        KeyError: If `df` has no `24h_later_load` column.
        TypeError: If `df.index` is not a `pd.DatetimeIndex`.
    """
    # Explicit raises instead of `assert`, which is stripped under `python -O`.
    if '24h_later_load' not in df.columns:
        raise KeyError("df must contain a '24h_later_load' column")
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError('df.index must be a pd.DatetimeIndex')

    # Label slicing is only a true time window on a chronologically sorted
    # index; sort once up front (a no-op for already-sorted data) so unsorted
    # input neither raises nor silently slices by position.
    load = df['24h_later_load'].sort_index(kind='mergesort')
    one_day = pd.Timedelta(24, 'h')

    def _window_stat(current_time):
        start_time = current_time - one_day - timedelta
        end_time = current_time - one_day

        window = load.loc[start_time:end_time]
        if window.empty:
            return np.nan

        return stat(window.values)

    return df.index.to_series().apply(_window_stat)

In [4]:
# Read only the `24h_later_load` column of the silver dataset, then preview it
df = pd.read_parquet(
    '../data/silver/df.parquet',
    columns=['24h_later_load'],
)
df.head(3)

Unnamed: 0_level_0,24h_later_load
datetime,Unnamed: 1_level_1
2014-12-14 00:00:00,6131
2014-12-14 01:00:00,5842
2014-12-14 02:00:00,5715


In [5]:
# Calendar features read straight off the datetime index
for calendar_attr in ('year', 'month', 'day', 'hour', 'weekday'):
    df[calendar_attr] = getattr(df.index, calendar_attr)

# Lagged loads: 1h, 2h, 3h, 24h and 7 days before each timestamp
lag_features = [
    ('1h_ago_load', pd.Timedelta(1, 'h')),
    ('2h_ago_load', pd.Timedelta(2, 'h')),
    ('3h_ago_load', pd.Timedelta(3, 'h')),
    ('24h_ago_load', pd.Timedelta(24, 'h')),
    ('7d_ago_load', pd.Timedelta(7, 'd')),
]
for lag_column, lag in lag_features:
    df[lag_column] = compute_timedelta_ago_load(df, timedelta=lag)

# Min / max / median of the load over the trailing 8h, 24h and 7-day windows
stat_features = [
    ('8h_min', pd.Timedelta(8, 'h'), np.min),
    ('8h_max', pd.Timedelta(8, 'h'), np.max),
    ('8h_median', pd.Timedelta(8, 'h'), np.median),
    ('24h_min', pd.Timedelta(24, 'h'), np.min),
    ('24h_max', pd.Timedelta(24, 'h'), np.max),
    ('24h_median', pd.Timedelta(24, 'h'), np.median),
    ('7d_min', pd.Timedelta(7, 'd'), np.min),
    ('7d_max', pd.Timedelta(7, 'd'), np.max),
    ('7d_median', pd.Timedelta(7, 'd'), np.median),
]
for stat_column, window, reducer in stat_features:
    df[stat_column] = compute_stat(df, window, reducer)

# Keep only the timestamps for which every feature could be computed
df = df.dropna()

In [6]:
# Write `df` to a Parquet file under ../data/gold/
df.to_parquet('../data/gold/df.parquet')