In [1]:
import pandas as pd

from tsfresh.utilities.dataframe_functions import roll_time_series

In [2]:
def map_timeId(datetime: pd.Series) -> pd.Series:
    """
    Maps datetime to an integer time_id
    :param datetime: pandas Series with input datetime
    :return: pandas Series integer index of increasing time
    """
    dates = [str(i) for i in sorted(datetime.unique())]
    ids = list(range(1, len(dates) + 1))
    dict_map = {dates[key]: ids[key] for key in range(len(dates))}
    return datetime.map(lambda x: dict_map[str(x)])

In [3]:
df = pd.read_csv('../timeseries_toolkit/tests/fixtures/test_ts_data.csv')

feature_win = 5
forecast_horizon = 3
target_col = 'target'

df['datetime'] = pd.to_datetime(df['datetime'], format='%d/%m/%Y').dt.date
df['timeID'] = map_timeId(df['datetime'])
df['kind'] = df[id_col]
df = df.sort_values(by=[id_col, 'datetime'])
# df['target_shift'] = df.groupby(id_col)[target_col].shift(-forecast_horizon)

In [4]:
df_target = df[[id_col, 'datetime', 'target']]

df_target['target_shift'] = df_target.groupby(id_col)['target'].shift(-forecast_horizon)

df_target = df_target.rename(columns={'datetime': 'ref_date'})

df_target.drop('target', 1, inplace=True)

# create sub time series windows 
df_rolled = roll_time_series(df, column_id=id_col,column_sort='timeID',
                             column_kind='kind', rolling_direction=1, 
                             max_timeshift=feature_win-1)

df_rolled = df_rolled.rename(columns={id_col: 'winID', 'kind': id_col})

cols = list(df_rolled.columns.values)
first_cols = [id_col, 'winID', 'timeID', 'datetime']

remaining_cols = sorted(list(set(cols) - set(first_cols)))

cols = first_cols + remaining_cols

df_rolled = df_rolled[cols].sort_values(by=[id_col, 'winID', 'timeID']).reset_index(drop=True)

df_rolled['ref_date'] = df_rolled.groupby([id_col, 'winID'])['datetime'].transform('last')

df_rolled_full = pd.merge(df_rolled, df_target, how='left', on=[id_col, 'ref_date'])

df_rolled_full = df_rolled_full[df_rolled_full.groupby([id_col, 'winID'])['timeID'].transform(len)==feature_win]

df_rolled_full.dropna(subset=['target_shift'], inplace=True)

cols = list(df_rolled_full.columns)

first_cols = [id_col, 'ref_date', 'winID', 'datetime', 'timeID', 'target_shift']
remaining_cols = list(set(cols) - set(first_cols))
cols = first_cols + sorted(remaining_cols)

df_rolled_full = df_rolled_full[cols]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
df_rolled_full

Unnamed: 0,id,ref_date,winID,datetime,timeID,target_shift,feature_1,feature_2,feature_3,feature_4,target
10,AXP,2017-01-09,5,2017-01-03,1.0,76.88,74.89,75.75,74.74,5853881.0,75.35
11,AXP,2017-01-09,5,2017-01-04,2.0,76.88,75.26,76.55,75.06,4635796.0,76.26
12,AXP,2017-01-09,5,2017-01-05,3.0,76.88,76.00,76.18,74.82,3383003.0,75.32
13,AXP,2017-01-09,5,2017-01-06,4.0,76.88,75.40,75.92,75.06,3089593.0,75.47
14,AXP,2017-01-09,5,2017-01-09,5.0,76.88,76.14,76.50,75.53,4818703.0,75.86
15,AXP,2017-01-10,6,2017-01-04,2.0,76.62,75.26,76.55,75.06,4635796.0,76.26
16,AXP,2017-01-10,6,2017-01-05,3.0,76.62,76.00,76.18,74.82,3383003.0,75.32
17,AXP,2017-01-10,6,2017-01-06,4.0,76.62,75.40,75.92,75.06,3089593.0,75.47
18,AXP,2017-01-10,6,2017-01-09,5.0,76.62,76.14,76.50,75.53,4818703.0,75.86
19,AXP,2017-01-10,6,2017-01-10,6.0,76.62,76.52,78.00,76.12,9561414.0,76.65


In [None]:
aggregations = {'showroom_event': {'sum': sum, 'count_stuff': 'count'},
                'car_show':sum,
                'prius_promotion':sum,
                'spade_promotion':sum,
                'noah_promotion':sum,
                'alphard_promotion':sum,
                'sienta_promotion':sum,
                'vellfire_promotion':sum,
                'ractis_promotion':sum,
                'new_user_unique_visitors':'mean',
                'engagement_>_3_mins':'mean',
                'production':'mean',
                'show room footfall': 'mean',
                'telephone enquiries':'mean',
                'internet enquiries':'mean',
                'leads':'mean',
                'test drives':'mean'
               }

df_aggregated = df_rolled.groupby(['parentmodelcode', 'ref_date']).agg(aggregations)

df_aggregated.reset_index(inplace=True)

# Rename columns
df_aggregated.columns = [i[0] + '_' + i[1] if len(i)==2 else i for i in df_aggregated.columns ]

df_aggregated.columns = [i[:-1] if i[-1]=='_' else i for i in df_aggregated.columns]

In [None]:
def ml_preproc(df, forecast_horizon, feature_win, tsfresh_list):

    assert len(df) >= forecast_horizon + feature_win

    # tsfresh-compliant pre-processing
    df = pd.melt(df.reset_index(), id_vars=['index'], value_vars=df.columns).rename(columns={'index': 'time', 
                                                                                             'variable': id_col})
    df['kind'] = df[id_col]
    df = df[[id_col, 'time', 'kind', 'value']]
    # tsfresh rolling window transformation
    df = roll_time_series(df_or_dict=df, column_id=id_col, column_sort='time', 
                     column_kind='kind', rolling_direction=1, max_timeshift=feature_win+forecast_horizon)
    df = df.sort_values(by=['kind', id_col])
    # Extract target variable from transformed data
    targets = df[df.time == df.id]
    targets[id_col] = targets.apply(lambda x: x['kind'] + '_' + str(x[id_col]),1)
    targets.drop(['time', 'kind'], 1, inplace=True)
    targets.rename(columns={'value': 'target'}, inplace=True)
    # Filter rolling window df to contain only desirable dates
    df['months_diff'] = np.round((df.id - df.time)/ np.timedelta64(1, 'M'))
    df = df[df.months_diff >= forecast_horizon]
    df = df[df.months_diff <= (feature_win + forecast_horizon - 1)]
    df.drop('months_diff', 1, inplace=True)
    # More tsfresh-related preprocessing
    df[id_col] = df.apply(lambda x: x['kind'] + '_' + str(x[id_col]),1)
    df['kind'] = 'Volume'
    # Only keep windows that have total observations equal to feature_win
    df['size'] = df.groupby(id_col)['kind'].transform('size')
    df = df[df['size'] >= feature_win]
    df.drop('size',1, inplace=True)
    # Apply within timeseries minmax scaling and store fitted scalers
    scaling = pd.DataFrame()
    scaling['scaling_min'] = df.groupby(id_col)['value'].min()
    scaling['scaling_max'] = df.groupby(id_col)['value'].max()
    scaling = scaling.reset_index()
    df['value'] = df.groupby(id_col)['value'].apply(lambda x: (x - x.min())/(x.max() - x.min()))
    df['value'] = df['value'].fillna(0)
    # Apply tsfresh feature extraction on pre-processed rolling window data
    features = extract_features(df, column_id="id", column_sort="time", column_value="value", column_kind='kind', 
                               default_fc_parameters=tsfresh_list)
    features = features.reset_index()
    features['index'] = features.id.map(lambda x: x.split('_')[0])
    # Incorporate scaler info into feature space
    features = features.merge(scaling, how='left')
    features.drop('index', 1, inplace=True)
    # Also use last observed timeseries value as a feature
    features['last_value'] = df.groupby(id_col)['value'].last().tolist()
    # Reject tsfresh features that are constant
    features = features.loc[:, features.apply(pd.Series.nunique) != 1] 
    # nan/inf value handling
    features.replace([np.inf, -np.inf], np.nan, inplace=True)
    features.fillna(0, inplace=True)
    features.set_index(id_col, inplace=True)
    # Ensure targets map 1-1 with features
    targets = targets[targets[id_col].isin(features.index)]
    # Apply fitted scalers to target timeseries values (done this way to avoid leakage) 
    targets = targets.merge(scaling, how='left')
    targets['target'] = (targets['target'] - targets['scaling_min'])/(targets['scaling_max'] - targets['scaling_min'])
    targets['target'] = targets['target'].replace(np.inf, np.nan).fillna(0)
    targets.set_index(id_col, inplace=True)
    targets = targets[['target']]
    assert(len(features)==len(targets))
    
    return features, targets