# Vanilla

In [2]:
import pandas as pd
import os
from tsfresh import extract_relevant_features
from tsfresh.examples.robot_execution_failures import (
    download_robot_execution_failures, load_robot_execution_failures
)

os.environ["http_proxy"] = "http://127.0.0.1:10808"
os.environ["https_proxy"] = "http://127.0.0.1:10808"
download_robot_execution_failures()

timeseries, y = load_robot_execution_failures()

In [5]:
timeseries

Unnamed: 0,id,time,F_x,F_y,F_z,T_x,T_y,T_z
0,1,0,-1,-1,63,-3,-1,0
1,1,1,0,0,62,-3,-1,0
2,1,2,-1,-1,61,-3,0,0
3,1,3,-1,-1,63,-2,-1,0
4,1,4,-1,-1,63,-3,-1,0
...,...,...,...,...,...,...,...,...
1315,88,10,-10,2,39,-21,-24,5
1316,88,11,-11,2,38,-24,-22,6
1317,88,12,-12,3,23,-24,-24,5
1318,88,13,-13,4,26,-29,-27,5


In [39]:
type(y)

pandas.core.series.Series

In [6]:
y

1      True
2      True
3      True
4      True
5      True
      ...  
84    False
85    False
86    False
87    False
88    False
Length: 88, dtype: bool

In [7]:
features = extract_relevant_features(timeseries, y, column_id="id", column_sort="time")

Feature Extraction: 100%|██████████| 38/38 [00:02<00:00, 14.13it/s]


In [8]:
features

Unnamed: 0,F_x__value_count__value_-1,F_x__abs_energy,F_x__root_mean_square,T_y__absolute_maximum,F_x__mean_n_absolute_max__number_of_maxima_7,F_x__range_count__max_1__min_-1,F_y__root_mean_square,F_y__abs_energy,F_y__mean_n_absolute_max__number_of_maxima_7,T_y__standard_deviation,...,"T_x__change_quantiles__f_agg_""var""__isabs_True__qh_0.2__ql_0.0","F_z__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.8",T_x__quantile__q_0.1,F_y__has_duplicate_max,T_y__lempel_ziv_complexity__bins_3,T_y__quantile__q_0.1,F_z__time_reversal_asymmetry_statistic__lag_1,F_x__quantile__q_0.2,F_y__quantile__q_0.7,"T_x__change_quantiles__f_agg_""var""__isabs_False__qh_0.2__ql_0.0"
1,14.0,14.0,0.966092,1.0,1.000000,15.0,0.930949,13.0,1.000000,0.471405,...,0.000000,0.0,-3.0,1.0,0.400000,-1.0,-5.960000e+02,-1.0,-1.0,0.000000
2,7.0,25.0,1.290994,5.0,1.571429,13.0,2.250926,76.0,3.000000,2.054805,...,0.000000,1.0,-9.2,1.0,0.533333,-3.6,-6.803846e+02,-1.0,-1.0,0.000000
3,11.0,12.0,0.894427,5.0,1.000000,14.0,1.632993,40.0,2.142857,1.768867,...,0.000000,3.0,-6.6,0.0,0.533333,-4.0,-6.170000e+02,-1.0,0.0,0.000000
4,5.0,16.0,1.032796,6.0,1.285714,10.0,2.000000,60.0,2.428571,2.669998,...,0.000000,0.0,-9.0,0.0,0.533333,-4.6,3.426308e+03,-1.0,1.0,0.000000
5,9.0,17.0,1.064581,5.0,1.285714,13.0,1.751190,46.0,2.285714,2.039608,...,0.000000,0.0,-9.6,0.0,0.466667,-5.0,-2.609000e+03,-1.0,0.8,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,0.0,96833.0,80.346334,167.0,105.285714,0.0,53.404120,42780.0,71.428571,39.541483,...,64.000000,46.0,203.2,0.0,0.533333,36.4,-7.700628e+07,-105.0,66.8,64.000000
85,0.0,1683.0,10.592450,14.0,13.714286,0.0,10.076375,1523.0,12.142857,3.841296,...,4.666667,4.5,-41.6,0.0,0.466667,1.0,-1.050785e+04,5.8,10.6,13.555556
86,0.0,83497.0,74.608757,191.0,98.142857,0.0,37.473546,21064.0,47.714286,52.807154,...,0.250000,7.0,-84.8,0.0,0.466667,19.6,-5.544922e+06,30.4,38.4,0.250000
87,0.0,1405437.0,306.097697,471.0,340.000000,0.0,143.447551,308658.0,157.285714,80.098162,...,0.000000,90.5,-139.2,0.0,0.466667,272.6,-9.881845e+07,246.8,154.8,0.000000


# Using Stock Data

In [17]:
import os
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from tsfresh import extract_relevant_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame, roll_time_series

In [4]:
load_dotenv()  # take environment variables from .env.

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
db_url = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

alchemyEngine = create_engine(
    db_url,
    pool_recycle=3600,
)

In [67]:
# query = """
# with cte as (
# SELECT "date", "open", "close", high, low, volume, amount, open_preclose_rate, high_preclose_rate, low_preclose_rate, vol_change_rate, amt_change_rate, change_rate
# FROM index_daily_em_view
# where symbol = '399673'
# order by date desc
# limit 1200
# ) select * from cte order by date
# """
query = """
with cte as (
SELECT "date", "open", "close", high, low, volume, amount, open_preclose_rate, high_preclose_rate, low_preclose_rate, vol_change_rate, amt_change_rate, change_rate
FROM index_daily_em_view 
where symbol = '000688'
and change_rate is not null
) select * from cte order by date
"""

raw_df = pd.read_sql(query, alchemyEngine, parse_dates=["date"])

In [69]:
df = raw_df.rename(columns={"date": "ds", "change_rate": "y"})
df.insert(0, "unique_id", "000688")

In [70]:
df[["ds", "y"]]

Unnamed: 0,ds,y
0,2020-01-02,1.97200
1,2020-01-03,0.46679
2,2020-01-06,0.27331
3,2020-01-07,1.53804
4,2020-01-08,0.77463
...,...,...
1245,2025-02-26,2.16883
1246,2025-02-27,-0.04704
1247,2025-02-28,-4.21650
1248,2025-03-03,-1.52484


In [64]:
rts = roll_time_series(df[:-1], column_id="unique_id", column_sort="ds", max_timeshift=20)

Rolling: 100%|██████████| 20/20 [00:27<00:00,  1.38s/it]


In [65]:
len(rts)

54432

In [66]:
rts

Unnamed: 0,unique_id,ds,open,close,high,low,volume,amount,open_preclose_rate,high_preclose_rate,low_preclose_rate,vol_change_rate,amt_change_rate,y,id
0,399673,2014-06-19,1327.54,1281.12,1335.44,1269.53,4532795.0,9.384841e+09,-0.09783,0.49667,-4.46329,15.08537,11.23805,-3.59110,"(399673, 2014-06-19 00:00:00)"
1,399673,2014-06-19,1327.54,1281.12,1335.44,1269.53,4532795.0,9.384841e+09,-0.09783,0.49667,-4.46329,15.08537,11.23805,-3.59110,"(399673, 2014-06-20 00:00:00)"
2,399673,2014-06-20,1281.14,1294.98,1297.99,1273.18,3111206.0,6.570250e+09,0.00156,1.31682,-0.61977,-31.36231,-29.99082,1.08187,"(399673, 2014-06-20 00:00:00)"
3,399673,2014-06-19,1327.54,1281.12,1335.44,1269.53,4532795.0,9.384841e+09,-0.09783,0.49667,-4.46329,15.08537,11.23805,-3.59110,"(399673, 2014-06-23 00:00:00)"
4,399673,2014-06-20,1281.14,1294.98,1297.99,1273.18,3111206.0,6.570250e+09,0.00156,1.31682,-0.61977,-31.36231,-29.99082,1.08187,"(399673, 2014-06-23 00:00:00)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54427,399673,2025-02-25,2209.17,2221.68,2248.58,2207.44,22077627.0,1.002814e+11,-1.78367,-0.03157,-1.86059,-18.75417,-17.66117,-1.22750,"(399673, 2025-03-03 00:00:00)"
54428,399673,2025-02-26,2229.51,2251.65,2252.12,2214.64,24762714.0,1.108523e+11,0.35244,1.37013,-0.31688,12.16203,10.54127,1.34898,"(399673, 2025-03-03 00:00:00)"
54429,399673,2025-02-27,2252.62,2240.61,2265.28,2209.42,26291797.0,1.127419e+11,0.04308,0.60533,-1.87551,6.17494,1.70459,-0.49031,"(399673, 2025-03-03 00:00:00)"
54430,399673,2025-02-28,2220.22,2151.85,2222.84,2145.77,23635027.0,1.032536e+11,-0.91002,-0.79309,-4.23278,-10.10494,-8.41592,-3.96142,"(399673, 2025-03-03 00:00:00)"


In [43]:
# rts_mod = rts.copy()
y_series = df[["ds", "y"]].copy()
y_series.loc[:, "target"] = y_series["y"].shift(-1)
y_series = y_series.dropna(subset=["target"])
y_series = y_series[["ds", "target"]]
rts_mod = rts.merge(y_series, on="ds", how="left")
rts_mod = rts_mod.dropna(subset=["target"])

In [44]:
rts_mod[["ds", "y", "target", "id"]]

Unnamed: 0,ds,y,target,id
0,2014-06-19,-3.59110,1.08187,"(399673, 2014-06-26 00:00:00)"
1,2014-06-20,1.08187,2.25177,"(399673, 2014-06-26 00:00:00)"
2,2014-06-23,2.25177,0.54677,"(399673, 2014-06-26 00:00:00)"
3,2014-06-24,0.54677,-0.64595,"(399673, 2014-06-26 00:00:00)"
4,2014-06-25,-0.64595,2.33750,"(399673, 2014-06-26 00:00:00)"
...,...,...,...,...
54265,2025-02-14,1.52612,0.90691,"(399673, 2025-02-20 00:00:00)"
54266,2025-02-17,0.90691,-1.85116,"(399673, 2025-02-20 00:00:00)"
54267,2025-02-18,-1.85116,1.79514,"(399673, 2025-02-20 00:00:00)"
54268,2025-02-19,1.79514,-0.27519,"(399673, 2025-02-20 00:00:00)"


In [45]:
rts_mod

Unnamed: 0,unique_id,ds,open,close,high,low,volume,amount,open_preclose_rate,high_preclose_rate,low_preclose_rate,vol_change_rate,amt_change_rate,y,id,target
0,399673,2014-06-19,1327.54,1281.12,1335.44,1269.53,4532795.0,9.384841e+09,-0.09783,0.49667,-4.46329,15.08537,11.23805,-3.59110,"(399673, 2014-06-26 00:00:00)",1.08187
1,399673,2014-06-20,1281.14,1294.98,1297.99,1273.18,3111206.0,6.570250e+09,0.00156,1.31682,-0.61977,-31.36231,-29.99082,1.08187,"(399673, 2014-06-26 00:00:00)",2.25177
2,399673,2014-06-23,1295.94,1324.14,1330.24,1295.94,3615497.0,7.837101e+09,0.07413,2.72282,0.07413,16.20886,19.28164,2.25177,"(399673, 2014-06-26 00:00:00)",0.54677
3,399673,2014-06-24,1322.55,1331.38,1335.14,1320.99,3265263.0,6.941363e+09,-0.12008,0.83073,-0.23789,-9.68702,-11.42946,0.54677,"(399673, 2014-06-26 00:00:00)",-0.64595
4,399673,2014-06-25,1330.12,1322.78,1330.70,1310.50,3214760.0,6.779457e+09,-0.09464,-0.05107,-1.56830,-1.54667,-2.33248,-0.64595,"(399673, 2014-06-26 00:00:00)",2.33750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54265,399673,2025-02-14,2162.53,2198.67,2209.31,2162.32,24836851.0,1.009813e+11,-0.14268,2.01744,-0.15238,4.04892,-0.00564,1.52612,"(399673, 2025-02-20 00:00:00)",0.90691
54266,399673,2025-02-17,2225.66,2218.61,2231.44,2200.49,27426155.0,1.113471e+11,1.22756,1.49045,0.08278,10.42525,10.26512,0.90691,"(399673, 2025-02-20 00:00:00)",-1.85116
54267,399673,2025-02-18,2219.44,2177.54,2228.14,2169.32,25386675.0,1.028330e+11,0.03741,0.42955,-2.22166,-7.43626,-7.64648,-1.85116,"(399673, 2025-02-20 00:00:00)",1.79514
54268,399673,2025-02-19,2168.56,2216.63,2218.17,2167.93,21016802.0,8.969812e+10,-0.41239,1.86587,-0.44132,-17.21325,-12.77300,1.79514,"(399673, 2025-02-20 00:00:00)",-0.27519


In [46]:
y_series = rts_mod.groupby("id")["target"].last()
x_series = rts_mod.drop(columns=["unique_id", "target"])

In [47]:
y_series

id
(399673, 2014-06-26 00:00:00)    0.15809
(399673, 2014-06-27 00:00:00)    0.97504
(399673, 2014-06-30 00:00:00)    0.46528
(399673, 2014-07-01 00:00:00)    0.33517
(399673, 2014-07-02 00:00:00)   -0.47173
                                  ...   
(399673, 2025-02-14 00:00:00)    0.90691
(399673, 2025-02-17 00:00:00)   -1.85116
(399673, 2025-02-18 00:00:00)    1.79514
(399673, 2025-02-19 00:00:00)   -0.27519
(399673, 2025-02-20 00:00:00)    2.53876
Name: target, Length: 2590, dtype: float64

In [48]:
features = extract_relevant_features(x_series, y_series, column_id="id", column_sort="ds")

Feature Extraction: 100%|██████████| 20/20 [11:04<00:00, 33.23s/it]


In [49]:
features

Unnamed: 0,Unnamed: 1,low_preclose_rate__variance_larger_than_standard_deviation,high_preclose_rate__variance_larger_than_standard_deviation
399673,2014-06-26,1.0,1.0
399673,2014-06-27,1.0,0.0
399673,2014-06-30,1.0,0.0
399673,2014-07-01,1.0,0.0
399673,2014-07-02,1.0,0.0
399673,...,...,...
399673,2025-02-14,0.0,1.0
399673,2025-02-17,0.0,1.0
399673,2025-02-18,0.0,1.0
399673,2025-02-19,0.0,1.0


In [66]:
# Reset the second level of the index
features.reset_index().rename(columns={'level_1': 'ds'}).drop("level_0", axis=1)

ds                                                             datetime64[ns]
low_preclose_rate__variance_larger_than_standard_deviation            float64
high_preclose_rate__variance_larger_than_standard_deviation           float64
dtype: object

In [65]:
features.dtypes

low_preclose_rate__variance_larger_than_standard_deviation     float64
high_preclose_rate__variance_larger_than_standard_deviation    float64
dtype: object

In [53]:
features.index.names

FrozenList([None, None])

In [52]:
features.reset_index()

Unnamed: 0,level_0,level_1,low_preclose_rate__variance_larger_than_standard_deviation,high_preclose_rate__variance_larger_than_standard_deviation
0,399673,2014-06-26,1.0,1.0
1,399673,2014-06-27,1.0,0.0
2,399673,2014-06-30,1.0,0.0
3,399673,2014-07-01,1.0,0.0
4,399673,2014-07-02,1.0,0.0
...,...,...,...,...
2585,399673,2025-02-14,0.0,1.0
2586,399673,2025-02-17,0.0,1.0
2587,399673,2025-02-18,0.0,1.0
2588,399673,2025-02-19,0.0,1.0


In [57]:
ts_df = (features.reset_index().rename(columns={"level_0": "symbol", "level_1": "date"}))

In [58]:
ts_df

Unnamed: 0,symbol,date,low_preclose_rate__variance_larger_than_standard_deviation,high_preclose_rate__variance_larger_than_standard_deviation
0,399673,2014-06-26,1.0,1.0
1,399673,2014-06-27,1.0,0.0
2,399673,2014-06-30,1.0,0.0
3,399673,2014-07-01,1.0,0.0
4,399673,2014-07-02,1.0,0.0
...,...,...,...,...
2585,399673,2025-02-14,0.0,1.0
2586,399673,2025-02-17,0.0,1.0
2587,399673,2025-02-18,0.0,1.0
2588,399673,2025-02-19,0.0,1.0


In [59]:
melt_ts_df = ts_df.melt(id_vars=["symbol", "date"], var_name="feature", value_name="value")

In [60]:
melt_ts_df

Unnamed: 0,symbol,date,feature,value
0,399673,2014-06-26,low_preclose_rate__variance_larger_than_standa...,1.0
1,399673,2014-06-27,low_preclose_rate__variance_larger_than_standa...,1.0
2,399673,2014-06-30,low_preclose_rate__variance_larger_than_standa...,1.0
3,399673,2014-07-01,low_preclose_rate__variance_larger_than_standa...,1.0
4,399673,2014-07-02,low_preclose_rate__variance_larger_than_standa...,1.0
...,...,...,...,...
5175,399673,2025-02-14,high_preclose_rate__variance_larger_than_stand...,1.0
5176,399673,2025-02-17,high_preclose_rate__variance_larger_than_stand...,1.0
5177,399673,2025-02-18,high_preclose_rate__variance_larger_than_stand...,1.0
5178,399673,2025-02-19,high_preclose_rate__variance_larger_than_stand...,1.0


In [28]:
df["target"] = df['y'].shift(-1)

In [30]:
df = df.dropna(subset=['target'])

In [37]:
X = df.drop(columns=['target'])
y = df['target']

In [38]:
features = extract_relevant_features(X, y, column_id="unique_id", column_sort="ds")
features

ValueError: The following ids are in the time series container but are missing in y: {'399673'}

In [24]:
input_df = df[["y"]]

In [25]:
input_df

Unnamed: 0,y
0,-3.59110
1,1.08187
2,2.25177
3,0.54677
4,-0.64595
...,...
2591,0.90691
2592,-1.85116
2593,1.79514
2594,-0.27519


In [26]:
ts_x, ts_y = make_forecasting_frame(input_df, kind="y", rolling_direction=1, max_timeshift=50)

ValueError: Data must be 1-dimensional, got ndarray of shape (2596, 1) instead

# Debug

In [6]:
import pandas as pd

y = pd.read_pickle("/Users/jz/Downloads/y_20250309205918.pkl")
x = pd.read_pickle("/Users/jz/Downloads/x_20250309205918.pkl")

In [7]:
x

Unnamed: 0,ds,y,vol_change_rate,amt_change_rate,open,close,high,low,volume,amount,open_preclose_rate,high_preclose_rate,low_preclose_rate,id
0,2020-01-02,1.97200,44.69363,36.62403,1005.62,1019.72,1019.91,1005.62,2193852.0,9.315105e+09,0.56200,1.99100,0.56200,"(000688, 2020-01-02 00:00:00)"
1,2020-01-02,1.97200,44.69363,36.62403,1005.62,1019.72,1019.91,1005.62,2193852.0,9.315105e+09,0.56200,1.99100,0.56200,"(000688, 2020-01-03 00:00:00)"
2,2020-01-03,0.46679,-1.51647,3.88898,1021.67,1024.48,1029.07,1018.68,2160583.0,9.677367e+09,0.19123,0.91692,-0.10199,"(000688, 2020-01-03 00:00:00)"
3,2020-01-02,1.97200,44.69363,36.62403,1005.62,1019.72,1019.91,1005.62,2193852.0,9.315105e+09,0.56200,1.99100,0.56200,"(000688, 2020-01-06 00:00:00)"
4,2020-01-03,0.46679,-1.51647,3.88898,1021.67,1024.48,1029.07,1018.68,2160583.0,9.677367e+09,0.19123,0.91692,-0.10199,"(000688, 2020-01-06 00:00:00)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25993,2025-02-24,0.48229,-7.56070,-11.57092,1093.03,1100.05,1110.06,1084.57,47334072.0,1.893881e+11,-0.15894,1.39664,-0.93170,"(000688, 2025-02-28 00:00:00)"
25994,2025-02-25,0.25908,-6.98244,-4.21336,1083.94,1102.90,1119.30,1079.49,44028999.0,1.814085e+11,-1.46448,1.74992,-1.86901,"(000688, 2025-02-28 00:00:00)"
25995,2025-02-26,2.16883,9.44743,6.62402,1108.58,1126.82,1129.63,1093.27,48188607.0,1.934250e+11,0.51501,2.42361,-0.87315,"(000688, 2025-02-28 00:00:00)"
25996,2025-02-27,-0.04704,0.46803,-0.17962,1127.25,1126.29,1140.37,1101.52,48414146.0,1.930776e+11,0.03816,1.20250,-2.24526,"(000688, 2025-02-28 00:00:00)"


In [8]:
y

id
(000688, 2020-01-02 00:00:00)    0.46679
(000688, 2020-01-03 00:00:00)    0.27331
(000688, 2020-01-06 00:00:00)    1.53804
(000688, 2020-01-07 00:00:00)    0.77463
(000688, 2020-01-08 00:00:00)    2.58286
                                  ...   
(000688, 2025-02-24 00:00:00)    0.25908
(000688, 2025-02-25 00:00:00)    2.16883
(000688, 2025-02-26 00:00:00)   -0.04704
(000688, 2025-02-27 00:00:00)   -4.21650
(000688, 2025-02-28 00:00:00)   -1.52484
Name: target, Length: 1248, dtype: float64

In [11]:
features = extract_relevant_features(x, y, column_id="id", column_sort="ds")

Feature Extraction: 100%|██████████| 40/40 [00:51<00:00,  1.29s/it]


In [12]:
features

Unnamed: 0,Unnamed: 1
688,2020-01-02
688,2020-01-03
688,2020-01-06
688,2020-01-07
688,2020-01-08
688,...
688,2025-02-24
688,2025-02-25
688,2025-02-26
688,2025-02-27
