## Import Packages

In [25]:
import os
import duckdb
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

## Set Parameters

In [13]:
# Location of the DuckDB file and the table this notebook reads from.
DATA_SOURCE_FOLDER = "/workspaces/valuation/data"
OUTPUT_FILENAME = "dfp.duckdb"
SOURCE_TABLE = "gold_dfp_dre_pivoted"

# Number of past periods (rows) included in each lookback window.
TIME_STEPS = 3

# Fraction of the samples used for training; the remainder is for testing.
# Must be written with a decimal point: `0,8` would create the tuple (0, 8).
TRAIN_SIZE = 0.8

## Read Data Source

In [14]:

db_path = os.path.join(DATA_SOURCE_FOLDER, OUTPUT_FILENAME)

# This cell only queries the database, so open it read-only: a wrong path then
# fails immediately instead of silently creating a new, empty database file.
# The context manager closes the connection even if the query raises.
with duckdb.connect(database=db_path, read_only=True) as conn:
    # SOURCE_TABLE is a notebook-level constant (not user input), so it is
    # interpolated directly; identifiers cannot be bound as query parameters.
    df = conn.sql(f"SELECT * FROM {SOURCE_TABLE}").fetchdf()

## Feature Engineering

In [15]:
# Split the reference date into day / month / year columns.
reference_date = df["DT_REFER"].dt
for column, part in (
    ("DIA_REFER", "day"),
    ("MES_REFER", "month"),
    ("ANO_REFER", "year"),
):
    df[column] = getattr(reference_date, part)

# Columns fed to the model. The target must come first, because the
# sequence-building step reads the label from column 0.
FEATURE_NAMES = [
    "RECEITA",
    "EBIT",
    "LAIR",
    "PERIODO_MESES",
    "DIA_REFER",
    "MES_REFER",
    "ANO_REFER",
]

## Create the time steps (lookback)

In [16]:
def build_sequences(frame, feature_names, time_steps):
    """Build lookback windows and next-step targets for each CD_CVM group.

    Rows are taken in the order they appear in ``frame``, so the caller must
    sort each group chronologically beforehand. Windows never span two
    groups, and groups with ``time_steps`` rows or fewer contribute nothing.

    Args:
        frame: DataFrame holding a ``CD_CVM`` column plus ``feature_names``.
        feature_names: Feature columns to extract; the first one is the target.
        time_steps: Number of past rows in each window (must be >= 1).

    Returns:
        Tuple ``(X, y)``: ``X`` has shape (samples, time_steps, n_features) and
        ``y`` has shape (samples,), holding the first feature of the row that
        follows each window. When no window can be built, correctly shaped
        empty arrays are returned.

    Raises:
        ValueError: If ``time_steps`` is smaller than 1.
    """
    if time_steps < 1:
        raise ValueError("time_steps must be a positive integer")

    windows, targets = [], []
    for _, group in frame.groupby('CD_CVM'):
        values = group[feature_names].values
        for end in range(time_steps, len(values)):
            windows.append(values[end - time_steps:end])
            targets.append(values[end, 0])

    if not windows:
        # np.array([]) would be 1-D; keep the 3-D layout so downstream code
        # can rely on X.shape[1] and X.shape[2] even with no samples.
        return np.empty((0, time_steps, len(feature_names))), np.empty((0,))

    return np.array(windows), np.array(targets)


# Order rows chronologically within each company so every window is a run of
# consecutive periods.
df = df.sort_values(by=['CD_CVM', 'DT_REFER'])

# X: (samples, TIME_STEPS, n_features); y: (samples,)
X, y = build_sequences(df, FEATURE_NAMES, TIME_STEPS)

## Shape the data for LSTM

The shape must be:
(samples, time steps, features)

In [22]:
# The LSTM expects input shaped (samples, time steps, features).
# As long as at least one window was built, X already has that layout: it is a
# stack of (TIME_STEPS, n_features) slices. The reshape below targets X's own
# shape, so it would change nothing and is kept disabled for reference only.
#X = np.reshape(X, (X.shape[0], X.shape[1], X.shape[2]))