## Import Packages

In [1]:
import os

import duckdb
import numpy as np

## Set Parameters

In [2]:
# Folder that the "Read Data Source" cell joins with the database file name.
DATA_SOURCE_FOLDER = "/workspaces/valuation/data/staging/stocks"

# NOTE(review): the four constants below are not referenced by any visible
# cell; they may be leftovers from another notebook — confirm before removing.
DATA_DESTINATION_FOLDER = "/workspaces/valuation/data"
SOURCE_PATH = "historical_prices.csv"
DESTINATION_PATH = "stock_prices.duckdb"
TICKER = "VALE5.SA"

# Number of consecutive rows in each lookback window. It was previously
# undefined; the recorded output of the last cell shows windows of shape
# (samples, 3, 7), which fixes the value at 3.
TIME_STEPS = 3

# NOTE(review): OUTPUT_FILENAME, SOURCE_TABLE, TEST_SIZE and
# NUMPY_DATA_DESTINATION are used by later cells but are not defined in any
# visible cell, so "Restart Kernel -> Run All" fails with NameError. Their
# values cannot be recovered from this notebook — define them here.

## Read Data Source

In [3]:

# Full path of the DuckDB database file that holds the source table.
# NOTE(review): OUTPUT_FILENAME and SOURCE_TABLE are not defined in any visible
# cell — confirm they are set before this cell runs.
db_path = os.path.join(DATA_SOURCE_FOLDER, OUTPUT_FILENAME)

# This cell only reads, so the database is opened read-only: a wrong path then
# raises instead of silently creating an empty database file. The context
# manager closes the connection even when the query fails.
with duckdb.connect(database=db_path, read_only=True) as conn:
    # Load the whole source table into a pandas DataFrame
    df = conn.sql(f"SELECT * FROM {SOURCE_TABLE}").fetchdf()

## Remove Nulls

Remove rows with at least one null value


In [4]:
# Drop, in place, every row (axis="index") in which any column is null
df.dropna(axis="index", how="any", inplace=True)

In [5]:
#ProfileReport(df, title="Profiling Report")

## Feature Engineering

In [6]:
df["DIA_REFER"] = df["DT_REFER"].dt.day
df["MES_REFER"] = df["DT_REFER"].dt.month   
df["ANO_REFER"] = df["DT_REFER"].dt.year

# List of parameter, target must be the first one.
FEATURE_NAMES = ['RECEITA', 'EBIT', 'LAIR', 'PERIODO_MESES', 'DIA_REFER', 'MES_REFER', 'ANO_REFER']

## Create the time steps (lookback)

Each input array (`X_train`, `X_test`) must have the shape:
`(samples, time steps, features)`

In [7]:
# Sort by group and reference date so every window below is in date order
df.sort_values(by=['CD_CVM', 'DT_REFER'], inplace=True)

# Build the sequences one CD_CVM group at a time, so that no window mixes rows
# from two different groups
grouped = df.groupby('CD_CVM')
X_train_list, y_train_list = [], []
X_test_list, y_test_list = [], []

for _, group in grouped:

    # Feature columns of this group as a 2-D array (rows, features)
    data = group[FEATURE_NAMES].values

    # Window i holds the TIME_STEPS rows that precede row i; its target is
    # column 0 (the first entry of FEATURE_NAMES) of row i itself.
    window_ends = range(TIME_STEPS, len(data))
    X_list = [data[i - TIME_STEPS:i] for i in window_ends]
    y_list = [data[i, 0] for i in window_ends]

    # Hold out the last TEST_SIZE windows of the group for testing.
    # The index is clamped at 0: with fewer than TEST_SIZE windows the raw
    # difference is negative, and a negative slice index counts from the end,
    # which would leak part of a too-short group into the training set.
    split_index = max(len(X_list) - TEST_SIZE, 0)
    X_train_list.extend(X_list[:split_index])
    y_train_list.extend(y_list[:split_index])
    X_test_list.extend(X_list[split_index:])
    y_test_list.extend(y_list[split_index:])

# Convert lists to numpy arrays
X_train = np.array(X_train_list, dtype=np.float32)
y_train = np.array(y_train_list, dtype=np.float32)
X_test = np.array(X_test_list, dtype=np.float32)
y_test = np.array(y_test_list, dtype=np.float32)

# Reshape the targets from (n,) to (n, 1)
y_train = np.reshape(y_train, (len(y_train), 1))
y_test = np.reshape(y_test, (len(y_test), 1))

## Normalize the data

In [8]:
# Apply MinMaxScaler
#scaler = MinMaxScaler(feature_range=(0,1))

# Fit the scaler on the training data and transform both train and test data
#X_train_scaled = scaler.fit_transform(X_train.reshape(-1, X_train.shape[-1])).reshape(X_train.shape)
#X_test_scaled = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)



## Save numpy arrays to disk

In [9]:
# Create the output folder if it is missing; exist_ok=True replaces the
# separate exists() check and its check-then-create race.
# NOTE(review): NUMPY_DATA_DESTINATION is not defined in any visible cell —
# confirm it is set before this cell runs.
os.makedirs(NUMPY_DATA_DESTINATION, exist_ok=True)

# Write each array to "<name>.npy" inside the output folder
arrays_to_save = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}
for array_name, array in arrays_to_save.items():
    np.save(os.path.join(NUMPY_DATA_DESTINATION, f'{array_name}.npy'), array)

# Print shapes to verify the split. The arrays are NOT scaled (the
# normalization cell is commented out), so the labels no longer say "scaled".
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train_scaled shape: (9511, 3, 7)
X_test_scaled shape: (1060, 3, 7)
y_train shape: (9511, 1)
y_test shape: (1060, 1)
