## Import Packages

In [7]:
import os
import shutil
import duckdb
from sklearn.model_selection import train_test_split

## Set Parameters

In [8]:
# Ticker whose rows are selected from the source table.
TICKER = "VALE5.SA"

# Source: the DuckDB database file and the table to read from.
DATA_SOURCE_FOLDER = "/workspaces/valuation/data"
DATA_SOURCE_FILENAME = "stock_prices.duckdb"
DATA_SOURCE_TABLE = "gold_stock_price_labeled"

# Destination folder for train.csv / test.csv.
# NOTE: the save cell deletes and recreates this folder on every run.
DATA_DESTINATION_FOLDER = "/workspaces/valuation/data/staging/stock_price_labeled"

# Split settings handed to train_test_split.
# A float test_size is a fraction of the rows; at this value the 765 rows
# loaded for the ticker split into 764 train rows and a single test row.
# NOTE(review): confirm that a one-row test set is what is intended.
TEST_SIZE = 0.00001
SEED = 1407

## Read Data Source

In [9]:

db_path = os.path.join(DATA_SOURCE_FOLDER, DATA_SOURCE_FILENAME)

# Open the existing database read-only: this cell only runs a SELECT, so it
# needs no write lock, and a mistyped path raises an error instead of silently
# creating a new, empty database file.
conn = duckdb.connect(database=db_path, read_only=True)
try:
    # The ticker is bound as a query parameter rather than spliced into the
    # SQL text. A table name cannot be bound, so it still comes from the
    # DATA_SOURCE_TABLE constant.
    df = conn.execute(
        f"SELECT * FROM {DATA_SOURCE_TABLE} WHERE Ticker = ?",
        [TICKER],
    ).fetchdf()
finally:
    # Release the database file even when the query fails
    conn.close()

In [15]:
# Number of rows loaded for the selected ticker
df.shape[0]

765

## Split the data into Train and Test

In [10]:
# Order the rows by the date column 'ds'
df_sorted = df.sort_values("ds")

# Cut the ordered frame at a single point (no shuffling): the leading rows
# become the train set and the trailing rows the test set.
# random_state only controls shuffling, so it has no effect while shuffle=False.
train_df, test_df = train_test_split(
    df_sorted,
    shuffle=False,
    test_size=TEST_SIZE,
    random_state=SEED,
)


In [14]:
# Number of rows that ended up in the train split
train_df.shape[0]

764

## Save the Train and Test DataFrames to Disk

In [12]:
# Start from an empty destination folder so files from earlier runs cannot
# linger. NOTE: this deletes everything under DATA_DESTINATION_FOLDER.
if os.path.exists(DATA_DESTINATION_FOLDER):
    shutil.rmtree(DATA_DESTINATION_FOLDER)

os.makedirs(DATA_DESTINATION_FOLDER)

# Build each output path once so the path reported below is exactly the path written
train_path = os.path.join(DATA_DESTINATION_FOLDER, 'train.csv')
test_path = os.path.join(DATA_DESTINATION_FOLDER, 'test.csv')

# Saving the train and test dataframes to CSV files (row index is not written)
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

# The original train message ended with a stray apostrophe after "train.csv"
print(f"Train data saved to {train_path}")
print(f"Test data saved to {test_path}")


Train data saved to /workspaces/valuation/data/staging/stock_price_labeled/train.csv'
Test data saved to /workspaces/valuation/data/staging/stock_price_labeled/test.csv
