## Import Packages

In [1]:
import os
import shutil
import duckdb
import yaml
from sklearn.model_selection import train_test_split

## Set Parameters

In [2]:
# Fixed locations: the project configuration file and the DuckDB database
# (folder, file name, and table) that the price data is read from.
CONFIG_FILE = "/workspaces/valuation/config.yaml"
DATA_SOURCE_FOLDER = "/workspaces/valuation/data"
DATA_SOURCE_FILENAME = "stock_prices.duckdb"
DATA_SOURCE_TABLE = "gold_stock_price_labeled"

# Parse the YAML configuration; safe_load only builds plain Python objects.
with open(CONFIG_FILE, 'r') as stream:
    config = yaml.safe_load(stream)

# Run-specific settings taken from the configuration file.
SEED, TICKER = config['seed'], config['ticker']
TEST_SIZE = config['test_size_in_days']  # Number of trailing rows held out as the test set.

# Output folder for the train/test files, one sub-folder per ticker.
DATA_DESTINATION_FOLDER = os.path.join("/workspaces/valuation/data/staging/stocks", TICKER)


## Read Data Source

In [3]:

db_path = os.path.join(DATA_SOURCE_FOLDER, DATA_SOURCE_FILENAME)

# Connect to the DuckDB database file.
# NOTE(review): this cell only reads from the database; consider
# read_only=True so a missing file fails fast instead of being created.
conn = duckdb.connect(database=db_path, read_only=False)

try:
    # The table name comes from a notebook constant and cannot be bound as a
    # parameter, but the ticker value is bound with a `?` placeholder so that
    # quotes or other special characters in the config value cannot break or
    # alter the query.
    df = conn.execute(
        f"SELECT * FROM {DATA_SOURCE_TABLE} WHERE Ticker = ?",
        [TICKER],
    ).fetchdf()
finally:
    # Always release the database file, even when the query fails.
    conn.close()

## Remove Nulls

In [4]:
# Drop every row that contains at least one null value. The result is rebound
# to `df` rather than mutated with inplace=True, which avoids modifying the
# queried frame object in place while leaving downstream cells unchanged.
df = df.dropna()

## Split the data into Train and Test

In [5]:
# Sort chronologically by the date column 'ds' so the tail of the frame holds
# the latest rows.
df_sorted = df.sort_values(by='ds')

# Guard the split size. With negative slicing, TEST_SIZE == 0 would silently
# produce an empty train set and put every row in the test set (-0 == 0), a
# negative value would swap head and tail, and a value >= the row count would
# leave nothing to train on.
n_rows = len(df_sorted)
if not 0 < TEST_SIZE < n_rows:
    raise ValueError(
        f"TEST_SIZE must be between 1 and {n_rows - 1} "
        f"(rows available after null removal: {n_rows}), got {TEST_SIZE!r}"
    )

# Split at an explicit, non-negative position: everything before it is train,
# the last TEST_SIZE rows are test.
split_at = n_rows - TEST_SIZE
train_df = df_sorted.iloc[:split_at]
test_df = df_sorted.iloc[split_at:]



## Save Pandas dataframe to disk

In [6]:
# Build the output paths up front so they are defined whether or not the
# destination folder already exists (previously they were only assigned when
# the folder was already present, so a first run failed with NameError).
train_file = os.path.join(DATA_DESTINATION_FOLDER, 'train.csv')
test_file = os.path.join(DATA_DESTINATION_FOLDER, 'test.csv')

# Create the destination folder (and any missing parents) if needed.
os.makedirs(DATA_DESTINATION_FOLDER, exist_ok=True)

# Save the train and test dataframes to CSV files. to_csv opens the target in
# write mode, so files left over from an earlier run are overwritten.
train_df.to_csv(train_file, index=False)
test_df.to_csv(test_file, index=False)

print(f"Train data saved to {train_file} shaped as {train_df.shape}")
print(f"Test data saved to {test_file} shaped as {test_df.shape}")


Train data saved to /workspaces/valuation/data/staging/stocks/VALE3.SA/train.csv' shaped as (2108, 9)
Test data saved to /workspaces/valuation/data/staging/stocks/VALE3.SA/test.csv shaped as (60, 9)
