## Import Packages

In [1]:
import os
import shutil
import duckdb
import yaml
from sklearn.model_selection import train_test_split

## Set Parameters

In [6]:
# Where the run configuration and the DuckDB source data live.
CONFIG_FILE = "/workspaces/valuation/config.yaml"
DATA_SOURCE_FOLDER = "/workspaces/valuation/data"
DATA_SOURCE_FILENAME = "stock_prices.duckdb"
DATA_SOURCE_TABLE = "gold_stock_price_labeled"

# Folder that receives the train/test CSV files written at the end of the notebook.
DATA_DESTINATION_FOLDER = "/workspaces/valuation/data/staging/stock_price_labeled"

# Parse the YAML configuration into a plain Python object.
with open(CONFIG_FILE, mode="r") as config_file:
    config = yaml.safe_load(config_file)

# Run settings taken from the configuration.
SEED, TICKER = config["seed"], config["ticker"]
# Used below as the number of trailing rows (after sorting by date) held out as test data.
TEST_SIZE = config["test_size_in_days"]


## Read Data Source

In [3]:

db_path = os.path.join(DATA_SOURCE_FOLDER, DATA_SOURCE_FILENAME)

# Open the existing DuckDB database read-only: this cell only queries data, so
# it needs no write lock, and a wrong path raises an error instead of silently
# creating a new, empty database file.
conn = duckdb.connect(database=db_path, read_only=True)
try:
    # Read the rows for the configured ticker. The ticker value comes from the
    # config file, so it is bound as a query parameter rather than spliced into
    # the SQL text. The table name is a notebook constant; identifiers cannot
    # be bound as parameters, so it stays interpolated.
    df = conn.execute(
        f"SELECT * FROM {DATA_SOURCE_TABLE} WHERE Ticker = ?",
        [TICKER],
    ).fetchdf()
finally:
    # Always release the connection, even if the query fails.
    conn.close()

## Remove Nulls

In [4]:
# Keep only the rows that have no missing value in any column.
df = df.dropna()

## Split the data into Train and Test

In [5]:
# Sort chronologically by the date column 'ds' so the tail holds the most recent rows.
df_sorted = df.sort_values(by='ds')

# A negative size would silently turn the tail split into a head split.
if TEST_SIZE < 0:
    raise ValueError(f"test_size_in_days must be non-negative, got {TEST_SIZE}")

# Split at an explicit position instead of slicing with -TEST_SIZE:
# iloc[:-0] is empty and iloc[-0:] is the whole frame, so TEST_SIZE == 0 would
# otherwise swap the roles of train and test. Clamping at 0 keeps the previous
# behavior when TEST_SIZE exceeds the number of rows (everything goes to test).
split_position = max(len(df_sorted) - TEST_SIZE, 0)

# Everything before the split position is train; the last TEST_SIZE rows are test.
train_df = df_sorted.iloc[:split_position]
test_df = df_sorted.iloc[split_position:]



## Save Pandas dataframe to disk

In [6]:
# Clear the destination folder so files from an earlier run cannot linger
if os.path.exists(DATA_DESTINATION_FOLDER):
    shutil.rmtree(DATA_DESTINATION_FOLDER)

os.makedirs(DATA_DESTINATION_FOLDER)

# Build each output path once so the files written and the paths reported
# below cannot drift apart.
train_path = os.path.join(DATA_DESTINATION_FOLDER, 'train.csv')
test_path = os.path.join(DATA_DESTINATION_FOLDER, 'test.csv')

# Saving the train and test dataframes to CSV files (without the index column)
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

# Report where each split was written and its (rows, columns) shape.
# The train message previously carried a stray "'" after the file name.
print(f"Train data saved to {train_path} shaped as {train_df.shape}")
print(f"Test data saved to {test_path} shaped as {test_df.shape}")


Train data saved to /workspaces/valuation/data/staging/stock_price_labeled/train.csv' shaped as (1918, 9)
Test data saved to /workspaces/valuation/data/staging/stock_price_labeled/test.csv shaped as (180, 9)
