In [16]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [17]:
# Load the feature-engineered datasets

# All three files are read the same way: the 'Date' column is parsed as
# datetimes and used as the index of the resulting frame.
csv_read_options = dict(parse_dates=['Date'], index_col='Date')

nvidia_df = pd.read_csv('../data/nvidia_stock_features_selected.csv', **csv_read_options)
microsoft_df = pd.read_csv('../data/microsoft_stock_features_selected.csv', **csv_read_options)
apple_df = pd.read_csv('../data/apple_stock_features_selected.csv', **csv_read_options)

Since we aim to look at multiple modelling approaches, we will structure our splits accordingly.

**Features:**
- `Open`,`High`,`Low`,`Close`,`Volume`,`SMA_20`,`SMA_50`,`SMA_200`,`EMA_20`,`EMA_50`,`EMA_200`

**Target:**
- `Future_Return_5d`

In [18]:
# Defining feature and target columns

feature_cols = [
    # price / volume columns
    'Open', 'High', 'Low', 'Close', 'Volume',
    # SMA_* indicator columns
    'SMA_20', 'SMA_50', 'SMA_200',
    # EMA_* indicator columns
    'EMA_20', 'EMA_50', 'EMA_200',
]

# Kept as a one-element list: selecting with a list returns a DataFrame
# (with a named column) rather than a Series.
target_col = ['Future_Return_5d']

In [19]:
# Extract features (X) and target (y) for each stock

X_nvidia = nvidia_df[feature_cols]
y_nvidia = nvidia_df[target_col]

X_microsoft = microsoft_df[feature_cols]
y_microsoft = microsoft_df[target_col]

X_apple = apple_df[feature_cols]
y_apple = apple_df[target_col]

Since time series data requires chronological order, we do not shuffle the data. We simply split by row position: the first 80% of rows form the training set and the remaining 20% form the test set.

In [20]:
# Function for time-based train-test split

def time_series_split(X, y, train_size=0.8, gap=0):
    """Split X and y by row position, without shuffling.

    The first ``int(len(X) * train_size)`` rows go to the training set and
    the remaining rows go to the test set. Rows are not sorted here, so the
    inputs are expected to already be in chronological order.

    Parameters
    ----------
    X, y : pandas.DataFrame or pandas.Series
        Features and target. They must have the same number of rows.
    train_size : float, default 0.8
        Fraction of rows, between 0 and 1 inclusive, used for training.
    gap : int, default 0
        Number of rows to drop from the end of the training set, leaving a
        gap before the test set. Useful when the target looks ahead (for
        example an N-step-ahead return), because the last training labels
        would otherwise be computed from rows that fall in the test period.
        The default of 0 drops nothing.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, ``train_size`` is outside
        [0, 1], or ``gap`` is negative.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
        )
    # Also rejects NaN, since every comparison with NaN is False.
    if not 0 <= train_size <= 1:
        raise ValueError(f"train_size must be between 0 and 1, got {train_size!r}")
    if gap < 0:
        raise ValueError(f"gap must be non-negative, got {gap!r}")

    split_index = int(len(X) * train_size)
    # Clamp at 0 so a large gap gives an empty training set instead of a
    # negative index that would slice from the end.
    train_end = max(split_index - gap, 0)

    X_train, X_test = X.iloc[:train_end], X.iloc[split_index:]
    y_train, y_test = y.iloc[:train_end], y.iloc[split_index:]
    return X_train, X_test, y_train, y_test

# Apply split to each stock (default train_size, i.e. first 80% of rows for training)
(X_train_nvidia, X_test_nvidia,
 y_train_nvidia, y_test_nvidia) = time_series_split(X_nvidia, y_nvidia)

(X_train_microsoft, X_test_microsoft,
 y_train_microsoft, y_test_microsoft) = time_series_split(X_microsoft, y_microsoft)

(X_train_apple, X_test_apple,
 y_train_apple, y_test_apple) = time_series_split(X_apple, y_apple)

In [21]:
# Save the train-test data for every stock under ../data/tt_splits/

splits_dir = Path("../data/tt_splits")
# to_csv does not create missing directories, so make sure the target
# folder exists before writing (needed on a fresh checkout).
splits_dir.mkdir(parents=True, exist_ok=True)

# stock name -> {split name -> frame}; files are written as
# "<split name>_<stock name>.csv", in the order listed here.
splits_by_stock = {
    "nvidia": {
        "X_train": X_train_nvidia,
        "X_test": X_test_nvidia,
        "y_train": y_train_nvidia,
        "y_test": y_test_nvidia,
    },
    "microsoft": {
        "X_train": X_train_microsoft,
        "X_test": X_test_microsoft,
        "y_train": y_train_microsoft,
        "y_test": y_test_microsoft,
    },
    "apple": {
        "X_train": X_train_apple,
        "X_test": X_test_apple,
        "y_train": y_train_apple,
        "y_test": y_test_apple,
    },
}

for stock_name, stock_splits in splits_by_stock.items():
    for split_name, split_frame in stock_splits.items():
        # index=True keeps the frame's index as the first CSV column
        split_frame.to_csv(splits_dir / f"{split_name}_{stock_name}.csv", index=True)

print("Train-test datasets saved.")

Train-test datasets saved.
