# Train Test Split

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler

## Data

In [2]:
# Load the feature table from Parquet; the path is relative to the notebook's working directory.
df = pd.read_parquet("data/final_features.parquet")

In [3]:
# Sanity check: (rows, columns) of the loaded table.
df.shape

(2891, 268)

## Train Test Split for Machine Learning

In [5]:
# NOTE(review): `X` (and `y`, used in later cells) are not defined in any visible cell —
# the execution count jumps from In [3] to In [5], so the cell that derived them from `df`
# appears to be missing. Restore it so the notebook survives Restart Kernel -> Run All.
X.shape

(2891, 267)

In [6]:
# Expanding-window splitter. Only the final fold is kept: it has the largest
# training set, and its test set is the most recent block of rows.
tscv = TimeSeriesSplit(n_splits=5)
train_idx, test_idx = list(tscv.split(X))[-1]

# Positional indexing keeps the chronological row order intact.
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

In [7]:
# Shape of the training portion left in `X_train` after the split loop (i.e. the final fold).
X_train.shape

(2410, 267)

## Train Test Split for RNN

In [8]:
# --- Sequence configuration ---
seq_len = 30  # number of past days in each input window
horizon = 30  # number of future days in each target vector

# Number of (input window, target vector) pairs that fit entirely inside the data.
n_sequences = len(X) - seq_len - horizon + 1

# Chronological 80/20 split over sequences (no shuffling, so order is preserved).
train_size = int(0.8 * n_sequences)
if train_size < 1:
    raise ValueError(
        f"Not enough rows ({len(X)}) to build training sequences "
        f"with seq_len={seq_len} and horizon={horizon}."
    )

# --- Scale features ---
# Fit the scaler only on rows that appear in training input windows
# (rows 0 .. train_size + seq_len - 2). Fitting on all of X would leak the
# test period's min/max into the training features.
# Consequence: scaled test values may fall outside [0, 1].
n_train_rows = train_size + seq_len - 1
scaler = MinMaxScaler()
scaler.fit(X.iloc[:n_train_rows])
X_scaled = scaler.transform(X)

# --- Create sequences ---
# Sample i uses rows [i, i + seq_len) as input and the following `horizon`
# rows of y as its target.
X_seq = np.array(
    [X_scaled[i:i + seq_len] for i in range(n_sequences)]
)  # shape: (samples, seq_len, n_features)
y_seq = np.array(
    [y.iloc[i + seq_len:i + seq_len + horizon].values for i in range(n_sequences)]
)  # shape: (samples, horizon) — assumes y is a 1-D Series; TODO confirm

# --- Train/test split ---
# NOTE(review): consecutive samples overlap, so the first test windows share
# rows with the last training windows/targets. If strict separation is
# required, drop seq_len + horizon - 1 samples between the two sets.
X_train_rnn, X_test_rnn = X_seq[:train_size], X_seq[train_size:]
y_train_rnn, y_test_rnn = y_seq[:train_size], y_seq[train_size:]

print("X_train shape:", X_train_rnn.shape)
print("X_test shape:", X_test_rnn.shape)
print("y_train shape:", y_train_rnn.shape)
print("y_test shape:", y_test_rnn.shape)

X_train shape: (2265, 30, 267)
X_test shape: (567, 30, 267)
y_train shape: (2265, 30)
y_test shape: (567, 30)


In [None]:
# Persist the splits as .npy files in the current working directory.
# NOTE(review): these are the X_train/X_test/y_train/y_test variables assigned
# by the TimeSeriesSplit loop (final fold), not the *_rnn sequence arrays —
# confirm this is the intended export for this section.
for filename, split in (
    ("X_train.npy", X_train),
    ("X_test.npy", X_test),
    ("y_train.npy", y_train),
    ("y_test.npy", y_test),
):
    np.save(filename, split)