# Train test Split

In [45]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler

## Data

In [38]:
PHARMA_ETF_TICKER = "PPH"
START_DATE = "2018-01-01"
END_DATE = "2022-12-31"

df = yf.download(PHARMA_ETF_TICKER, start=START_DATE, end=END_DATE, auto_adjust=False)

[*********************100%***********************]  1 of 1 completed


## Train Test Split for Machine learning

In [39]:
# Target y the close stock price of next day
df['y'] = df['Close'].shift(-1)

In [40]:
# The different features

# Moving averages
df['ma_5'] = df['Close'].rolling(5).mean()
df['ma_10'] = df['Close'].rolling(10).mean()

# Rolling volatility
df['vol_5'] = df['Close'].pct_change().rolling(5).std()

# Momentum
df['momentum_5'] = df['Close'] - df['Close'].shift(5)

# OHLC derived features
df['hl_range'] = df['High'] - df['Low']           # daily high-low range
df['oc_change'] = df['Close'] - df['Open']       # daily open-close change
df['volume_change'] = df['Volume'].pct_change()  # daily volume change

# Drop rows with NaN from rolling calculations
df = df.dropna()

In [None]:
X = df[['ma_5', 'ma_10', 'vol_5', 'momentum_5', 'hl_range', 'oc_change', 'volume_change']]
y = df['y']

In [None]:
tscv = TimeSeriesSplit(n_splits=5)
for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

## Train Test Split RNN

In [49]:
# Scale Features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Create Sequences

seq_len = 30  # number of past days per sequence
horizon = 7  # predict 7 days ahead

y_seq = []
X_seq = []

for i in range(len(X) - seq_len - horizon + 1):
    X_seq.append(X_scaled[i:i+seq_len])  # past 30 days of features
    y_seq.append(y.iloc[i+seq_len : i+seq_len+horizon].values)  # next 7 days

X_seq = np.array(X_seq)  # shape: (samples, seq_len, n_features)
y_seq = np.array(y_seq)

# Train_test_split
train_size = int(0.8 * len(X_seq))
X_train_rnn, X_test_rnn = X_seq[:train_size], X_seq[train_size:]
y_train_rnn, y_test_rnn = y_seq[:train_size], y_seq[train_size:]

print("X_train shape:", X_train_rnn.shape)
print("X_test shape:", X_test_rnn.shape)
print("y_train shape:", y_train_rnn.shape)
print("y_test shape:", y_test_rnn.shape)

X_train shape: (970, 30, 7)
X_test shape: (243, 30, 7)
y_train shape: (970, 7)
y_test shape: (243, 7)
