In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Number of consecutive rows that make up one input window.
LOOKBACK_WINDOW: int = 60
# Feature column whose scaled value is used as the prediction target.
TARGET_COLUMN: str = 'adj close'

def setup_data(csv_path, start_date='1993-01-29', end_date='2023-12-28'):
    """Load a price CSV and build scaled sliding-window train/val/test splits.

    The CSV is read with pandas. When a 'date' column is present, rows are
    sorted chronologically and restricted to ``[start_date, end_date]``
    (inclusive). The feature columns are min-max scaled and cut into windows
    of ``LOOKBACK_WINDOW`` consecutive rows; each window is paired with the
    scaled ``TARGET_COLUMN`` value of the row immediately following it.
    Samples are then split in order: the first 70% for training, the next
    15% for validation and the remainder for testing.

    The scaler is fitted only on the rows covered by the training samples,
    so min/max statistics from the validation and test periods do not leak
    into training. As a consequence, scaled validation/test values may fall
    outside the [0, 1] range.

    Args:
        csv_path: Path of the CSV file to read.
        start_date: Lower bound (inclusive) applied to the 'date' column.
        end_date: Upper bound (inclusive) applied to the 'date' column.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, scaler)``
        where each ``X_*`` has shape (samples, LOOKBACK_WINDOW, n_features),
        each ``y_*`` has shape (samples,), and ``scaler`` is the fitted
        ``MinMaxScaler``.

    Raises:
        ValueError: If a required feature column is missing, or if there are
            too few rows to build at least one training sample.
    """
    df = pd.read_csv(csv_path)

    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])
        df = df.sort_values('date', ascending=True)
        mask = (df['date'] >= start_date) & (df['date'] <= end_date)
        df = df.loc[mask]

    feature_cols = ['volume', 'open', 'high', 'low', 'close', 'adj close']

    missing_cols = [col for col in feature_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Mancano le seguenti colonne nel CSV: {missing_cols}")

    data = df[feature_cols].values

    # One sample exists for every row that has LOOKBACK_WINDOW rows before it.
    total_samples = len(data) - LOOKBACK_WINDOW
    train_split = int(total_samples * 0.70)
    val_split = int(total_samples * 0.85)
    if train_split < 1:
        raise ValueError(
            f"Dati insufficienti: {len(data)} righe non bastano per creare "
            f"un set di training con finestra {LOOKBACK_WINDOW}"
        )

    # The last training sample targets row (LOOKBACK_WINDOW + train_split - 1),
    # so training only ever touches the first train_split + LOOKBACK_WINDOW
    # rows. Fit the scaler on exactly those rows to avoid look-ahead leakage.
    train_rows = train_split + LOOKBACK_WINDOW
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(data[:train_rows])
    scaled_data = scaler.transform(data)

    target_idx = feature_cols.index(TARGET_COLUMN)

    # Window i covers rows [i - LOOKBACK_WINDOW, i) and predicts row i.
    X = np.array([
        scaled_data[i - LOOKBACK_WINDOW:i]
        for i in range(LOOKBACK_WINDOW, len(scaled_data))
    ])
    y = scaled_data[LOOKBACK_WINDOW:, target_idx].copy()

    print(f"Shape totale dei dati sequenziati: Input {X.shape}, Target {y.shape}")

    X_train, y_train = X[:train_split], y[:train_split]
    X_val, y_val = X[train_split:val_split], y[train_split:val_split]
    X_test, y_test = X[val_split:], y[val_split:]

    print("Dataset suddiviso:")
    print(f" - Train: {X_train.shape[0]} campioni")
    print(f" - Validation: {X_val.shape[0]} campioni")
    print(f" - Test: {X_test.shape[0]} campioni")

    return X_train, y_train, X_val, y_val, X_test, y_test, scaler


# Path of the input CSV passed to setup_data.
CSV_PATH = 'C:/Users/angel/OneDrive/Desktop/ProgettoMeDL/SPY.csv'

# Unpack the three (inputs, targets) splits and the fitted scaler.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    scaler,
) = setup_data(CSV_PATH)

Shape totale dei dati sequenziati: Input (7725, 60, 6), Target (7725,)
Dataset suddiviso:
 - Train: 5407 campioni
 - Validation: 1159 campioni
 - Test: 1159 campioni


In [3]:
import os
import joblib 

# Directory that receives the serialized splits and the fitted scaler.
SAVE_DIR = 'C:/Users/angel/OneDrive/Desktop/ProgettoMeDL/Financial_Forecasting_XAI/data_split'
# exist_ok=True creates the directory when it is missing and does nothing
# when it already exists, so no separate existence check is needed.
os.makedirs(SAVE_DIR, exist_ok=True)

# Each array is written to '<name>.npy' inside SAVE_DIR.
arrays_to_save = {
    'X_train': X_train,
    'y_train': y_train,
    'X_val': X_val,
    'y_val': y_val,
    'X_test': X_test,
    'y_test': y_test,
}
for split_name, split_array in arrays_to_save.items():
    np.save(os.path.join(SAVE_DIR, f'{split_name}.npy'), split_array)

# Persist the fitted scaler next to the arrays. Kept as the cell's last
# expression so the value returned by joblib.dump is shown as cell output.
joblib.dump(scaler, os.path.join(SAVE_DIR, 'scaler.pkl'))


['C:/Users/angel/OneDrive/Desktop/ProgettoMeDL/Financial_Forecasting_XAI/data_split\\scaler.pkl']