In [2]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.preprocessing import MinMaxScaler

# Input CSV with per-article news sentiment; main() requires the columns
# 'Date', 'sentiment_positive' and 'sentiment_negative'.
NEWS_CSV_PATH = 'C:/Users/angel/OneDrive/Desktop/ProgettoMeDL/sentiments/spy_news_sentiment_titolo.csv'
# Directory where main() writes the .npy arrays, scaler.pkl and metadata.pkl.
OUTPUT_DIR = 'C:/Users/angel/OneDrive/Desktop/ProgettoMeDL/Financial_Forecasting_XAI/data_split_titolo'
# SPY price CSV; main() lower-cases its column names and requires a 'date' column.
SPY_PATH = 'C:/Users/angel/OneDrive/Desktop/ProgettoMeDL/SPY.csv'

# Number of consecutive rows (timesteps) in each input sequence.
LOOKBACK_WINDOW = 60
# Lower-cased name of the column used as the prediction target.
TARGET_COLUMN = 'adj close'

def create_sequences(data, lookback, target_idx):
    """Build sliding-window (X, y) pairs from already-scaled data.

    For every row index ``i`` in ``range(lookback, len(data))`` the input
    window is ``data[i - lookback:i]`` and the target is
    ``data[i, target_idx]``.

    Args:
        data: 2-D array-like of shape (n_rows, n_features).
        lookback: Number of consecutive rows in each input window (>= 0).
        target_idx: Column index of the value to predict.

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape
        (n_windows, lookback, n_features) and ``y`` has shape (n_windows,),
        with ``n_windows = max(len(data) - lookback, 0)``. When there are not
        enough rows for a single window, ``X`` is still 3-D (with zero
        windows) so callers can safely read ``X.shape[1]`` / ``X.shape[2]``.

    Raises:
        ValueError: If ``data`` is not 2-D or ``lookback`` is negative.
        IndexError: If ``target_idx`` is out of range for the feature axis.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError(
            f"data must be 2-D (n_rows, n_features), got {data.ndim} dimension(s)"
        )
    if lookback < 0:
        raise ValueError(f"lookback must be non-negative, got {lookback}")

    n_windows = max(len(data) - lookback, 0)

    # Row w of window_rows holds the row indices w, w+1, ..., w+lookback-1,
    # i.e. the `lookback` rows immediately preceding target row w+lookback.
    window_rows = np.arange(n_windows)[:, None] + np.arange(lookback)[None, :]
    X = data[window_rows]

    # Copy so the returned targets do not alias the caller's array.
    y = data[lookback:, target_idx].copy()
    return X, y

def _date_range(frame):
    """Return the (first, last) 'date' of *frame* as ISO-formatted strings."""
    return (str(frame['date'].min().date()), str(frame['date'].max().date()))


def main():
    """Build the train/val/test sequence datasets and save them to OUTPUT_DIR.

    Pipeline:
        1. Load the news CSV and compute a daily mean sentiment score
           (positive minus negative).
        2. Load SPY prices and keep the rows between the first news date
           (minus a lookback buffer) and a fixed end date.
        3. Left-join the daily sentiment onto the price rows; days without
           news get a score of 0.
        4. Split chronologically 70% / 15% / 15% (before scaling).
        5. Fit MinMax scalers on the train split only and transform all splits.
        6. Build sliding-window sequences for each split.
        7. Save the arrays, the scalers and a metadata dictionary.

    Raises:
        ValueError: If a required column is missing from either CSV, if a
            split has too few rows to produce at least one sequence, or if
            the generated sequences do not have the expected shape.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # NOTE(review): the banner says LEXRANK while NEWS_CSV_PATH / OUTPUT_DIR
    # point at the "titolo" variant -- confirm the label is intended.
    print("--- GENERAZIONE DATASET (LEXRANK) ---")

    # 1. LOAD AND CLEAN NEWS
    print("1. Elaborazione News...")
    df_news = pd.read_csv(NEWS_CSV_PATH)

    # Check the required columns
    required_cols = ['Date', 'sentiment_positive', 'sentiment_negative']
    if not all(col in df_news.columns for col in required_cols):
        raise ValueError(f"CSV news deve contenere: {required_cols}")

    # Parse as UTC, keep only the calendar day, then convert back to a
    # datetime column so it can be merged with the SPY dates.
    df_news['Date'] = pd.to_datetime(df_news['Date'], utc=True).dt.date
    df_news['Date'] = pd.to_datetime(df_news['Date'])

    # Sentiment score (as in FinBERT): positive minus negative
    df_news['sentiment_score'] = df_news['sentiment_positive'] - df_news['sentiment_negative']

    news_start_date = df_news['Date'].min()
    print(f"   Inizio News rilevato: {news_start_date.date()}")

    # Daily aggregation: mean score of all news items of the same day
    daily_sentiment = df_news.groupby('Date')['sentiment_score'].mean().reset_index()

    # 2. LOAD AND FILTER SPY
    print("2. Elaborazione SPY...")
    df = pd.read_csv(SPY_PATH)
    df.columns = df.columns.str.lower()

    if 'date' not in df.columns:
        raise ValueError("Colonna 'date' non trovata in SPY.csv")

    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date', ascending=True)

    # Start earlier than the first news item by a buffer derived from
    # LOOKBACK_WINDOW (calendar days).
    start_date_limit = news_start_date - pd.Timedelta(days=LOOKBACK_WINDOW + 30)
    end_date_limit = '2023-12-28'

    print(f"   Filtraggio SPY: {start_date_limit.date()} -> {end_date_limit}")

    mask = (df['date'] >= start_date_limit) & (df['date'] <= end_date_limit)
    df = df.loc[mask].copy()

    # Date continuity check: report gaps longer than one week
    date_diffs = df['date'].diff().dt.days
    max_gap = date_diffs.max()
    if max_gap > 7:
        print(f" Gap massimo tra date: {max_gap} giorni")

    # 3. MERGE
    print("3. Unione Dataset...")
    df['date_clean'] = pd.to_datetime(df['date'].dt.date)

    df_merged = pd.merge(
        df,
        daily_sentiment,
        left_on='date_clean',
        right_on='Date',
        how='left'
    )

    # Days without news are treated as neutral sentiment (0)
    df_merged['sentiment_score'] = df_merged['sentiment_score'].fillna(0)

    # Drop the redundant merge-key columns
    df_merged = df_merged.drop(columns=['date_clean', 'Date'], errors='ignore')

    # Statistics
    non_zeros = (df_merged['sentiment_score'] != 0).sum()
    print(f"   Giorni con sentiment attivo: {non_zeros} su {len(df_merged)}")
    print(f"   Sentiment range: [{df_merged['sentiment_score'].min():.3f}, {df_merged['sentiment_score'].max():.3f}]")

    # 4. CHRONOLOGICAL SPLIT (BEFORE SCALING!)
    print("4. Split Temporale...")
    financial_cols = ['volume', 'open', 'high', 'low', 'close', 'adj close']
    sentiment_col = ['sentiment_score']
    feature_cols = financial_cols + sentiment_col

    # Fail early with a clear message instead of a KeyError during scaling.
    missing_cols = [col for col in financial_cols if col not in df_merged.columns]
    if missing_cols:
        raise ValueError(f"SPY.csv is missing required columns: {missing_cols}")

    total_samples = len(df_merged)
    train_split_idx = int(total_samples * 0.70)
    val_split_idx = int(total_samples * 0.85)

    # Insertion order (train, val, test) is relied upon by the loops below.
    splits = {
        'train': df_merged.iloc[:train_split_idx].copy(),
        'val': df_merged.iloc[train_split_idx:val_split_idx].copy(),
        'test': df_merged.iloc[val_split_idx:].copy(),
    }
    df_train, df_val, df_test = splits['train'], splits['val'], splits['test']

    print(f"   Train: {len(df_train)} | Val: {len(df_val)} | Test: {len(df_test)}")

    # Each split is windowed independently, so it needs more than
    # LOOKBACK_WINDOW rows to yield at least one (X, y) pair.
    too_short = {
        name: len(part) for name, part in splits.items()
        if len(part) <= LOOKBACK_WINDOW
    }
    if too_short:
        raise ValueError(
            f"Splits with too few rows for lookback {LOOKBACK_WINDOW} "
            f"(need more than {LOOKBACK_WINDOW} rows each): {too_short}"
        )

    date_ranges = {name: _date_range(part) for name, part in splits.items()}
    print(f"   Date range Train: {date_ranges['train'][0]} -> {date_ranges['train'][1]}")
    print(f"   Date range Val:   {date_ranges['val'][0]} -> {date_ranges['val'][1]}")
    print(f"   Date range Test:  {date_ranges['test'][0]} -> {date_ranges['test'][1]}")

    # 5. SCALING (FIT ON TRAIN ONLY!)
    print("5. Scaling...")

    # Separate scalers for the price/volume columns and for the sentiment
    # column; both are fitted on the train split only.
    scaler_financial = MinMaxScaler(feature_range=(0, 1))
    scaler_financial.fit(df_train[financial_cols].values)

    scaler_sentiment = MinMaxScaler(feature_range=(0, 1))
    scaler_sentiment.fit(df_train[sentiment_col].values)

    # Column order of the scaled matrices matches feature_cols
    # (financial columns first, sentiment last).
    scaled = {}
    for name, part in splits.items():
        scaled_financial = scaler_financial.transform(part[financial_cols].values)
        scaled_sentiment = scaler_sentiment.transform(part[sentiment_col].values)
        scaled[name] = np.concatenate([scaled_financial, scaled_sentiment], axis=1)

    # Save the scalers
    joblib.dump({
        'financial': scaler_financial,
        'sentiment': scaler_sentiment
    }, os.path.join(OUTPUT_DIR, 'scaler.pkl'))

    # 6. SEQUENCE CREATION
    print("6. Creazione Sequenze...")
    target_idx = feature_cols.index(TARGET_COLUMN)

    sequences = {
        name: create_sequences(matrix, LOOKBACK_WINDOW, target_idx)
        for name, matrix in scaled.items()
    }
    X_train, y_train = sequences['train']
    X_val, y_val = sequences['val']
    X_test, y_test = sequences['test']

    print(f"   Train: {X_train.shape[0]} | Val: {X_val.shape[0]} | Test: {X_test.shape[0]}")
    print(f"   Input Shape: {X_train.shape}")

    # Shape verification. Explicit raises instead of `assert`, which is
    # stripped when Python runs with -O.
    expected_window_shape = (LOOKBACK_WINDOW, len(feature_cols))
    for name, (X_part, _) in sequences.items():
        if X_part.ndim != 3 or X_part.shape[1:] != expected_window_shape:
            raise ValueError(
                f"Expected {name} sequences of shape (n, {LOOKBACK_WINDOW}, "
                f"{len(feature_cols)}), got {X_part.shape}"
            )

    # 7. SAVE
    print("7. Salvataggio...")
    for name, (X_part, y_part) in sequences.items():
        np.save(os.path.join(OUTPUT_DIR, f'X_{name}.npy'), X_part)
        np.save(os.path.join(OUTPUT_DIR, f'y_{name}.npy'), y_part)

    # Also save information about the dataset
    metadata = {
        'lookback_window': LOOKBACK_WINDOW,
        'target_column': TARGET_COLUMN,
        'feature_columns': feature_cols,
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'test_samples': len(X_test),
        'train_date_range': date_ranges['train'],
        'val_date_range': date_ranges['val'],
        'test_date_range': date_ranges['test']
    }

    joblib.dump(metadata, os.path.join(OUTPUT_DIR, 'metadata.pkl'))

    print(f"   - Features: {len(feature_cols)} ({', '.join(feature_cols)})")
    print(f"   - Sequenze: {LOOKBACK_WINDOW} timesteps")
    print(f"   - Target: {TARGET_COLUMN}")
    print(f"   - Train shape: {X_train.shape} -> y: {y_train.shape}")

# Run the dataset-generation pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()

--- GENERAZIONE DATASET (LEXRANK) ---
1. Elaborazione News...
   Inizio News rilevato: 2017-08-18
2. Elaborazione SPY...
   Filtraggio SPY: 2017-05-20 -> 2023-12-28
3. Unione Dataset...
   Giorni con sentiment attivo: 1098 su 1663
   Sentiment range: [-0.965, 0.933]
4. Split Temporale...
   Train: 1164 | Val: 249 | Test: 250
   Date range Train: 2017-05-22 -> 2022-01-03
   Date range Val:   2022-01-04 -> 2022-12-29
   Date range Test:  2022-12-30 -> 2023-12-28
5. Scaling...
6. Creazione Sequenze...
   Train: 1104 | Val: 189 | Test: 190
   Input Shape: (1104, 60, 7)
7. Salvataggio...
   - Features: 7 (volume, open, high, low, close, adj close, sentiment_score)
   - Sequenze: 60 timesteps
   - Target: adj close
   - Train shape: (1104, 60, 7) -> y: (1104,)
