In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import sys
from typing import Dict, Any, Union

def verify_file(file_path: str) -> bool:
    """Report whether ``file_path`` exists and is non-empty.

    A one-line status message is printed for every outcome.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the path exists and its size is greater than zero bytes,
        False when it is missing or empty.
    """
    if os.path.exists(file_path):
        n_bytes = os.path.getsize(file_path)
        if n_bytes != 0:
            print(f"Found {file_path} ({n_bytes} bytes)")
            return True
        # Present but zero-length: treated the same as unusable.
        print(f"Warning: {file_path} is empty (0 bytes)")
        return False

    print(f"File not found: {file_path}")
    return False

def load_pickle_file(file_path: str) -> Any:
    """Load a pickle file, trying several strategies before giving up.

    The strategies are attempted in this order:

    1. ``pickle.load`` with default settings.
    2. ``pd.read_pickle``.
    3. ``pickle.load`` with ``encoding='latin1'``, which lets 8-bit strings
       written by Python 2 be decoded.

    Steps 2 and 3 are only attempted when step 1 fails with
    ``pickle.UnpicklingError`` or ``UnicodeDecodeError`` (the latter is what
    a Python 2 pickle holding non-ASCII byte strings raises under the default
    ASCII decoding).

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object, or None when every strategy fails (a message
        describing the failure is printed).
    """
    # NOTE(security): unpickling can execute arbitrary code. Only use this on
    # files from a trusted source.
    try:
        with open(file_path, 'rb') as f:
            return pickle.load(f)
    except (pickle.UnpicklingError, UnicodeDecodeError):
        # Not fatal yet: continue with the fallback loaders below.
        pass
    except EOFError:
        print("EOFError: Ran out of input - file may be corrupted or incomplete")
        return None
    except Exception as e:
        print(f"Unexpected error loading pickle: {str(e)}")
        return None

    try:
        return pd.read_pickle(file_path)
    except Exception as e:
        print(f"Pandas pickle load failed: {str(e)}")

    try:
        # latin1 maps every byte to a code point, so Python 2 byte strings
        # always decode.
        with open(file_path, 'rb') as f:
            return pickle.load(f, encoding='latin1')
    except Exception as e:
        print(f"All pickle loading methods failed: {str(e)}")
        return None

def load_npy_file(file_path: str) -> np.ndarray:
    """Read a ``.npy`` file and return its contents as an array.

    Zero-dimensional arrays are wrapped into a one-element 1-D array so that
    callers can always slice the result.

    Args:
        file_path: Path of the ``.npy`` file to read.

    Returns:
        The loaded array, or None when loading fails (the error is printed).
    """
    try:
        # allow_pickle=True is needed for arrays that hold Python objects.
        loaded = np.load(file_path, allow_pickle=True)
        is_scalar = loaded.ndim == 0
        result = np.array([loaded.item()]) if is_scalar else loaded
    except Exception as err:
        print(f"Error loading .npy file: {str(err)}")
        result = None
    return result

def read_exchange_data(exchange: str, data_summary: Dict[str, Dict[str, Any]]) -> None:
    """Load the pickle files of one exchange folder into ``data_summary``.

    Looks under ``dataset/<exchange>`` for the four expected ``*.pkl`` files.
    For each one, the loaded object (or None when the file is missing, empty
    or unreadable) is stored at ``data_summary[exchange][<file name>]``.
    Nothing is recorded when the exchange folder itself does not exist.

    Args:
        exchange: Name of the exchange sub-folder (e.g. NASDAQ or NYSE).
        data_summary: Mapping that is updated in place; it must already
            contain an entry for ``exchange``.
    """
    folder = os.path.join('dataset', exchange)
    if not os.path.exists(folder):
        print(f"\nERROR: {exchange} folder not found at: {folder}")
        return

    print(f"\nProcessing {exchange} files in: {folder}")

    for pkl_name in ('eod_data.pkl', 'gt_data.pkl', 'mask_data.pkl', 'price_data.pkl'):
        pkl_path = os.path.join(folder, pkl_name)
        print(f"\nAttempting to load: {pkl_path}")

        # Only attempt to unpickle files that exist and are non-empty.
        loaded = load_pickle_file(pkl_path) if verify_file(pkl_path) else None
        data_summary[exchange][pkl_name] = loaded

        if loaded is None:
            continue

        print(f"Successfully loaded {pkl_name}")
        print_sample_data(loaded)

def read_sp500_data(data_summary: Dict[str, Dict[str, Any]]) -> None:
    """Load the S&P500 ``.npy`` files into ``data_summary['SP500']``.

    Looks under ``dataset/SP500``. Files that do not exist are skipped
    without being recorded; files that exist but are empty or fail to load
    are recorded as None.

    Args:
        data_summary: Mapping that is updated in place; it must already
            contain an ``'SP500'`` entry.
    """
    sp500_path = os.path.join('dataset', 'SP500')
    if not os.path.exists(sp500_path):
        print(f"\nERROR: SP500 folder not found at: {sp500_path}")
        return

    print(f"\nProcessing SP500 files in: {sp500_path}")
    # NOTE: 'baseline_data_sp500.npy' was used here as an alternative input.
    sp500_files = ['SP500.npy']

    for file in sp500_files:
        file_path = os.path.join(sp500_path, file)
        if not os.path.exists(file_path):
            continue

        print(f"\nAttempting to load: {file_path}")
        if not verify_file(file_path):
            data_summary['SP500'][file] = None
            continue

        print(f"Loading {file} with numpy...")
        data = load_npy_file(file_path)
        data_summary['SP500'][file] = data

        # load_npy_file returns None on failure, so check before touching
        # array attributes such as shape and dtype.
        if data is None:
            continue

        print(f"Loaded {file} with shape {data.shape} and dtype {data.dtype}")
        print(f"Successfully loaded {file}")
        print_sample_data(data)

def print_sample_data(data: Any) -> None:
    """Print a short preview of a loaded object.

    NumPy arrays, pandas DataFrames, dicts and lists each get a tailored
    preview of up to five leading elements; any other object is shown as the
    first five characters of its ``str()`` form. Errors raised while building
    the preview are caught and reported instead of propagating.

    Args:
        data: The object to preview.
    """
    print(f"Data type: {type(data)}")

    try:
        if isinstance(data, np.ndarray):
            # dtype / size / ndim are array attributes, so they are only
            # read here, where the object is known to be an ndarray.
            print("dtype:", data.dtype)
            print("data size:", data.size)
            print("data.ndim", data.ndim)
            print(f"Array shape: {data.shape}")
            print("First 5 elements:")
            if data.size == 0:
                print("Empty array")
            elif data.ndim == 0:
                # 0-d arrays cannot be sliced; show the scalar as a 1-D array.
                print(np.array([data.item()]))
            else:
                print(data[:5])
        elif isinstance(data, pd.DataFrame):
            print(f"DataFrame shape: {data.shape}")
            print("First 5 rows:")
            print(data.head(5) if len(data) > 0 else "Empty DataFrame")
        elif isinstance(data, dict):
            print(f"Dictionary with {len(data)} keys")
            print("First 5 items:")
            print(dict(list(data.items())[:5]) if len(data) > 0 else "Empty dictionary")
        elif isinstance(data, list):
            print(f"List with {len(data)} items")
            print("First 5 items:")
            print(data[:5] if len(data) > 0 else "Empty list")
        else:
            print("Data preview:")
            preview = str(data)
            print(preview[:5] + "..." if len(preview) > 5 else preview)
    except Exception as e:
        print(f"Error while printing sample data: {str(e)}")



def generate_dataset_summary(data_summary: Dict[str, Dict[str, Any]]) -> None:
    """Print a per-dataset report of which files loaded and which did not.

    For every dataset, each loaded file is listed with its Python type plus
    shape/dtype (arrays), shape/columns (DataFrames) or key count (dicts).
    Entries whose value is None are listed as failed.

    Args:
        data_summary: Mapping of dataset name to a mapping of file name to
            loaded object (None meaning the file could not be loaded).
    """
    print("\n\n=== DATASET SUMMARY ===")

    for dataset_name, entries in data_summary.items():
        print(f"\n--- {dataset_name} Dataset ---")

        # Split the file names into loaded / failed in one pass.
        present, absent = [], []
        for fname, payload in entries.items():
            bucket = absent if payload is None else present
            bucket.append(fname)

        print(f"\nSuccessfully loaded files ({len(present)}):")
        for fname in present:
            payload = entries[fname]
            print(f"  {fname}: {type(payload)}")
            if isinstance(payload, (np.ndarray, pd.DataFrame)):
                print(f"    Shape: {payload.shape}")
                if isinstance(payload, np.ndarray):
                    print(f"    Data type: {payload.dtype}")
                else:
                    print(f"    Columns: {list(payload.columns)}")
            elif isinstance(payload, dict):
                print(f"    Keys: {len(payload)}")

        if absent:
            print(f"\nFailed to load files ({len(absent)}): {', '.join(absent)}")

def main():
    """Run the loader: print diagnostics, read the datasets, print a summary.

    Stops early with an error message when there is no ``dataset`` folder in
    the current working directory.
    """
    print("=== Stock Market Dataset Loader ===")
    print("Loading NASDAQ, NYSE, and S&P500 datasets\n")

    # Environment diagnostics, useful when pickles fail to load.
    python_version = sys.version.split()[0]
    working_dir = os.getcwd()
    print("=== SYSTEM INFO ===")
    print(f"Python version: {python_version}")
    print(f"NumPy version: {np.__version__}")
    print(f"Pandas version: {pd.__version__}")
    print(f"Working directory: {working_dir}")

    dataset_dir = 'dataset'
    if not os.path.exists(dataset_dir):
        print("\nERROR: 'dataset' folder not found in current directory!")
        return

    print("\nContents of dataset folder:", os.listdir(dataset_dir))

    # One (initially empty) result mapping per dataset.
    data_summary = {name: {} for name in ('NASDAQ', 'NYSE', 'SP500')}

    read_exchange_data('NASDAQ', data_summary)
    # NYSE is skipped because its files are 0 bytes; to include it, call
    # read_exchange_data('NYSE', data_summary) here.
    print("\nReading SP500 data...")
    read_sp500_data(data_summary)

    generate_dataset_summary(data_summary)

    print("\nProcessing complete!")

# Entry point: run the loader only when this file is executed directly.
if __name__ == "__main__":
    main()

=== Stock Market Dataset Loader ===
Loading NASDAQ, NYSE, and S&P500 datasets

=== SYSTEM INFO ===
Python version: 3.11.7
NumPy version: 1.26.4
Pandas version: 2.2.3
Working directory: e:\Fintech\practicum\StockMixer

Contents of dataset folder: ['NASDAQ', 'NYSE', 'read.py', 'SP500']

Processing NASDAQ files in: dataset\NASDAQ

Attempting to load: dataset\NASDAQ\eod_data.pkl
Found dataset\NASDAQ\eod_data.pkl (25547565 bytes)
Successfully loaded eod_data.pkl
Data type: <class 'numpy.ndarray'>
dtype: float32
data size: 6386850
data.ndim 3
Array shape: (1026, 1245, 5)
First 5 elements:
[[[0.270533 0.269522 0.267237 0.263333 0.275333]
  [0.271109 0.269741 0.26782  0.263982 0.271219]
  [0.271822 0.270097 0.268485 0.264723 0.272316]
  ...
  [0.948882 0.967983 0.969176 0.962736 0.951735]
  [0.949321 0.96472  0.968531 0.964939 0.962841]
  [0.952475 0.962416 0.969258 0.967535 0.976964]]

 [[0.23873  0.237522 0.239888 0.240502 0.248031]
  [0.242457 0.238787 0.240177 0.241316 0.252493]
  [0.24692