In [1]:
pip install astropy


Collecting astropy
  Downloading astropy-7.1.0-cp312-cp312-win_amd64.whl.metadata (10 kB)
Collecting pyerfa>=2.0.1.1 (from astropy)
  Downloading pyerfa-2.0.1.5-cp39-abi3-win_amd64.whl.metadata (5.9 kB)
Collecting astropy-iers-data>=0.2025.4.28.0.37.27 (from astropy)
  Downloading astropy_iers_data-0.2025.8.18.0.40.14-py3-none-any.whl.metadata (3.4 kB)
Downloading astropy-7.1.0-cp312-cp312-win_amd64.whl (6.3 MB)
   ---------------------------------------- 0.0/6.3 MB ? eta -:--:--
   - -------------------------------------- 0.3/6.3 MB ? eta -:--:--
   --- ------------------------------------ 0.5/6.3 MB 1.9 MB/s eta 0:00:04
   ------ --------------------------------- 1.0/6.3 MB 2.2 MB/s eta 0:00:03
   ---------- ----------------------------- 1.6/6.3 MB 2.3 MB/s eta 0:00:03
   ------------- -------------------------- 2.1/6.3 MB 2.4 MB/s eta 0:00:02
   ------------------ --------------------- 2.9/6.3 MB 2.7 MB/s eta 0:00:02
   ----------------------- ---------------- 3.7/6.3 MB 2.8 MB/s et

In [None]:
import subprocess
import sys
import warnings
warnings.filterwarnings('ignore')

def install_packages():
    """Ensure the packages needed for CDF and FITS processing are importable.

    Each package is imported by name. When the import fails, the package is
    installed with ``pip`` through the interpreter running this kernel
    (``sys.executable``), so it lands in the kernel's own environment.

    Raises:
        subprocess.CalledProcessError: if ``pip install`` exits with a
            non-zero status; remaining packages are then not attempted.
    """
    # Local import: this cell runs before the notebook's main import block.
    import importlib

    packages = ['cdflib', 'astropy', 'pandas', 'numpy']

    for package in packages:
        try:
            importlib.import_module(package)
            print(f"✓ {package} already installed")
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            # The import system caches directory contents; drop those caches so
            # the freshly installed package can be imported in this session.
            importlib.invalidate_caches()
            print(f"✓ {package} installed successfully")

# Run the dependency check before the imports below, which need these packages.
install_packages()

import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
from collections import defaultdict

# cdflib and astropy are imported under a guard so that a missing package
# produces a readable hint instead of a traceback.
# NOTE: on ImportError the error is only printed; execution continues and the
# names `cdflib` / `fits` may remain undefined for the code that follows.
try:
    import cdflib
    from astropy.io import fits
    print("\n🎉 All required libraries imported successfully!")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Please restart your Jupyter kernel and try again.")

def merge_cdf_fits_data(cdf_directory, fits_directory, output_csv="merged_data.csv",
                       time_aggregation="mean", verbose=True):
    """
    Merge CDF and FITS files based on the calendar date found in their filenames.

    Every ``.cdf`` file in ``cdf_directory`` and every ``.fits`` / ``.fit`` file
    in ``fits_directory`` is read, known variable aliases are renamed to a
    shared vocabulary, and each variable is reduced to a single value. The two
    sources are then combined into one row per date, with columns prefixed
    ``cdf_`` and ``fits_``. A non-empty result is also written to ``output_csv``.

    Parameters:
    -----------
    cdf_directory : str
        Path to directory containing CDF files
    fits_directory : str
        Path to directory containing FITS files
    output_csv : str
        Output CSV filename (default: "merged_data.csv")
    time_aggregation : str
        How to reduce array-valued variables: 'mean', 'first' or 'last'.
        Any other value behaves like 'mean'. Non-finite entries are ignored.
    verbose : bool
        Print detailed processing information

    Returns:
    --------
    pandas.DataFrame
        Merged dataset with standardized variable names, sorted by ``date``.
        Empty when the directory arguments are invalid or nothing was read.

    Notes:
    ------
    * Only one file per date and per source is kept. Files are visited in
      sorted name order, so when several files share a date the last one
      wins; a warning is printed in verbose mode.
    * Filenames whose digits do not form a real calendar date are skipped.
    * Uses the module-level ``cdflib`` and ``astropy.io.fits`` imports.
    """

    # Standard column name -> aliases accepted in the source files. Aliases are
    # matched case-insensitively and the first alias present in a file wins.
    variables = {
        'timestamp': ['EPOCH', 'Epoch', 'Time', 'epoch_for_cdf_mod'],
        'velocity_x': ['proton_xvelocity', 'V_X', 'VX', 'VELOCITY_X'],
        'velocity_y': ['proton_yvelocity', 'V_Y', 'VY', 'VELOCITY_Y'],
        'velocity_z': ['proton_zvelocity', 'V_Z', 'VZ', 'VELOCITY_Z'],
        'velocity_mag': ['bulk_p', 'proton_bulk_speed', 'V_MAG', 'V_TOTAL', 'SW_SPEED', 'v_p'],
        'proton_density': ['numden_p', 'proton_density', 'N_P', 'PROTON_DENSITY', 'n_p'],
        'proton_temp': ['thermal_p', 'proton_thermal', 'T_P', 'PROTON_TEMP', 'v_t_p'],
        'alpha_density': ['numden_a', 'alpha_density', 'ALPHA_DENSITY', 'n_he'],
        'alpha_speed': ['bulk_a', 'alpha_bulk_speed', 'ALPHA_SPEED', 'v_a'],
        'alpha_temp': ['thermal_a', 'alpha_thermal', 'ALPHA_TEMP', 'v_t_a'],
        'proton_density_uncertainty': ['numden_p_uncer'],
        'proton_speed_uncertainty': ['bulk_p_uncer'],
        'proton_temp_uncertainty': ['thermal_p_uncer'],
        'alpha_density_uncertainty': ['numden_a_uncer'],
        'alpha_speed_uncertainty': ['bulk_a_uncer'],
        'alpha_temp_uncertainty': ['thermal_a_uncer'],
        'spacecraft_x': ['spacecraft_xpos', 'sc_pos_x'],
        'spacecraft_y': ['spacecraft_ypos', 'sc_pos_y'],
        'spacecraft_z': ['spacecraft_zpos', 'sc_pos_z']
    }

    # FITS header keywords that are not copied into the output.
    skipped_header_keys = {'SIMPLE', 'BITPIX', 'NAXIS', 'NAXIS1', 'NAXIS2',
                           'EXTEND', 'COMMENT', 'HISTORY', ''}

    def extract_timestamp_from_filename(filename):
        """Return the first valid calendar date in ``filename`` as YYYY-MM-DD, else None."""
        patterns = [
            r'(\d{4}-\d{2}-\d{2})',  # YYYY-MM-DD
            r'(\d{4}_\d{2}_\d{2})',  # YYYY_MM_DD
            r'(\d{8})',              # YYYYMMDD
        ]

        for pattern in patterns:
            # Look at every candidate: an arbitrary digit run (e.g. a counter or a
            # time-of-day field) can match before the real date does.
            for match in re.finditer(pattern, filename):
                date_str = match.group(1)
                if '_' in date_str:
                    date_str = date_str.replace('_', '-')
                elif len(date_str) == 8:
                    date_str = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:8]}"
                try:
                    datetime.strptime(date_str, "%Y-%m-%d")
                except ValueError:
                    continue  # digits that do not form a real date
                return date_str
        return None

    def aggregate_array_data(data, method="mean"):
        """Reduce ``data`` to one value.

        ``None`` and empty containers become NaN. Values that are neither a list
        nor an ndarray are returned unchanged. Numeric arrays are flattened,
        stripped of NaN/inf and reduced with ``method``; non-numeric arrays
        yield ``str`` of their first element.
        """
        if data is None:
            return np.nan

        # Scalars, strings and other plain values (e.g. FITS header cards) pass through.
        if not isinstance(data, (list, np.ndarray)):
            return data

        try:
            values = data if isinstance(data, np.ndarray) else np.array(data)
            # reshape(-1) flattens any rank and also turns a 0-d array into a
            # one-element array, which len()-based checks cannot handle.
            values = values.reshape(-1)
            if values.size == 0:
                return np.nan

            if np.issubdtype(values.dtype, np.number):
                finite_data = values[np.isfinite(values)]
                if finite_data.size == 0:
                    return np.nan
                if method == "first":
                    return finite_data[0]
                if method == "last":
                    return finite_data[-1]
                # "mean" and any unrecognised method
                return np.nanmean(finite_data)

            return str(values[0])
        except Exception as e:
            if verbose:
                print(f"Warning: Could not aggregate data: {e}")
            # Fall back to the first element of the original container.
            try:
                if len(data) > 0:
                    first = data[0]
                    return first if isinstance(first, (int, float, np.number)) else str(first)
                return np.nan
            except Exception:
                return str(data)

    def read_cdf_file(filepath):
        """Read every z/r variable of a CDF file into a ``{name: values}`` dict.

        Returns an empty dict when the file cannot be opened or inspected.
        Variables that fail to read individually are skipped.
        """
        try:
            cdf = cdflib.CDF(filepath)
            info = cdf.cdf_info()
            # cdf_info() is handled both as a mapping and as an object with
            # attributes, since its return type differs between cdflib releases.
            if isinstance(info, dict):
                all_vars = list(info.get('zVariables', [])) + list(info.get('rVariables', []))
            else:
                all_vars = list(info.zVariables) + list(info.rVariables)
        except Exception as e:
            if verbose:
                print(f"Error reading CDF file {filepath}: {e}")
            return {}

        data = {}
        for var in all_vars:
            try:
                var_data = cdf.varget(var)
                # Handle the case where varget returns None
                if var_data is not None:
                    data[var] = var_data
            except Exception as e:
                if verbose:
                    print(f"Warning: Could not read variable {var}: {e}")
                continue

        # close() is not available on every cdflib release; a missing or failing
        # close must not discard the variables that were already read.
        close = getattr(cdf, "close", None)
        if callable(close):
            try:
                close()
            except Exception as e:
                if verbose:
                    print(f"Warning: Could not close CDF file {filepath}: {e}")
        return data

    def read_fits_file(filepath):
        """Read table columns, image arrays and header cards from a FITS file.

        Table columns are stored under their column name, other array data as
        ``HDU_<index>_data`` and header cards as ``header_<KEY>``. Keys repeated
        in later HDUs overwrite earlier ones. Returns ``{}`` if the file cannot
        be opened.
        """
        try:
            data = {}
            with fits.open(filepath, ignore_missing_end=True) as hdul:
                for i, hdu in enumerate(hdul):
                    try:
                        if hasattr(hdu, 'data') and hdu.data is not None:
                            if hasattr(hdu.data, 'names') and hdu.data.names:
                                for name in hdu.data.names:
                                    try:
                                        column_data = hdu.data[name]
                                        # Handle masked arrays
                                        if hasattr(column_data, 'filled'):
                                            column_data = column_data.filled(np.nan)
                                        data[name] = column_data
                                    except Exception as e:
                                        if verbose:
                                            print(f"Warning: Could not read column {name}: {e}")
                                        continue
                            elif hasattr(hdu.data, 'shape') and len(hdu.data.shape) > 0:
                                data[f'HDU_{i}_data'] = hdu.data

                        if hasattr(hdu, 'header'):
                            for key in hdu.header.keys():
                                if key in skipped_header_keys:
                                    continue
                                try:
                                    data[f'header_{key}'] = hdu.header[key]
                                except Exception:
                                    # Unreadable card: skip it, keep the rest.
                                    continue
                    except Exception as e:
                        if verbose:
                            print(f"Warning: Could not process HDU {i}: {e}")
                        continue
            return data
        except Exception as e:
            if verbose:
                print(f"Error reading FITS file {filepath}: {e}")
            return {}

    def standardize_variable_names(data, variables_mapping):
        """Rename known aliases to their standard name; keep other variables as-is."""
        standardized_data = {}
        # Lowercased name -> original key, for case-insensitive alias lookup.
        data_keys_lower = {str(key).lower(): key for key in data.keys() if key is not None}

        for standard_name, possible_names in variables_mapping.items():
            for possible_name in possible_names:
                original_key = data_keys_lower.get(possible_name.lower())
                if original_key is not None:
                    standardized_data[standard_name] = data[original_key]
                    break

        # Variables that match no alias are carried over under their own name.
        # Variables matching a lower-priority alias of an already mapped
        # standard name are dropped.
        mapped_vars_lower = {
            name.lower()
            for possible_names in variables_mapping.values()
            for name in possible_names
        }

        for var_name, var_data in data.items():
            if var_name is not None and str(var_name).lower() not in mapped_vars_lower:
                standardized_data[var_name] = var_data

        return standardized_data

    def process_directory(directory, file_extensions, read_function):
        """Read every matching file in ``directory`` into ``{date: {variable: value}}``."""
        file_data = {}

        # isdir (not exists): a plain file path would make os.listdir raise.
        if not os.path.isdir(directory):
            print(f"❌ Warning: Directory {directory} does not exist")
            return file_data

        extensions = tuple(ext.lower() for ext in file_extensions)
        processed_count = 0
        # Sorted so that the file kept for a duplicated date is deterministic.
        for filename in sorted(os.listdir(directory)):
            if not filename.lower().endswith(extensions):
                continue

            timestamp = extract_timestamp_from_filename(filename)
            if not timestamp:
                if verbose:
                    print(f"⚠️ Warning: Could not extract timestamp from {filename}")
                continue

            filepath = os.path.join(directory, filename)
            if verbose:
                print(f"📁 Processing {filename} (timestamp: {timestamp})")

            raw_data = read_function(filepath)
            if not raw_data:
                if verbose:
                    print(f"⚠️ Warning: No data extracted from {filename}")
                continue

            standardized_data = standardize_variable_names(raw_data, variables)
            # Reduce every variable to a single value.
            aggregated = {
                key: aggregate_array_data(value, time_aggregation)
                for key, value in standardized_data.items()
            }

            if timestamp in file_data and verbose:
                print(f"⚠️ Warning: Multiple files for {timestamp}; "
                      f"{filename} replaces the earlier one")
            file_data[timestamp] = aggregated
            processed_count += 1

        if verbose:
            print(f"✅ Successfully processed {processed_count} files from {directory}")
        return file_data

    if verbose:
        print("🚀 Starting CDF and FITS File Merger")
        print("=" * 60)
    if not cdf_directory or not isinstance(cdf_directory, str):
        print("❌ Error: Invalid CDF directory path")
        return pd.DataFrame()

    if not fits_directory or not isinstance(fits_directory, str):
        print("❌ Error: Invalid FITS directory path")
        return pd.DataFrame()

    cdf_data = process_directory(cdf_directory, ['.cdf'], read_cdf_file)
    fits_data = process_directory(fits_directory, ['.fits', '.fit'], read_fits_file)

    if verbose:
        print(f"\n📊 Found {len(cdf_data)} CDF files with valid timestamps")
        print(f"📊 Found {len(fits_data)} FITS files with valid timestamps")

    common_timestamps = set(cdf_data.keys()) & set(fits_data.keys())
    if verbose:
        print(f"🔗 Found {len(common_timestamps)} common timestamps")

    # Outer join on date: a date present in only one source still gets a row.
    all_timestamps = sorted(set(cdf_data.keys()) | set(fits_data.keys()))
    merged_data = []

    for timestamp in all_timestamps:
        merged_row = {'date': timestamp}

        for key, value in cdf_data.get(timestamp, {}).items():
            merged_row[f'cdf_{key}'] = value

        for key, value in fits_data.get(timestamp, {}).items():
            merged_row[f'fits_{key}'] = value

        merged_data.append(merged_row)

    if merged_data:
        df = pd.DataFrame(merged_data)
        df = df.sort_values('date').reset_index(drop=True)
    else:
        df = pd.DataFrame()

    if verbose and len(df) > 0:
        print(f"\n✅ Created merged dataset with {len(df)} rows and {len(df.columns)} columns")
        print(f"📅 Date range: {df['date'].min()} to {df['date'].max()}")
    elif verbose:
        print("\n⚠️ No data was merged")

    if len(df) > 0:
        try:
            df.to_csv(output_csv, index=False)
            if verbose:
                print(f"💾 Data saved to {output_csv}")
        except Exception as e:
            print(f"❌ Error saving CSV file: {e}")
    else:
        if verbose:
            print("⚠️ No data to save to CSV")

    return df

# Progress marker: printed once execution reaches this point in the cell.
print("🎉 Merger function defined successfully!")


def check_directories(cdf_dir, fits_dir):
    """Print a short inventory of the CDF and FITS input directories.

    For each directory the number of matching files is printed, followed by up
    to five file names in sorted order. ``.cdf`` files are counted in
    ``cdf_dir`` and ``.fits`` / ``.fit`` files in ``fits_dir`` (suffix match is
    case-insensitive). A path that is missing, or that is not a directory, is
    reported as not found.
    """
    print("🔍 Checking directories...")

    preview_count = 5  # number of file names listed per directory
    targets = (
        ("CDF", cdf_dir, ('.cdf',)),
        ("FITS", fits_dir, ('.fits', '.fit')),
    )

    for label, directory, suffixes in targets:
        # isdir (not exists): os.listdir would raise on a plain file path.
        if not os.path.isdir(directory):
            print(f"❌ {label} directory not found: {directory}")
            continue

        # Sorted so the preview is stable between runs.
        names = sorted(f for f in os.listdir(directory) if f.lower().endswith(suffixes))
        print(f"\n📁 {label} Directory: {directory}")
        print(f"   Found {len(names)} {label} files:")
        for name in names[:preview_count]:
            print(f"   - {name}")
        if len(names) > preview_count:
            print(f"   ... and {len(names)-preview_count} more files")

def analyze_merged_data(df):
    """Print a summary of a merged CDF/FITS dataset.

    Reports the frame's shape, the span of the ``date`` column when present,
    how many columns carry the ``cdf_`` and ``fits_`` prefixes, and for a fixed
    list of standardized variables whether each one is present from CDF, from
    FITS, from both, or from neither. Prints a notice and returns when ``df``
    is ``None`` or has no rows.
    """
    if df is None or len(df) == 0:
        print("❌ No data to analyze")
        return

    print("📈 Dataset Analysis:")
    print(f"   Shape: {df.shape}")
    if 'date' in df.columns:
        earliest, latest = df['date'].min(), df['date'].max()
        print(f"   Date range: {earliest} to {latest}")

    n_cdf = sum(1 for col in df.columns if col.startswith('cdf_'))
    n_fits = sum(1 for col in df.columns if col.startswith('fits_'))
    print(f"   CDF variables: {n_cdf}")
    print(f"   FITS variables: {n_fits}")

    # (found in CDF columns, found in FITS columns) -> label shown to the user
    status_by_presence = {
        (True, True): "✅ Both CDF & FITS",
        (True, False): "🔵 CDF only",
        (False, True): "🟡 FITS only",
        (False, False): "❌ Not found",
    }

    print("\n🎯 Standardized variables found:")
    for var in ('timestamp', 'velocity_x', 'velocity_y', 'velocity_z',
                'proton_density', 'proton_temp', 'velocity_mag'):
        presence = (f'cdf_{var}' in df.columns, f'fits_{var}' in df.columns)
        print(f"   {var}: {status_by_presence[presence]}")

print("📝 Helper functions ready!")

# Usage banner: explains how to call the functions defined in this notebook.
print("🚀 READY TO USE!")
print("=" * 50)
print("\n1. UPDATE YOUR PATHS:")
print("   Replace the example paths below with your actual directories")
print("\n2. BASIC USAGE:")
print("   merged_df = merge_cdf_fits_data(cdf_path, fits_path)")
print("\n3. ADVANCED USAGE:")
print("   merged_df = merge_cdf_fits_data(")
print("       cdf_directory=cdf_path,")
print("       fits_directory=fits_path,")
print("       output_csv='output.csv',")
print("       time_aggregation='mean',")
print("       verbose=True")
print("   )")

# Input directories used by the template below; edit these for your machine.
CDF_DIRECTORY = r"D:\Code Rush\swis_2025Aug22T041821297"
FITS_DIRECTORY = r"D:\Code Rush\suit_2025Aug22T054108230"


print("\n📋 TEMPLATE CODE TO RUN:")
print("# Check directories first:")
# !r renders each path as a valid Python string literal (backslashes escaped),
# so the printed template can be pasted into a cell unchanged.
print(f"check_directories({CDF_DIRECTORY!r}, {FITS_DIRECTORY!r})")
print("\n# Run the merger:")
print(f"merged_df = merge_cdf_fits_data({CDF_DIRECTORY!r}, {FITS_DIRECTORY!r})")
print("\n# Analyze results:")
print("analyze_merged_data(merged_df)")

print("\n⚠️  REMEMBER TO:")
print("   - Update the directory paths above")
print("   - Make sure your files have a date in the filename (YYYY-MM-DD, YYYY_MM_DD or YYYYMMDD)")
print("   - Check that both directories exist and contain the right file types")

Merged data saved to merged_output.csv
Total rows: 1
