In [None]:
# Generate comprehensive dataset summary
summary_file = os.path.join(data_dir, 'dataset_summary.txt')

with open(summary_file, 'w') as f:
    f.write("="*70 + "\n")
    f.write("NeurIPS 2025 WEAK LENSING CHALLENGE - DATASET SUMMARY\n")
    f.write("="*70 + "\n\n")
    
    f.write("DATASET CONFIGURATION\n")
    f.write("-" * 70 + "\n")
    f.write(f"Expected Image Dimensions: {IMAGE_HEIGHT} × {IMAGE_WIDTH} pixels\n")
    f.write(f"Resolution: {RESOLUTION_ARCMIN} arcmin/pixel\n")
    f.write(f"Number of Cosmological Models: {NUM_COSMOLOGICAL_MODELS}\n")
    f.write(f"Field: Convergence map of redshift BIN 2 of WIDE12H subfield\n\n")
    
    f.write("FILES LOADED\n")
    f.write("-" * 70 + "\n")
    for name, path in files_to_check.items():
        exists = os.path.exists(path)
        f.write(f"{name}: {'Found' if exists else 'Not found'}\n")
        if exists:
            size_mb = os.path.getsize(path) / (1024 * 1024)
            f.write(f"  Size: {size_mb:.2f} MB\n")
    f.write("\n")
    
    if labels is not None:
        f.write("LABELS\n")
        f.write("-" * 70 + "\n")
        f.write(f"Shape: {labels.shape}\n")
        f.write(f"Data type: {labels.dtype}\n")
        if len(labels.shape) == 2:
            omega_m = labels[:, 0]
            s_8 = labels[:, 1]
            f.write(f"Number of samples: {labels.shape[0]}\n\n")
            
            f.write(f"Ω_m Statistics:\n")
            f.write(f"  Mean:   {np.mean(omega_m):.6f}\n")
            f.write(f"  Std:    {np.std(omega_m):.6f}\n")
            f.write(f"  Min:    {np.min(omega_m):.6f}\n")
            f.write(f"  Max:    {np.max(omega_m):.6f}\n")
            f.write(f"  Median: {np.median(omega_m):.6f}\n\n")
            
            f.write(f"S_8 Statistics:\n")
            f.write(f"  Mean:   {np.mean(s_8):.6f}\n")
            f.write(f"  Std:    {np.std(s_8):.6f}\n")
            f.write(f"  Min:    {np.min(s_8):.6f}\n")
            f.write(f"  Max:    {np.max(s_8):.6f}\n")
            f.write(f"  Median: {np.median(s_8):.6f}\n\n")
            
            correlation = np.corrcoef(omega_m, s_8)[0, 1]
            f.write(f"Correlation (Ω_m, S_8): {correlation:.6f}\n\n")
    
    if kappa is not None:
        f.write("CONVERGENCE MAPS (TRAINING)\n")
        f.write("-" * 70 + "\n")
        f.write(f"Shape: {kappa.shape}\n")
        f.write(f"Data type: {kappa.dtype}\n")
        f.write(f"Memory usage: {kappa.nbytes / (1024**2):.2f} MB\n\n")
        f.write(f"Global Statistics:\n")
        f.write(f"  Mean: {np.mean(kappa):.6f}\n")
        f.write(f"  Std:  {np.std(kappa):.6f}\n")
        f.write(f"  Min:  {np.min(kappa):.6f}\n")
        f.write(f"  Max:  {np.max(kappa):.6f}\n")
        f.write(f"  Contains NaN: {np.any(np.isnan(kappa))}\n")
        f.write(f"  Contains Inf: {np.any(np.isinf(kappa))}\n\n")
        
        if len(kappa.shape) == 3:
            sample_means = np.mean(kappa, axis=(1, 2))
            sample_stds = np.std(kappa, axis=(1, 2))
            f.write(f"Per-Sample Statistics:\n")
            f.write(f"  Mean of means: {np.mean(sample_means):.6f}\n")
            f.write(f"  Mean of stds:  {np.mean(sample_stds):.6f}\n\n")
    
    if kappa_test is not None:
        f.write("CONVERGENCE MAPS (TEST - NOISY)\n")
        f.write("-" * 70 + "\n")
        f.write(f"Shape: {kappa_test.shape}\n")
        f.write(f"Data type: {kappa_test.dtype}\n")
        f.write(f"Memory usage: {kappa_test.nbytes / (1024**2):.2f} MB\n\n")
        f.write(f"Global Statistics:\n")
        f.write(f"  Mean: {np.mean(kappa_test):.6f}\n")
        f.write(f"  Std:  {np.std(kappa_test):.6f}\n")
        f.write(f"  Min:  {np.min(kappa_test):.6f}\n")
        f.write(f"  Max:  {np.max(kappa_test):.6f}\n\n")
    
    if mask is not None:
        f.write("MASK\n")
        f.write("-" * 70 + "\n")
        f.write(f"Shape: {mask.shape}\n")
        f.write(f"Data type: {mask.dtype}\n")
        f.write(f"Unique values: {np.unique(mask)}\n")
        if len(mask.shape) == 2:
            valid_pixels = np.sum(mask > 0)
            total_pixels = mask.size
            f.write(f"Valid pixels: {valid_pixels:,} ({100*valid_pixels/total_pixels:.2f}%)\n")
            f.write(f"Invalid pixels: {total_pixels - valid_pixels:,}\n\n")
    
    f.write("="*70 + "\n")
    f.write("Analysis completed successfully!\n")
    f.write(f"Generated: {np.datetime64('now')}\n")
    f.write("="*70 + "\n")

print(f"\nDataset summary saved to: {summary_file}")
print("\n" + "="*70)
print("DATASET ANALYSIS COMPLETE!")
print("="*70)

## 10. Save Dataset Summary

In [None]:
if mask is not None:
    print("="*60)
    print("MASK ANALYSIS")
    print("="*60)
    
    if len(mask.shape) == 2:
        # Single mask
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))
        
        # Visualize mask
        im = axes[0].imshow(mask, cmap='gray', aspect='auto')
        axes[0].set_title('Survey Mask', fontsize=13)
        axes[0].set_xlabel('Width (pixels)')
        axes[0].set_ylabel('Height (pixels)')
        plt.colorbar(im, ax=axes[0], label='Mask Value')
        
        # Show masked region on a sample kappa map
        if kappa is not None:
            if len(kappa.shape) == 3:
                sample_map = kappa[0].copy()
            else:
                sample_map = kappa.copy()
            
            masked_kappa = np.ma.masked_where(mask == 0, sample_map)
            im2 = axes[1].imshow(masked_kappa, cmap='RdBu_r', aspect='auto')
            axes[1].set_title('Convergence Map with Mask Applied', fontsize=13)
            axes[1].set_xlabel('Width (pixels)')
            axes[1].set_ylabel('Height (pixels)')
            plt.colorbar(im2, ax=axes[1], label='κ (masked)')
        
        plt.suptitle('Survey Mask Visualization', fontsize=14)
        plt.tight_layout()
        plt.show()
        
        # Mask coverage statistics
        valid_fraction = np.sum(mask > 0) / mask.size
        print(f"\nMask Coverage:")
        print(f"  Valid pixels:   {np.sum(mask > 0):,}")
        print(f"  Invalid pixels: {np.sum(mask == 0):,}")
        print(f"  Total pixels:   {mask.size:,}")
        print(f"  Valid fraction: {valid_fraction:.4%}")
        
    print("="*60)
else:
    print("No mask data available for analysis.")

## 9. Visualize Mask Properties

In [None]:
if kappa is not None and kappa_test is not None:
    print("="*60)
    print("TRAINING vs TEST SET COMPARISON")
    print("="*60)
    
    # Check shapes
    print(f"\nShape Comparison:")
    print(f"  Training: {kappa.shape}")
    print(f"  Test:     {kappa_test.shape}")
    
    if len(kappa.shape) == 3 and len(kappa_test.shape) == 3:
        print(f"\n  Training samples: {kappa.shape[0]}")
        print(f"  Test samples:     {kappa_test.shape[0]}")
        
        # Statistical comparison
        train_mean = np.mean(kappa)
        train_std = np.std(kappa)
        test_mean = np.mean(kappa_test)
        test_std = np.std(kappa_test)
        
        print(f"\nGlobal Statistics:")
        print(f"  Training - Mean: {train_mean:.6f}, Std: {train_std:.6f}")
        print(f"  Test     - Mean: {test_mean:.6f}, Std: {test_std:.6f}")
        print(f"  Difference (Mean): {abs(train_mean - test_mean):.6f}")
        print(f"  Difference (Std):  {abs(train_std - test_std):.6f}")
        
        # Distribution comparison
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
        
        # Histogram comparison
        axes[0].hist(kappa.flatten(), bins=100, alpha=0.5, label='Training', 
                    density=True, edgecolor='none')
        axes[0].hist(kappa_test.flatten(), bins=100, alpha=0.5, label='Test (Noisy)', 
                    density=True, edgecolor='none')
        axes[0].set_xlabel('κ (Convergence Value)')
        axes[0].set_ylabel('Density')
        axes[0].set_title('Pixel Value Distribution Comparison')
        axes[0].legend()
        axes[0].set_yscale('log')
        
        # Box plot comparison
        data_to_plot = [kappa.flatten()[::100], kappa_test.flatten()[::100]]  # Subsample for speed
        axes[1].boxplot(data_to_plot, labels=['Training', 'Test (Noisy)'])
        axes[1].set_ylabel('κ (Convergence Value)')
        axes[1].set_title('Distribution Box Plot Comparison')
        axes[1].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()
        
        # Per-sample mean comparison
        if kappa.shape[0] > 0 and kappa_test.shape[0] > 0:
            train_sample_means = np.mean(kappa, axis=(1, 2))
            test_sample_means = np.mean(kappa_test, axis=(1, 2))
            
            fig, ax = plt.subplots(1, 1, figsize=(12, 6))
            ax.hist(train_sample_means, bins=30, alpha=0.6, label='Training', 
                   edgecolor='black')
            ax.hist(test_sample_means, bins=30, alpha=0.6, label='Test (Noisy)', 
                   edgecolor='black')
            ax.set_xlabel('Mean κ per Sample')
            ax.set_ylabel('Frequency')
            ax.set_title('Per-Sample Mean Comparison: Training vs Test')
            ax.legend()
            plt.tight_layout()
            plt.show()
    
    print("="*60)
else:
    print("Both training and test data needed for comparison.")

## 8. Compare Training and Test Set Properties

In [None]:
# Visualize sample test maps
if kappa_test is not None:
    if len(kappa_test.shape) == 3:
        n_samples = min(4, kappa_test.shape[0])
        fig, axes = plt.subplots(1, n_samples, figsize=(16, 4))
        if n_samples == 1:
            axes = [axes]
        
        for i in range(n_samples):
            im = axes[i].imshow(kappa_test[i], cmap='RdBu_r', aspect='auto')
            axes[i].set_title(f'Test Sample {i+1}', fontsize=11)
            axes[i].set_xlabel('Width')
            axes[i].set_ylabel('Height')
            plt.colorbar(im, ax=axes[i], label='κ')
        
        plt.suptitle('Sample Test Convergence Maps (Noisy)', fontsize=14)
        plt.tight_layout()
        plt.show()
    elif len(kappa_test.shape) == 2:
        fig, ax = plt.subplots(1, 1, figsize=(12, 8))
        im = ax.imshow(kappa_test, cmap='RdBu_r', aspect='auto')
        ax.set_title('Test Convergence Map (Noisy)', fontsize=14)
        ax.set_xlabel('Width (pixels)')
        ax.set_ylabel('Height (pixels)')
        plt.colorbar(im, ax=ax, label='κ (convergence)')
        plt.tight_layout()
        plt.show()
else:
    print("No test data to visualize.")

In [None]:
# Load noisy test data
if os.path.exists(kappa_noisy_test_file):
    kappa_test = np.load(kappa_noisy_test_file)
    print("Test data (noisy) loaded successfully!")
    print(f"  Shape: {kappa_test.shape}")
    print(f"  Data type: {kappa_test.dtype}")
    print(f"  Memory usage: {kappa_test.nbytes / (1024**2):.2f} MB")
    print(f"\nStatistics:")
    print(f"  Min: {np.min(kappa_test):.6f}")
    print(f"  Max: {np.max(kappa_test):.6f}")
    print(f"  Mean: {np.mean(kappa_test):.6f}")
    print(f"  Std: {np.std(kappa_test):.6f}")
    print(f"  Contains NaN: {np.any(np.isnan(kappa_test))}")
    print(f"  Contains Inf: {np.any(np.isinf(kappa_test))}")
else:
    print("Test data file not found!")
    kappa_test = None

## 7. Load and Explore Test Data

In [None]:
if kappa is not None:
    print("="*60)
    print("CONVERGENCE MAP STATISTICS")
    print("="*60)
    
    if len(kappa.shape) == 3:
        n_samples = kappa.shape[0]
        print(f"\nNumber of samples: {n_samples}")
        print(f"Image dimensions: {kappa.shape[1]} × {kappa.shape[2]}")
        
        # Compute statistics for each sample
        sample_means = np.mean(kappa, axis=(1, 2))
        sample_stds = np.std(kappa, axis=(1, 2))
        sample_skewness = stats.skew(kappa.reshape(n_samples, -1), axis=1)
        sample_kurtosis = stats.kurtosis(kappa.reshape(n_samples, -1), axis=1)
        
        print(f"\nPer-Sample Statistics (across {n_samples} samples):")
        print(f"  Mean values:")
        print(f"    Mean: {np.mean(sample_means):.6f}")
        print(f"    Std:  {np.std(sample_means):.6f}")
        print(f"    Range: [{np.min(sample_means):.6f}, {np.max(sample_means):.6f}]")
        
        print(f"\n  Standard deviations:")
        print(f"    Mean: {np.mean(sample_stds):.6f}")
        print(f"    Std:  {np.std(sample_stds):.6f}")
        print(f"    Range: [{np.min(sample_stds):.6f}, {np.max(sample_stds):.6f}]")
        
        print(f"\n  Skewness:")
        print(f"    Mean: {np.mean(sample_skewness):.6f}")
        print(f"    Std:  {np.std(sample_skewness):.6f}")
        
        print(f"\n  Kurtosis:")
        print(f"    Mean: {np.mean(sample_kurtosis):.6f}")
        print(f"    Std:  {np.std(sample_kurtosis):.6f}")
        
        # Visualize distributions
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        
        axes[0, 0].hist(sample_means, bins=30, alpha=0.7, edgecolor='black')
        axes[0, 0].set_xlabel('Mean κ per Sample')
        axes[0, 0].set_ylabel('Frequency')
        axes[0, 0].set_title('Distribution of Sample Means')
        
        axes[0, 1].hist(sample_stds, bins=30, alpha=0.7, edgecolor='black', color='orange')
        axes[0, 1].set_xlabel('Std Dev κ per Sample')
        axes[0, 1].set_ylabel('Frequency')
        axes[0, 1].set_title('Distribution of Sample Standard Deviations')
        
        axes[1, 0].hist(sample_skewness, bins=30, alpha=0.7, edgecolor='black', color='green')
        axes[1, 0].set_xlabel('Skewness per Sample')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].set_title('Distribution of Sample Skewness')
        
        axes[1, 1].hist(sample_kurtosis, bins=30, alpha=0.7, edgecolor='black', color='red')
        axes[1, 1].set_xlabel('Kurtosis per Sample')
        axes[1, 1].set_ylabel('Frequency')
        axes[1, 1].set_title('Distribution of Sample Kurtosis')
        
        plt.suptitle('Statistical Properties of Convergence Maps', fontsize=15)
        plt.tight_layout()
        plt.show()
        
        # Pixel value distribution (pooled across all samples)
        fig, ax = plt.subplots(1, 1, figsize=(12, 6))
        ax.hist(kappa.flatten(), bins=100, alpha=0.7, edgecolor='black', log=True)
        ax.set_xlabel('κ (Convergence Value)')
        ax.set_ylabel('Frequency (log scale)')
        ax.set_title('Overall Pixel Value Distribution (All Samples)')
        ax.axvline(0, color='red', linestyle='--', label='Zero')
        ax.legend()
        plt.tight_layout()
        plt.show()
        
    elif len(kappa.shape) == 2:
        # Single map statistics
        print(f"\nImage dimensions: {kappa.shape[0]} × {kappa.shape[1]}")
        print(f"Total pixels: {kappa.size}")
        
        print(f"\nStatistics:")
        print(f"  Mean:     {np.mean(kappa):.6f}")
        print(f"  Std Dev:  {np.std(kappa):.6f}")
        print(f"  Skewness: {stats.skew(kappa.flatten()):.6f}")
        print(f"  Kurtosis: {stats.kurtosis(kappa.flatten()):.6f}")
        
        # Pixel value distribution
        fig, ax = plt.subplots(1, 1, figsize=(12, 6))
        ax.hist(kappa.flatten(), bins=100, alpha=0.7, edgecolor='black')
        ax.set_xlabel('κ (Convergence Value)')
        ax.set_ylabel('Frequency')
        ax.set_title('Pixel Value Distribution')
        ax.axvline(0, color='red', linestyle='--', label='Zero')
        ax.axvline(np.mean(kappa), color='green', linestyle='--', label='Mean')
        ax.legend()
        plt.tight_layout()
        plt.show()
    
    print("="*60)
else:
    print("No convergence maps available for statistical analysis.")

## 6. Examine Data Statistics

In [None]:
if labels is not None and len(labels.shape) == 2:
    # Extract Omega_m and S_8
    omega_m = labels[:, 0]
    s_8 = labels[:, 1]
    
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    
    # Histogram for Omega_m
    axes[0, 0].hist(omega_m, bins=30, alpha=0.7, edgecolor='black')
    axes[0, 0].set_xlabel('Ω_m (Matter Density Fraction)', fontsize=12)
    axes[0, 0].set_ylabel('Frequency', fontsize=12)
    axes[0, 0].set_title('Distribution of Ω_m', fontsize=13)
    axes[0, 0].axvline(np.mean(omega_m), color='red', linestyle='--', 
                       label=f'Mean: {np.mean(omega_m):.4f}')
    axes[0, 0].legend()
    
    # Histogram for S_8
    axes[0, 1].hist(s_8, bins=30, alpha=0.7, edgecolor='black', color='orange')
    axes[0, 1].set_xlabel('S_8 (Matter Fluctuation Amplitude)', fontsize=12)
    axes[0, 1].set_ylabel('Frequency', fontsize=12)
    axes[0, 1].set_title('Distribution of S_8', fontsize=13)
    axes[0, 1].axvline(np.mean(s_8), color='red', linestyle='--', 
                       label=f'Mean: {np.mean(s_8):.4f}')
    axes[0, 1].legend()
    
    # Scatter plot: Omega_m vs S_8
    scatter = axes[1, 0].scatter(omega_m, s_8, alpha=0.6, s=20, c=range(len(omega_m)), 
                                 cmap='viridis')
    axes[1, 0].set_xlabel('Ω_m', fontsize=12)
    axes[1, 0].set_ylabel('S_8', fontsize=12)
    axes[1, 0].set_title('Joint Distribution: Ω_m vs S_8', fontsize=13)
    axes[1, 0].grid(True, alpha=0.3)
    plt.colorbar(scatter, ax=axes[1, 0], label='Sample Index')
    
    # 2D histogram / density plot
    axes[1, 1].hist2d(omega_m, s_8, bins=20, cmap='YlOrRd')
    axes[1, 1].set_xlabel('Ω_m', fontsize=12)
    axes[1, 1].set_ylabel('S_8', fontsize=12)
    axes[1, 1].set_title('2D Density: Ω_m vs S_8', fontsize=13)
    
    plt.suptitle('Cosmological Parameter Distributions', fontsize=15, y=1.00)
    plt.tight_layout()
    plt.show()
    
    # Print summary statistics
    print("\n" + "="*60)
    print("COSMOLOGICAL PARAMETER STATISTICS")
    print("="*60)
    print(f"\nΩ_m (Matter Density Fraction):")
    print(f"  Mean:       {np.mean(omega_m):.6f}")
    print(f"  Std Dev:    {np.std(omega_m):.6f}")
    print(f"  Min:        {np.min(omega_m):.6f}")
    print(f"  Max:        {np.max(omega_m):.6f}")
    print(f"  Median:     {np.median(omega_m):.6f}")
    print(f"  25th %ile:  {np.percentile(omega_m, 25):.6f}")
    print(f"  75th %ile:  {np.percentile(omega_m, 75):.6f}")
    
    print(f"\nS_8 (Matter Fluctuation Amplitude):")
    print(f"  Mean:       {np.mean(s_8):.6f}")
    print(f"  Std Dev:    {np.std(s_8):.6f}")
    print(f"  Min:        {np.min(s_8):.6f}")
    print(f"  Max:        {np.max(s_8):.6f}")
    print(f"  Median:     {np.median(s_8):.6f}")
    print(f"  25th %ile:  {np.percentile(s_8, 25):.6f}")
    print(f"  75th %ile:  {np.percentile(s_8, 75):.6f}")
    
    # Correlation
    correlation = np.corrcoef(omega_m, s_8)[0, 1]
    print(f"\nCorrelation (Ω_m, S_8): {correlation:.6f}")
    print("="*60)
else:
    print("Labels not available or in unexpected format.")

## 5. Analyze Cosmological Parameter Distribution

In [None]:
if kappa is not None:
    # Determine the shape and select samples to visualize
    if len(kappa.shape) == 3:
        # Multiple samples: (n_samples, height, width)
        n_samples = min(6, kappa.shape[0])
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))
        axes = axes.flatten()
        
        for i in range(n_samples):
            im = axes[i].imshow(kappa[i], cmap='RdBu_r', aspect='auto')
            if labels is not None and len(labels.shape) == 2:
                axes[i].set_title(f'Sample {i+1}\nΩ_m={labels[i,0]:.4f}, S_8={labels[i,1]:.4f}', 
                                fontsize=10)
            else:
                axes[i].set_title(f'Sample {i+1}', fontsize=10)
            axes[i].set_xlabel('Width (pixels)')
            axes[i].set_ylabel('Height (pixels)')
            plt.colorbar(im, ax=axes[i], label='κ (convergence)')
        
        plt.suptitle('Sample Weak Lensing Convergence Maps', fontsize=14, y=1.00)
        plt.tight_layout()
        plt.show()
        
    elif len(kappa.shape) == 2:
        # Single map: (height, width)
        fig, ax = plt.subplots(1, 1, figsize=(12, 8))
        im = ax.imshow(kappa, cmap='RdBu_r', aspect='auto')
        ax.set_title('Weak Lensing Convergence Map', fontsize=14)
        ax.set_xlabel('Width (pixels)')
        ax.set_ylabel('Height (pixels)')
        plt.colorbar(im, ax=ax, label='κ (convergence)')
        plt.tight_layout()
        plt.show()
else:
    print("No convergence maps to visualize.")

## 4. Visualize Sample Convergence Maps

In [None]:
# Load mask
if os.path.exists(mask_file):
    mask = np.load(mask_file)
    print("Mask loaded successfully!")
    print(f"  Shape: {mask.shape}")
    print(f"  Data type: {mask.dtype}")
    print(f"\nMask Statistics:")
    print(f"  Unique values: {np.unique(mask)}")
    if len(mask.shape) == 2:
        valid_pixels = np.sum(mask > 0)
        total_pixels = mask.shape[0] * mask.shape[1]
        print(f"  Valid pixels: {valid_pixels} ({100*valid_pixels/total_pixels:.2f}%)")
        print(f"  Total pixels: {total_pixels}")
else:
    print("Mask file not found!")
    mask = None

In [4]:
# Load convergence maps (kappa)
if os.path.exists(kappa_file):
    kappa = np.load(kappa_file)
    print("Convergence maps (kappa) loaded successfully!")
    print(f"  Shape: {kappa.shape}")
    print(f"  Data type: {kappa.dtype}")
    print(f"  Memory usage: {kappa.nbytes / (1024**2):.2f} MB")
    print(f"\nStatistics:")
    print(f"  Min: {np.min(kappa):.6f}")
    print(f"  Max: {np.max(kappa):.6f}")
    print(f"  Mean: {np.mean(kappa):.6f}")
    print(f"  Std: {np.std(kappa):.6f}")
    print(f"  Contains NaN: {np.any(np.isnan(kappa))}")
    print(f"  Contains Inf: {np.any(np.isinf(kappa))}")
else:
    print("Convergence maps file not found!")
    kappa = None

Convergence maps (kappa) loaded successfully!
  Shape: (101, 256, 132019)
  Data type: float16
  Memory usage: 6510.70 MB

Statistics:
  Min: -0.120117
  Min: -0.120117
  Max: 1.766602
  Max: 1.766602
  Mean: -0.000273
  Mean: -0.000273
  Std: inf
  Std: inf
  Contains NaN: False
  Contains NaN: False
  Contains Inf: False
  Contains Inf: False


In [3]:
# Load labels
if os.path.exists(label_file):
    labels = np.load(label_file)
    print("Labels loaded successfully!")
    print(f"  Shape: {labels.shape}")
    print(f"  Data type: {labels.dtype}")
    print(f"  Number of samples: {labels.shape[0] if len(labels.shape) > 0 else 'N/A'}")
    if len(labels.shape) == 2:
        print(f"  Number of parameters: {labels.shape[1]}")
        print(f"\nFirst 5 samples:")
        print(labels[:5])
    else:
        print(f"  First 10 values: {labels[:10]}")
else:
    print("Labels file not found!")
    labels = None

Labels loaded successfully!
  Shape: (101, 256, 5)
  Data type: float64
  Number of samples: 101
  First 10 values: [[[ 3.00000000e-01  8.00000000e-01  8.30528666e+00  2.45921191e-02
    8.80345858e-03]
  [ 3.00000000e-01  8.00000000e-01  7.78855672e+00  4.88651170e-03
   -1.34586411e-02]
  [ 3.00000000e-01  8.00000000e-01  7.50279072e+00  1.50710039e-02
   -1.23787020e-03]
  ...
  [ 3.00000000e-01  8.00000000e-01  7.50599899e+00  2.31751372e-03
   -2.48642935e-02]
  [ 3.00000000e-01  8.00000000e-01  7.77596020e+00  1.74759659e-02
    4.25011182e-03]
  [ 3.00000000e-01  8.00000000e-01  8.30893765e+00  1.10575475e-02
   -4.33534023e-03]]

 [[ 1.95000000e-01  9.55216298e-01  8.30528666e+00  2.45921191e-02
    8.80345858e-03]
  [ 1.95000000e-01  9.55216298e-01  7.78855672e+00  4.88651170e-03
   -1.34586411e-02]
  [ 1.95000000e-01  9.55216298e-01  7.50279072e+00  1.50710039e-02
   -1.23787020e-03]
  ...
  [ 1.95000000e-01  9.55216298e-01  7.50599899e+00  2.31751372e-03
   -2.48642935e-02]


## 3. Load and Explore Training Data

In [2]:
# Data directory
data_dir = r'c:\ML\Challenges\NeurIPS_2025'

# File paths
label_file = os.path.join(data_dir, 'label.npy')
kappa_file = os.path.join(data_dir, 'WIDE12H_bin2_2arcmin_kappa.npy')
kappa_noisy_test_file = os.path.join(data_dir, 'WIDE12H_bin2_2arcmin_kappa_noisy_test.npy')
mask_file = os.path.join(data_dir, 'WIDE12H_bin2_2arcmin_mask.npy')

# Configuration
IMAGE_HEIGHT = 1424
IMAGE_WIDTH = 176
RESOLUTION_ARCMIN = 2
NUM_COSMOLOGICAL_MODELS = 101

# Check which files exist
files_to_check = {
    'Labels': label_file,
    'Kappa (convergence maps)': kappa_file,
    'Kappa Noisy Test': kappa_noisy_test_file,
    'Mask': mask_file
}

print("Dataset Configuration:")
print(f"  Image Dimensions: {IMAGE_HEIGHT} × {IMAGE_WIDTH} pixels")
print(f"  Resolution: {RESOLUTION_ARCMIN} arcmin/pixel")
print(f"  Cosmological Models: {NUM_COSMOLOGICAL_MODELS}")
print("\nFile Availability:")
for name, path in files_to_check.items():
    exists = os.path.exists(path)
    print(f"  {name}: {'✓ Found' if exists else '✗ Not found'}")
    if exists:
        size_mb = os.path.getsize(path) / (1024 * 1024)
        print(f"    Size: {size_mb:.2f} MB")

Dataset Configuration:
  Image Dimensions: 1424 × 176 pixels
  Resolution: 2 arcmin/pixel
  Cosmological Models: 101

File Availability:
  Labels: ✓ Found
    Size: 0.99 MB
  Kappa (convergence maps): ✓ Found
    Size: 6510.70 MB
  Kappa Noisy Test: ✓ Found
    Size: 1007.23 MB
  Mask: ✓ Found
    Size: 0.24 MB


## 2. Define Data Paths and Configuration

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Libraries imported successfully!")
print(f"NumPy version: {np.__version__}")

Libraries imported successfully!
NumPy version: 1.26.4


## 1. Import Required Libraries

# NeurIPS 2025 Weak Lensing Challenge - Dataset Analysis

This notebook analyzes the weak lensing convergence map dataset from the Hyper Suprime-Cam (HSC) survey simulation for cosmological parameter estimation.

## Dataset Overview
- **Image Dimensions**: 1424 × 176 pixels
- **Resolution**: 2 arcmin per pixel
- **Field**: Convergence map of redshift BIN 2 of WIDE12H subfield in HSC Y3
- **Cosmological Models**: 101 different spatially-flat ΛCDM models
- **Target Parameters**: Ω_m (matter density fraction) and S_8 (amplitude of matter fluctuations)