In [1]:
import pandas as pd 
import numpy as np 
import os 
import h5py 
from pathlib import Path
import gc

In [2]:
file_path = "/home/sid/DeepView/team_aase/code/processed_scans.h5"

with h5py.File(file_path, 'r') as hdf:
    # List all datasets in the HDF5 file
    print("Datasets in HDF5 file:", list(hdf.keys()))

    # Inspect a specific dataset, e.g., first dataset
    first_dataset_name = list(hdf.keys())[0]  # Access the first dataset name
    data = hdf[first_dataset_name][:]  # Load the data from the first dataset
    print("Shape of first dataset:", data.shape)

Datasets in HDF5 file: ['scan_001.npy', 'scan_002.npy', 'scan_003.npy', 'scan_004.npy', 'scan_005.npy', 'scan_007.npy', 'scan_008.npy', 'scan_009.npy', 'scan_010.npy', 'scan_011.npy', 'scan_012.npy', 'scan_013.npy', 'scan_014.npy', 'scan_015.npy', 'scan_016.npy', 'scan_017.npy', 'scan_018.npy', 'scan_019.npy', 'scan_020.npy', 'scan_021.npy', 'scan_022.npy', 'scan_023.npy', 'scan_024.npy', 'scan_025.npy', 'scan_026.npy', 'scan_027.npy', 'scan_028.npy', 'scan_029.npy', 'scan_030.npy', 'scan_031.npy', 'scan_032.npy', 'scan_033.npy', 'scan_034.npy', 'scan_035.npy', 'scan_036.npy', 'scan_037.npy', 'scan_038.npy', 'scan_039.npy', 'scan_040.npy', 'scan_041.npy', 'scan_042.npy', 'scan_043.npy', 'scan_044.npy', 'scan_045.npy', 'scan_046.npy', 'scan_047.npy', 'scan_048.npy', 'scan_049.npy', 'scan_050.npy', 'scan_051.npy', 'scan_052.npy', 'scan_053.npy', 'scan_054.npy', 'scan_055.npy', 'scan_056.npy', 'scan_057.npy', 'scan_058.npy', 'scan_059.npy', 'scan_060.npy', 'scan_061.npy', 'scan_062.npy', 

In [2]:
def npy_files_to_df(directory_path, batch_size=10, save_intermediate=True, output_path='output.csv'):
    dir_path = Path(directory_path)
    
    # Get all .npy files in the directory
    npy_files = list(dir_path.glob('*.npy'))
    
    if not npy_files:
        raise ValueError(f"No .npy files found in {directory_path}")
    
    # Process files in batches
    for i in range(0, len(npy_files), batch_size):
        batch_files = npy_files[i:i + batch_size]
        batch_data = []
        
        print(f"Processing batch {i//batch_size + 1}/{(len(npy_files)-1)//batch_size + 1}")
        
        # Process each file in the current batch
        for file_path in batch_files:
            try:
                # Load and process one file at a time
                data = np.load(file_path)
                
                # Get shape information before flattening
                shape_info = data.shape
                
                # Create row dict with filename and shape info
                row_dict = {
                    'filename': file_path.name,
                    'original_shape': str(shape_info),
                    'total_points': data.size
                }
                
                # Instead of storing all values, calculate some summary statistics
                row_dict.update({
                    'mean': np.mean(data),
                    'std': np.std(data),
                    'min': np.min(data),
                    'max': np.max(data),
                    'median': np.median(data)
                })
                
                # If you need actual mesh data points, you can add them selectively
                # For example, store only the first 1000 points:
                flattened = data.ravel()
                for j, val in enumerate(flattened[:1000]):
                    row_dict[f'point_{j}'] = val
                
                batch_data.append(row_dict)
                
                # Clear memory
                del data
                del flattened
                gc.collect()
                
            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
                continue
        
        # Create DataFrame from batch
        df_batch = pd.DataFrame(batch_data)
        
        # Save intermediate results if requested
        if save_intermediate:
            mode = 'w' if i == 0 else 'a'
            header = i == 0
            df_batch.to_csv(output_path, mode=mode, header=header, index=False)
            print(f"Saved batch to {output_path}")
        
        # If not saving intermediate results, return the final DataFrame
        if not save_intermediate and i + batch_size >= len(npy_files):
            return df_batch
        
        # Clear memory
        del df_batch
        gc.collect()
    
    print("Processing complete!")
    
    if save_intermediate:
        print(f"Final results saved to: {output_path}")
        return pd.read_csv(output_path)

In [3]:
npy_files_to_df("/home/sid/DeepView/team_aase/code/processed_scans")

Processing batch 1/9
Saved batch to output.csv
Processing batch 2/9
Saved batch to output.csv
Processing batch 3/9
Saved batch to output.csv
Processing batch 4/9
Saved batch to output.csv
Processing batch 5/9
Saved batch to output.csv
Processing batch 6/9
Saved batch to output.csv
Processing batch 7/9
Saved batch to output.csv
Processing batch 8/9
Saved batch to output.csv
Processing batch 9/9
Saved batch to output.csv
Processing complete!
Final results saved to: output.csv


Unnamed: 0,filename,original_shape,total_points,mean,std,min,max,median,point_0,point_1,...,point_990,point_991,point_992,point_993,point_994,point_995,point_996,point_997,point_998,point_999
0,scan_088.npy,"(1280, 768, 768)",754974720,20.040806,476.423220,0.0,42062.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,scan_020.npy,"(1280, 768, 768)",754974720,11.838695,245.777600,0.0,34938.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,scan_003.npy,"(1280, 768, 768)",754974720,15.833298,320.785950,0.0,38782.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,scan_015.npy,"(1280, 768, 768)",754974720,58.582146,441.118130,0.0,36239.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,scan_032.npy,"(1280, 768, 768)",754974720,1.046440,65.670166,0.0,29785.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,scan_071.npy,"(1280, 768, 768)",754974720,10.143373,221.889720,0.0,33784.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,scan_012.npy,"(1280, 768, 768)",754974720,31.033009,216.814200,0.0,37407.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,scan_080.npy,"(1280, 768, 768)",754974720,11.671384,261.684600,0.0,38390.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,scan_050.npy,"(1280, 768, 768)",754974720,7.075703,157.696870,0.0,30894.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
pd.read_csv('output.csv')

Unnamed: 0,filename,original_shape,total_points,mean,std,min,max,median,point_0,point_1,...,point_990,point_991,point_992,point_993,point_994,point_995,point_996,point_997,point_998,point_999
0,scan_088.npy,"(1280, 768, 768)",754974720,20.040806,476.423220,0.0,42062.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,scan_020.npy,"(1280, 768, 768)",754974720,11.838695,245.777600,0.0,34938.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,scan_003.npy,"(1280, 768, 768)",754974720,15.833298,320.785950,0.0,38782.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,scan_015.npy,"(1280, 768, 768)",754974720,58.582146,441.118130,0.0,36239.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,scan_032.npy,"(1280, 768, 768)",754974720,1.046440,65.670166,0.0,29785.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,scan_071.npy,"(1280, 768, 768)",754974720,10.143373,221.889720,0.0,33784.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,scan_012.npy,"(1280, 768, 768)",754974720,31.033009,216.814200,0.0,37407.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,scan_080.npy,"(1280, 768, 768)",754974720,11.671384,261.684600,0.0,38390.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,scan_050.npy,"(1280, 768, 768)",754974720,7.075703,157.696870,0.0,30894.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
