In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Define paths and parameters
base_path = Path('./clin_test_100k')
# d_low / d_high are only used below to build the shard file names
# (presumably resolution cutoffs -- TODO confirm meaning and units).
d_low = 1.5
d_high = 0.8
n_files = 10  # Number of files to process

# Create list of file paths: clin{d_low}_{d_high}_{i}.npz for i in 0..n_files-1
names = [base_path / f'clin{d_low}_{d_high}_{i}.npz' for i in range(n_files)]

# Maps each key of the per-structure unit-cell dict to the
# `structure_params` field it is copied from. Insertion order is kept so
# the resulting dicts have the same key order as before.
UNIT_CELL_FIELDS = {
    'a': 'cell_a',
    'b': 'cell_b',
    'c': 'cell_c',
    'alpha': 'cell_alpha',
    'beta': 'cell_beta',
    'gamma': 'cell_gamma',
    'volume': 'cell_volume',
    'n_atoms': 'structure_n_atoms',
}

# Initialize lists to store data; all five lists grow in lock-step,
# one element per structure entry.
data_lists = {
    'I_high': [],
    'ind_low': [],
    'ind_high': [],
    'groups': [],
    'unit_cells': []
}

# Expected shard files that were not found on disk. They are still skipped
# (best-effort loading), but reported below so a partial dataset is visible.
missing_files = []

# Load and process each file
for name in names:
    if not name.exists():
        missing_files.append(name)
        continue

    # SECURITY NOTE: allow_pickle=True unpickles arbitrary objects and can
    # execute code embedded in the file -- only load shards you trust.
    # The context manager closes the underlying NpzFile handle; the 'db'
    # array is read into memory before the archive is closed.
    with np.load(name, allow_pickle=True) as archive:
        data = archive['db']

    for entry in data:
        params = entry['structure_params']

        # Store arrays
        data_lists['I_high'].append(entry['I_high'])
        data_lists['ind_low'].append(entry['ind_low'])
        data_lists['ind_high'].append(entry['ind_high'])

        # Store group info
        data_lists['groups'].append(params['group'])

        # Store unit cell info
        unit_cell = {key: params[field] for key, field in UNIT_CELL_FIELDS.items()}
        data_lists['unit_cells'].append(unit_cell)

if missing_files:
    print(f"Warning: skipped {len(missing_files)} of {len(names)} expected file(s) that do not exist:")
    for missing in missing_files:
        print(f"  {missing}")

# Create DataFrame with shapes and metadata.
# Each "*_shape" column holds the first-axis length (arr.shape[0]) of the
# corresponding per-structure array; the remaining columns are copied from
# the per-structure unit-cell dicts.
shape_sources = [
    ('I_high_shape', 'I_high'),
    ('ind_low_shape', 'ind_low'),
    ('ind_high_shape', 'ind_high'),
]
cell_sources = [
    ('cell_a', 'a'),
    ('cell_b', 'b'),
    ('cell_c', 'c'),
    ('cell_alpha', 'alpha'),
    ('cell_beta', 'beta'),
    ('cell_gamma', 'gamma'),
    ('cell_volume', 'volume'),
    ('n_atoms', 'n_atoms'),
]

# Columns are inserted in the order they should appear in the frame.
metadata_columns = {'group': data_lists['groups']}
for column, key in shape_sources:
    metadata_columns[column] = [arr.shape[0] for arr in data_lists[key]]
for column, key in cell_sources:
    metadata_columns[column] = [uc[key] for uc in data_lists['unit_cells']]

df = pd.DataFrame(metadata_columns)

# Save DataFrame
#df.to_csv(base_path / 'metadata.csv', index=False)

# Create DataFrame of I_high values
#I_high_df = pd.DataFrame(data_lists['I_high'])
#I_high_df.to_csv(base_path / 'I_high.csv', index=False)

# Print basic statistics: total structure count, number of structures per
# group, and a describe() summary of the three first-axis length columns.
print("\nDataset Statistics:")
print(f"Total number of structures: {len(df)}")

group_counts = df['group'].value_counts()
print("\nGroup distribution:", group_counts, sep="\n")

shape_cols = ['I_high_shape', 'ind_low_shape', 'ind_high_shape']
shape_summary = df[shape_cols].describe()
print("\nShape statistics:", shape_summary, sep="\n")


Dataset Statistics:
Total number of structures: 100000

Group distribution:
group
P 1 21 1 (No. 4)    50027
C 1 2 1 (No. 5)     49973
Name: count, dtype: int64

Shape statistics:
        I_high_shape  ind_low_shape  ind_high_shape
count  100000.000000  100000.000000   100000.000000
mean     1522.415590     242.101230     1522.415590
std       484.613598      76.394441      484.613598
min       617.000000     103.000000      617.000000
25%      1117.000000     180.000000     1117.000000
50%      1498.000000     237.000000     1498.000000
75%      1885.000000     298.000000     1885.000000
max      2755.000000     441.000000     2755.000000


In [4]:
import os

# Make sure the output directory (and its parent) exists; exist_ok=True makes
# this cell safe to re-run. The original note describes this directory as
# being for real CSD data -- NOTE(review): confirm, the inputs above are read
# from 'clin_test_100k'.
output_data_dir = 'clin_test/data'
os.makedirs(output_data_dir, exist_ok=True)


In [5]:
# Quick sanity check: length of the 'ind_low' entry of the second structure.
second_ind_low = data_lists['ind_low'][1]
len(second_ind_low)

190

In [6]:
# Write one compressed archive per structure, named by its position in
# data_lists. Each archive stores three arrays under the keys
# 'Intensity', 'Ind_high' and 'Ind_low'.
# The three lists are filled together, one element per structure, so zipping
# them visits the same indices as looping over range(len(...)).
per_structure = zip(data_lists['I_high'], data_lists['ind_high'], data_lists['ind_low'])
for i, (intensity, ind_high, ind_low) in enumerate(per_structure):
    np.savez_compressed(
        f'clin_test/data/{i}',
        Intensity=intensity,
        Ind_high=ind_high,
        Ind_low=ind_low,
    )


In [8]:
# Build the index table that accompanies the saved archives: one row per
# structure with its archive file name, its group and its unit-cell dict.
n_saved = len(data_lists['I_high'])
filenames = [f'{i}.npz' for i in range(n_saved)]

# assign() sets the columns one by one on an empty frame, in this order.
df_save = pd.DataFrame().assign(
    filename=filenames,
    group=data_lists['groups'],
    unit_cell=data_lists['unit_cells'],
)
df_save

Unnamed: 0,filename,group,unit_cell
0,0.npz,C 1 2 1 (No. 5),"{'a': 7.636789330940987, 'b': 9.92782613022328..."
1,1.npz,C 1 2 1 (No. 5),"{'a': 8.023005134210308, 'b': 10.4299066744734..."
2,2.npz,P 1 21 1 (No. 4),"{'a': 5.507813484310308, 'b': 7.1601575296034,..."
3,3.npz,P 1 21 1 (No. 4),"{'a': 6.7303329632396425, 'b': 8.7494328522115..."
4,4.npz,P 1 21 1 (No. 4),"{'a': 6.115384004094226, 'b': 7.94999920532249..."
...,...,...,...
99995,99995.npz,C 1 2 1 (No. 5),"{'a': 8.994640050585636, 'b': 11.6930320657613..."
99996,99996.npz,C 1 2 1 (No. 5),"{'a': 8.312948544845717, 'b': 10.8068331082994..."
99997,99997.npz,P 1 21 1 (No. 4),"{'a': 6.617236832795079, 'b': 8.60240788263360..."
99998,99998.npz,P 1 21 1 (No. 4),"{'a': 6.099421320005669, 'b': 7.92924771600736..."


In [9]:
# Sanity check before writing: the 'group' column must match the in-memory
# group list element for element. A single list comparison replaces the
# per-row Series lookups (one Python-level indexing call per structure) and
# also catches a length mismatch.
assert df_save['group'].tolist() == list(data_lists['groups']), \
    "df_save['group'] is out of sync with data_lists['groups']"

# Write the index table next to the data directory. The unit_cell dicts are
# written as their string representation in the CSV.
df_save.to_csv('clin_test/test.csv', index=False)
