# converting signals into images, bin files(.npz)

このノートブックではディレクトリを指定して、その中にある生波形データ(.mat)を、一括して画像(.png)及びその値（.npz）に変換し保存するという処理を行っています。

### 初期セットアップ
パスの関係を以下に定義します。

In [2]:
import os 
import yaml
import numpy as np
# Load configuration from YAML file
with open("config/config.yaml", 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

visualize_dir = config['visualize']['dir']

base_dir_sim = config['simulation']['base_data_dir']
processed_dir_sim = os.path.join(base_dir_sim, "processed")
dataset_dir = os.path.join(base_dir_sim, "dataset")

base_dir_exp = config['experiment']['input']['base_data_dir']
rawsignal_dir_exp = os.path.join(base_dir_exp, "rawsignal")
rawsignal_dir_exp_solid_liquid = os.path.join(rawsignal_dir_exp, "solid_liquid")
processed_dir_exp_all = os.path.join(config['experiment']['input']['base_data_dir'], "processed", "all")

## シミュレーションデータ変換  
 シミュレーションで生成した`.mat`のファイルを統一形式である`.npz`に変換します。以下は、フォルダを指定するとその配下のファイルをすべて一括で変換し、`/processed`ディレクトリに保存するという処理を行うものです。

In [3]:
from src import mat2npz_sim,npz2png

import glob

case_name = "case1"


def convert_all_simulation_mat_to_npz(case_name, base_data_dir):
    """
    Convert all simulation .mat files in the specified case directory to .npz format.

    Parameters
    ----------
    case_name : str
        The name of the simulation case (e.g., "case5").
    base_data_dir : str
        The base directory where simulation data is stored.
    """
    # Define input directory for raw simulation signals (relative to base_data_dir and case_name)
    mat_dir = os.path.join(base_data_dir, f"rawsignal/{case_name}/data")

    # Define config file path (relative to base_data_dir and case_name)
    config_path = os.path.join(base_data_dir, f"rawsignal/{case_name}/config.json")

    # Define output directory for processed files (relative to base_data_dir and case_name)
    output_dir = os.path.join(base_data_dir, f"processed/{case_name}")
    # Create the output directory if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")
    # Save a copy of the config.json file to the output directory for reference
    import shutil
    config_copy_path = os.path.join(output_dir, "config.json")
    shutil.copy2(config_path, config_copy_path)
    print(f"Copied config.json to: {config_copy_path}")

    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)
    mat_files_list = glob.glob(os.path.join(mat_dir, "*.mat"))
    for mat_file in mat_files_list:
        print(f"Processing: {mat_file}")
        mat2npz_sim(mat_file, config_path, output_dir)

# 関数の呼び出し例
convert_all_simulation_mat_to_npz(case_name, base_dir_sim)


npz2png(file_path=os.path.join(processed_dir_sim, "case26/solid_liquid_reflector3_processed.npz"),save_path=visualize_dir,full=False,pulse_index=0)
npz2png(file_path=os.path.join(processed_dir_sim, "case9/solid_liquid_reflector3_processed.npz"),save_path=visualize_dir,full=True,pulse_index=0)

Copied config.json to: /home/smatsubara/documents/airlift/data/simulation/processed/case1/config.json
Processing: /home/smatsubara/documents/airlift/data/simulation/rawsignal/case1/data/solid_liquid_reflector1.mat
<KeysViewHDF5 ['#refs#', '#subsystem#', 'kgrid', 'sensor_data']>
['Nt', 'Nx', 'Ny', 'Nz', 'dim', 'dt', 'dx', 'dxudxn', 'dxudxn_sgx', 'dy', 'dyudyn', 'dyudyn_sgy', 'dz', 'dzudzn', 'dzudzn_sgz', 'k', 'k_max', 'kx_max', 'kx_vec', 'ky_max', 'ky_vec', 'kz_max', 'kz_vec', 'nonuniform', 'xn_vec', 'xn_vec_sgx', 'yn_vec', 'yn_vec_sgy', 'zn_vec', 'zn_vec_sgz']
999999999.9999999
keys: ['#refs#', '#subsystem#', 'kgrid', 'sensor_data']
['#refs#', '#subsystem#', 'kgrid', 'sensor_data']
(50000,)
Processed data and metadata saved to: /home/smatsubara/documents/airlift/data/simulation/processed/case1/solid_liquid_reflector1_processed.npz
Processing: /home/smatsubara/documents/airlift/data/simulation/rawsignal/case1/data/solid_liquid_reflector10.mat
<KeysViewHDF5 ['#refs#', '#subsystem#', 'kgr

## 機械学習用データセット生成（シミュレーション）
　次に、変換した`.npz`のファイルに対応する目標変数となる値を`/config.json`を使って計算し、データセットとなる`x_train.npy`,`t_train.npy`を作成していきます。これらのiDの紐づけが狂うとすべての計算の意味がなくなってしまうので、最大限注意してください。  
また、実機への展開をスムーズにするために、最大値を用いてスケーリングしていることに注意して下さい。その他順序付けなど筆者は細心の注意を払って実装していますが、もし誤りがあればご指摘いただけると幸いです。


In [4]:
from src import calculate_gvf_and_signal,npz2png,process_case_and_return_dataset



x_list = []
t_list = []

# Get all case directories (e.g., case5, case6, ...)
case_dirs = sorted([d for d in os.listdir(processed_dir_sim) if os.path.isdir(os.path.join(processed_dir_sim, d)) and d.startswith("case")])

x_train_list = []
t_train_list = []

for case_name in case_dirs:
    base_dir = os.path.join(processed_dir_sim, case_name)
    print(f"Processing {case_name} in {base_dir}")
    x_tmp, t_tmp = process_case_and_return_dataset(case_name, base_dir)
    print(f"x_tmp shape: {x_tmp.shape}, t_tmp shape: {t_tmp.shape}")
    x_train_list.append(x_tmp)
    t_train_list.append(t_tmp)
print("list done")
# Concatenate all cases into single arrays
x_train = np.concatenate(x_train_list, axis=0)
t_train = np.concatenate(t_train_list, axis=0)
x_train = x_train/np.max(x_train)
x_train = np.log1p(x_train)
# x_train = x_train/np.max(x_train)
print("Final x_train shape:", x_train.shape)
print("Final t_train shape:", t_train.shape)
print(np.max(x_train),np.min(x_train))
np.save(os.path.join(dataset_dir, "x_train.npy"), x_train)
np.save(os.path.join(dataset_dir, "t_train.npy"), t_train)
#print(np.max(x_train))
#print(t_train)
#print(x_train)
npz_file_path = os.path.join(processed_dir_sim, "case5/solid_liquid4_processed.npz")
output_folder_path = config['output']['results_other_dir']
npz2png(npz_file_path, output_folder_path, channel_index=0, start_time=0.0, end_time=None, full=True, pulse_index=0)
npz2png(npz_file_path, output_folder_path, channel_index=0, start_time=0.0, end_time=None, full=False, pulse_index=0)


Processing case1 in /home/smatsubara/documents/airlift/data/simulation/processed/case1
['/home/smatsubara/documents/airlift/data/simulation/processed/case1/solid_liquid_reflector10_processed.npz', '/home/smatsubara/documents/airlift/data/simulation/processed/case1/solid_liquid_reflector1_processed.npz', '/home/smatsubara/documents/airlift/data/simulation/processed/case1/solid_liquid_reflector2_processed.npz', '/home/smatsubara/documents/airlift/data/simulation/processed/case1/solid_liquid_reflector3_processed.npz']
22233.767230882375 0.030785031114614694
x_tmp shape: (4, 2500), t_tmp shape: (4,)
Processing case10 in /home/smatsubara/documents/airlift/data/simulation/processed/case10
['/home/smatsubara/documents/airlift/data/simulation/processed/case10/solid_liquid_reflector10_processed.npz', '/home/smatsubara/documents/airlift/data/simulation/processed/case10/solid_liquid_reflector1_processed.npz', '/home/smatsubara/documents/airlift/data/simulation/processed/case10/solid_liquid_reflec

## チェック
ケース毎の固相体積率を計算するコードです。

In [5]:
from src import calculate_gvf_and_signal
import yaml
import os

# Load configuration from YAML file
with open("config/config.yaml", 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

config_path = os.path.join(processed_dir_sim, "case30/config.json")
npz_path = os.path.join(processed_dir_sim, "case4/solid_liquid4_processed.npz")
input_tmp, target_tmp = calculate_gvf_and_signal(config_path, npz_path)
print(input_tmp.shape,target_tmp)

(2500,) 0.17121411352180582


## 実機データ変換  
実機データの`.mat`のファイルも、先ほどと同様に統一形式である`.npz`に変換します。同じく、`experiments/processed`に保存されるようにしています。下のものは、サンプルを一つだけ変換するものです。

In [7]:
from src import mat2npz_exp,npz2png

# Load configuration from YAML file
with open("config/config.yaml", 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

file_path = os.path.join(config['experiment']['input']['base_data_dir'],"rawsignal/solid_liquid/P20241007-1419.mat")

mat2npz_exp(
        file_path=file_path,
        output_dir=processed_dir_exp_all,
        start_time=0.1,  #初期の信号は不安定であることが多いため除外
        duration=5.0,
        amplitude_threshold=2,
        window_width=0.1e-3,
        signal_key="TDX1"
    )
npz2png(file_path=os.path.join(config['experiment']['input']['base_data_dir'], "processed", "all", "P20240726-1601_processed.npz"),save_path=processed_dir_exp_all,full=False,pulse_index=0,channel_index=1)

Loading data...
Loading successful
Using device: cuda:1
Number of detected triggers: (14700,)
arranged_pulses.shape: (14700, 5208, 4)
convert_exp finished
max: inf
processed_data.shape: (14000, 2500, 4)
max: inf
processed_data[0,:,0].shape: (2500,)
max: 1.091976523399353
argmax: 326
maxes argmax: 3,max: 3.4028234663852886e+38
(14000, 1, 4) 0.06849315 3.4028235e+38
scaled: ((14000, 2500, 4), -3.4028235e+38, 3.4028235e+38)
max_value: 3.4028234663852886e+38
['__header__', '__version__', '__globals__', 'Tstart', 'Tinterval', 'ExtraSamples', 'RequestedLength', 'Length', 'Version', 'TDX1', 'TDX2', 'TDX3', 'TDX1_enlarged']
signal points: (2500,)
Processed data and metadata saved to: /home/smatsubara/documents/airlift/data/experiments/processed/all/P20241007-1419_processed.npz
/home/smatsubara/documents/airlift/data/experiments/processed/all/P20240726-1601_processed_1pulse0.png


## 複数を一括で変換するスクリプト

In [None]:
from src import mat2npz_exp,npz2png
import os
import glob
import yaml

# Load configuration from YAML file
with open("config/config.yaml", 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Get all .mat files in the rawsignal directory

mat_files = glob.glob(os.path.join(rawsignal_dir_exp_solid_liquid, "*.mat"))


for file_path in mat_files:
    print(f"Processing {file_path}")
    mat2npz_exp(
        file_path=file_path,
        output_dir=processed_dir_exp_all,
        start_time=0.1,  #初期の信号は不安定であることが多いため除外
        duration=5.0,
        amplitude_threshold=2,
        window_width=0.1e-3,
        signal_key="TDX1"
    )


## 機械学習用データセット生成(実機)  
次に、変換した`.npz`のファイルに対応する目標変数となる値を`/target_variables.csv`を使って参照し、データセット`x_test.npy` `t_test.npy`を作成していきます。
なお、ここではただ入力と出力のペアを作成しただけであり、外れ値除去などは実装していません。それらは、機械学習用レポジトリ`https://github.com/apetrasc/ml_airlift`に記載します。

In [None]:
import polars as pl
import yaml
from scipy.signal import resample_poly
with open("config/config.yaml", 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
df =pl.read_csv(config['experiment']['target'],encoding="shift_jis")


# "ガラス球直径"列の右隣に、"P2024{日付}{時分}_processed.npz"という名前列を追加
# コピーを作成し、result_dirに保存

# まず既存のdfをコピー
df_copy = df.clone()

# 新しいカラム名
new_col_name = "ファイル名"

# "P2024" + 日付 + 時分 + "_processed.npz"を作成
df_copy = df_copy.with_columns(
    (pl.lit("P") + df_copy["日付"].cast(pl.Utf8) + "-" + df_copy["時分"].cast(pl.Utf8) + "_processed.npz").alias(new_col_name)
)

# "ガラス球直径"列のすぐ右隣りに挿入
glass_col_idx = df_copy.columns.index("ガラス球直径")
cols = df_copy.columns.copy()
cols.insert(glass_col_idx + 1, cols.pop(-1))  # 新しいカラム(最後にある)を目的位置へ

df_final = df_copy.select(cols)

# 保存
result_dir = config['output']['results_dir']
save_path = result_dir.rstrip("/") + "/target_valiables_with_filename.csv"
df_final.write_csv(save_path)

print(f"saved: {save_path}")

# 再読込（または前の df_final を使用）
df_work = df_final.clone()

# "FullPath" を作成
fullpath_col = "FullPath"
df_work = df_work.with_columns(
    (pl.lit(processed_dir_exp_all.rstrip("/") + "/") + df_work["ファイル名"]).alias(fullpath_col)
)

# "ガラス球直径" のインデックス取得し、2つ右隣に "FullPath" を移動
glass_idx = df_work.columns.index("ガラス球直径")
# "ファイル名"は1つ右、その隣=2つ右になるよう "FullPath" を移動
cols = df_work.columns.copy()
# 一旦 "FullPath" をpop
cols.pop(cols.index(fullpath_col))
cols.insert(glass_idx + 2, fullpath_col)
df_final2 = df_work.select(cols)

# 保存
save_path2 = result_dir.rstrip("/") + "/target_valiables_with_fullpath.csv"
df_final2.write_csv(save_path2)
print(f"saved: {save_path2}")

df_final2.head()



# Load configuration from YAML file
with open("config/config.yaml", 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

df = pl.read_csv(os.path.join(config['output']['results_dir'], 'target_valiables_with_fullpath.csv'), encoding='utf-8')
print(f"CSV rows: {len(df)}")

# CSVファイルの行数だけ処理する
file_paths = df_final2["FullPath"].to_list()
print(f"Total CSV rows to process: {len(file_paths)}")

processed_data_list = []
targets_list = []
processed_dir_exp = os.path.join(config['experiment']['input']['base_data_dir'], "processed")

for i, p in enumerate(file_paths):
    # ファイルの存在確認
    if not os.path.exists(p):
        print(f"[SKIP {i+1}/{len(file_paths)}] File not found: {p}")
        # 両方のリストにNoneを追加して同期を保つ
        processed_data_list.append(None)
        targets_list.append(None)
        continue
    
    # パスが正しいディレクトリ内にあるか確認
    if not p.startswith(processed_dir_exp):
        print(f"[SKIP {i+1}/{len(file_paths)}] Path not in expected directory: {p}")
        processed_data_list.append(None)
        targets_list.append(None)
        continue
    
    try:
        data_npz = np.load(p)
    except Exception as e:
        print(f"[ERROR {i+1}/{len(file_paths)}] Failed to load: {p} ({e})")
        processed_data_list.append(None)
        targets_list.append(None)
        continue

    if 'processed_data' not in data_npz:
        print(f"[SKIP {i+1}/{len(file_paths)}] 'processed_data' key not found in: {p}")
        processed_data_list.append(None)
        targets_list.append(None)
        continue

    arr = data_npz['processed_data']
    print(f"[{i+1}/{len(file_paths)}] {p}: original shape: {arr.shape}")
    
    # arrは(14000, 2500, 4)のshapeになっている想定
    # 14000 (時間) を 1/10 の 1400 にダウンサンプリングしたい（時間軸はaxis=0）
    target_length = 1400
    original_length = arr.shape[0]
    up = target_length
    down = original_length

    if arr.shape[0] != target_length:
        # arr の shape: (T, W, C) -> 軸入れ替えず、そのまま axis=0 でダウンサンプリング
        # 例えば (14000, 2500, 4) → (1400, 2500, 4)
        arr_ds = resample_poly(arr, up, down, axis=0)
        print(f"    downsampled shape: {arr_ds.shape}")
        arr = arr_ds
    else:
        print("    no downsampling necessary.")

    # 軸が (T, W, C)=(1400, 2500, 4) なので (C, T, W)に変形
    if arr.ndim == 3:
        arr_T = np.transpose(arr, (2, 0, 1))
        print(f"    transposed shape: {arr_T.shape}")
        
        # ターゲット変数を先に取得（エラーが発生した場合に備えて）
        try:
            row = df_final2.row(i)
            targets = np.array([
                float(row[df_final2.columns.index("固相見かけ流速")]),
                float(row[df_final2.columns.index("気相見かけ流速")]),
                float(row[df_final2.columns.index("液相見かけ流速")]),
                float(row[df_final2.columns.index("固相体積率")]),
                float(row[df_final2.columns.index("気相体積率")]),
                float(row[df_final2.columns.index("液相体積率")])
            ])
            # 両方のリストに同時に追加（同期を保つ）
            processed_data_list.append(arr_T)
            targets_list.append(targets)
        except Exception as e:
            print(f"    [ERROR] Failed to load targets for row {i}: {e}")
            # エラー時も両方にNoneを追加して同期を保つ
            processed_data_list.append(None)
            targets_list.append(None)
            continue
    else:
        print(f"    [SKIP] processed_data is not 3D (got shape {arr.shape}), skipping")
        processed_data_list.append(None)
        targets_list.append(None)

# Noneを取り除き、対応するインデックスで処理（同期を保つ）
valid_indices = [i for i, data in enumerate(processed_data_list) if data is not None]
processed_data_valids = [processed_data_list[i] for i in valid_indices]
targets_valids = [targets_list[i] for i in valid_indices]

print(f"\n[SUMMARY]")
print(f"Total CSV rows: {len(file_paths)}")
print(f"Successfully processed: {len(processed_data_valids)}")
print(f"Failed/Skipped: {len(file_paths) - len(processed_data_valids)}")
print(f"processed_data_list length: {len(processed_data_list)}")
print(f"targets_list length: {len(targets_list)}")
print(f"Valid data count: {len(processed_data_valids)}")
print(f"Valid targets count: {len(targets_valids)}")

if len(processed_data_valids) == 0:
    print("No valid data could be loaded. combined_data will be None")
    combined_data = None
else:
    combined_data = np.stack(processed_data_valids, axis=0)
    print(f"Final shape: {combined_data.shape}")

x_train_real = combined_data
t_train_real = np.stack(targets_valids, axis=0)

print(f"\nx_train_real shape: {x_train_real.shape}")
print(f"t_train_real shape: {t_train_real.shape}")

# 形状の一致を確認
if x_train_real.shape[0] != t_train_real.shape[0]:
    print(f"\n[WARNING] Shape mismatch detected!")
    print(f"  x_train_real samples: {x_train_real.shape[0]}")
    print(f"  t_train_real samples: {t_train_real.shape[0]}")
    print(f"  Difference: {x_train_real.shape[0] - t_train_real.shape[0]}")
else:
    print(f"\n[OK] Shapes match: {x_train_real.shape[0]} samples")



In [10]:
dataset_dir_exp = os.path.join(config['experiment']['input']['base_data_dir'], "dataset")
print(f"x_train_real shape: {x_train_real.shape}")
print(f"t_train_real shape: {t_train_real.shape}")
np.save(os.path.join(dataset_dir_exp, "x_train_real.npy"), x_train_real)
np.save(os.path.join(dataset_dir_exp, "t_train_real.npy"), t_train_real)

x_train_real shape: (337, 4, 1400, 2500)
t_train_real shape: (337, 6)
