In [1]:
import os
import shlex

import numpy as np
import pandas as pd
from fairseq_signals.utils.store import MemmapReader

# Root of the local fairseq-signals checkout; interpolated into shell commands further down.
fairseq_signals_root = '/home/aa2650/playground/fairseq-signals'




In [None]:
# Build the subset of records.csv:
#   * every record positive for at least one of RBBB, LBBB, SB, ST, AF, plus
#   * at most 9500 randomly sampled records flagged normal_ecg and none of those classes.
df = pd.read_csv('/home/aa2650/datasets/code_15/records.csv')

# Row mask: positive for any of the five target classes.
# Assumes these columns are boolean (they are combined with | and negated with ~) — TODO confirm.
conditions = (df['RBBB'] | df['LBBB'] | df['SB'] | df['ST'] | df['AF'])

# Records belonging to at least one target class.
df_classes = df[conditions]

# Records labelled normal_ecg that belong to none of the target classes.
df_normal_only = df[(df['normal_ecg']) & (~conditions)]

# Cap the normal-only group at 9500 rows; random_state pins the draw so re-runs pick the same rows.
df_normal_sampled = df_normal_only.sample(n=min(9500, len(df_normal_only)), random_state=42)

# Class rows first, sampled normal rows after, with a fresh 0..N-1 index.
filtered_df = pd.concat([df_classes, df_normal_sampled], ignore_index=True)

# This is the first write into subset/ and nothing earlier in the notebook creates
# the directory, so make sure it exists before saving.
os.makedirs('/home/aa2650/datasets/code_15/subset', exist_ok=True)
filtered_df.to_csv('/home/aa2650/datasets/code_15/subset/records.csv', index=False)

print(f"Filtered dataset contains {len(filtered_df)} entries.")


Filtered dataset contains 43079 entries.


In [17]:
# Build the subset of meta.csv with the same rule used for records.csv:
# keep every row positive for RBBB, LBBB, SB, ST or AF, plus at most 9500
# randomly sampled rows that are normal_ecg only.
meta_df = pd.read_csv('/home/aa2650/datasets/code_15/inframundo/meta.csv')

# Row mask: positive for any of the five target classes.
conditions = (meta_df['RBBB'] | meta_df['LBBB'] | meta_df['SB'] | meta_df['ST'] | meta_df['AF'])

# Rows belonging to at least one target class.
meta_df_classes = meta_df[conditions]

# Rows labelled normal_ecg that belong to none of the target classes.
meta_df_normal_only = meta_df[(meta_df['normal_ecg']) & (~conditions)]

# Cap the normal-only group at 9500 rows. The cap is computed from THIS frame;
# the previous version used len(df_normal_only), a variable from the records.csv
# cell, which fails on a fresh kernel and is wrong whenever the two frames differ.
# NOTE(review): records.csv and meta.csv are sampled independently; the same
# random_state only selects the same recordings if both normal-only frames have
# identical length and row order — verify.
meta_df_normal_sampled = meta_df_normal_only.sample(
    n=min(9500, len(meta_df_normal_only)),
    random_state=42,
)

# Class rows first, sampled normal rows after, with a fresh 0..N-1 index.
meta_filtered_df = pd.concat([meta_df_classes, meta_df_normal_sampled], ignore_index=True)

# Save the filtered metadata.
meta_filtered_df.to_csv('/home/aa2650/datasets/code_15/subset/meta.csv', index=False)

print(f"Filtered dataset contains {len(meta_filtered_df)} entries.")


Filtered dataset contains 43079 entries.


In [18]:
# Renumber the 'idx' column of the filtered metadata as 0, 1, ..., N-1
# (the frame is modified in place).
subset_size = len(meta_filtered_df)
meta_filtered_df['idx'] = range(subset_size)

# Overwrite subset/meta.csv with the renumbered frame.
meta_filtered_df.to_csv('/home/aa2650/datasets/code_15/subset/meta.csv', index=False)
print("'idx' column reset successfully")

'idx' column reset successfully


In [None]:
# Restrict segmented.csv to the recordings kept in the subset.
# Loads the full segmented.csv and the subset meta.csv (note: despite its name,
# meta_split_df holds subset/meta.csv, not a meta_split.csv file).
segmented_df = pd.read_csv('/home/aa2650/datasets/code_15/inframundo/segmented.csv')
meta_split_df = pd.read_csv('/home/aa2650/datasets/code_15/subset/meta.csv')

# Keep only the segment rows whose save_file value also appears in the subset meta.csv
filtered_segmented_df = segmented_df[segmented_df['save_file'].isin(meta_split_df['save_file'])]

# Save the filtered segmented dataset and report how many segment rows remain
filtered_segmented_df.to_csv('/home/aa2650/datasets/code_15/subset/segmented.csv', index=False)
print(f"Filtered segmented.csv contains {len(filtered_segmented_df)} entries.")

# MANUAL STEP (not automated by this notebook): edit the save_file values in
# subset/segmented.csv so they point to:
# /home/aa2650/datasets/code_15/subset/segmented/

Filtered segmented.csv contains 58984 entries.


In [63]:
# Extract the rows of segmented_split.csv whose 'split' column equals 'test'
# and write them to their own CSV.
# NOTE(review): segmented_split.csv is presumably produced by the splits.py cell
# further down, so that cell has to run before this one — confirm.
segmented_split_df = pd.read_csv('/home/aa2650/datasets/code_15/subset/segmented_split.csv')

is_test_row = segmented_split_df['split'].eq('test')
segmented_split_df_test_only = segmented_split_df.loc[is_test_row]

segmented_split_df_test_only.to_csv(
    '/home/aa2650/datasets/code_15/subset/test_segmented_split.csv',
    index=False,
)

print(f"✅ Saved {len(segmented_split_df_test_only)} with split rows test")


✅ Saved 5873 with split rows test


In [36]:
# Sanity-check the class balance of the filtered records and store it as label_def.csv.
# For each label column: number of positive rows and that number divided by the
# total row count (a fraction in [0, 1], despite the 'percent' column name).
conditions_to_count = ['RBBB', 'LBBB', 'SB', 'ST', 'AF', 'normal_ecg']
total = len(filtered_df)

positives_per_label = {label: filtered_df[label].sum() for label in conditions_to_count}
counts = [
    (label, n_positive, n_positive / total)
    for label, n_positive in positives_per_label.items()
]

# One row per label, in the order listed above.
count_df = pd.DataFrame(counts, columns=['name', 'pos_count_all', 'pos_percent_all'])
count_df.to_csv('/home/aa2650/datasets/code_15/subset/label_def.csv', index=False)
print(count_df)


         name  pos_count_all  pos_percent_all
0        RBBB           9672         0.224518
1        LBBB           6026         0.139883
2          SB           5605         0.130110
3          ST           7584         0.176049
4          AF           7033         0.163258
5  normal_ecg           9500         0.220525


In [24]:
# Run fairseq-signals' code_15_labels.py on the subset to produce labels.csv.
#
# * The checkout location comes from `fairseq_signals_root` instead of a second
#   hard-coded copy of the path.
# * `cd ... &&` makes the script run only when the directory change succeeded;
#   previously a failed `cd` still launched python from the wrong directory.
# * Each trailing backslash is consumed by Python (line continuation inside the
#   string literal), so the shell receives the whole invocation on one line.
labels_cmd = f"""
cd {fairseq_signals_root}/scripts/preprocess/ecg && \
python code_15_labels.py \
    --processed_root "/home/aa2650/datasets/code_15/subset" \
    --labels_path "/home/aa2650/datasets/code_15/subset/labels.csv"
"""

# Last expression of the cell, so the exit status is displayed (0 = success).
os.system(labels_cmd)

0

In [None]:
# Remove the 'is_male' and '1dAVb' columns from the subset labels.csv.
# The file is overwritten in place.
csv_path = '/home/aa2650/datasets/code_15/subset/labels.csv'
labels_df = pd.read_csv(csv_path)

# errors='ignore' keeps the cell re-runnable: because the input file is
# overwritten below, a second run finds the columns already gone, which
# previously raised KeyError.
labels_df_cleaned = labels_df.drop(columns=['is_male', '1dAVb'], errors='ignore')

# Overwrite the same labels.csv that was read above.
labels_df_cleaned.to_csv(csv_path, index=False)

print("Columns 'is_male' and '1dAVb' removed")


In [38]:
# Run fairseq-signals' splits.py to assign the subset to train/valid/test with a
# random 80/10/10 split.
#
# * The checkout location comes from `fairseq_signals_root` instead of a second
#   hard-coded copy of the path.
# * `cd ... &&` makes the script run only when the directory change succeeded;
#   previously a failed `cd` still launched python from the wrong directory.
# * Each trailing backslash is consumed by Python (line continuation inside the
#   string literal); the last argument no longer ends with a dangling one.
split_cmd = f"""
cd {fairseq_signals_root}/scripts/preprocess && \
python splits.py \
    --strategy "random" \
    --processed_root "/home/aa2650/datasets/code_15/subset" \
    --meta_file "/home/aa2650/datasets/code_15/subset/meta.csv" \
    --segmented_file "/home/aa2650/datasets/code_15/subset/segmented.csv" \
    --fractions "0.80,0.10,0.10" \
    --split_labels "train,valid,test"
"""

# Last expression of the cell, so the exit status is displayed (0 = success).
os.system(split_cmd)



0

In [39]:
# Run fairseq-signals' manifests.py on segmented_split.csv, writing its output
# into subset/manifests.
# `cd ... &&` runs the script only if the directory change succeeded; each
# trailing backslash is a Python line continuation inside the string literal,
# so the shell receives the command on a single line.
# NOTE(review): the variable name says "test", but the command is given the full
# segmented_split.csv, not a test-only file — confirm which splits are intended.
generate_test_tsv = f"""
cd {fairseq_signals_root}/scripts/preprocess && \
python manifests.py \
    --split_file_paths "/home/aa2650/datasets/code_15/subset/segmented_split.csv" \
    --save_dir "/home/aa2650/datasets/code_15/subset/manifests"
"""
os.system(generate_test_tsv)


0

In [4]:
# Local checkouts used by the fine-tuning cells.
root = '/home/aa2650/playground/ECG-FM'
# Trailing slashes are stripped so the value can be joined with '/...' safely.
FAIRSEQ_SIGNALS_ROOT = '/home/aa2650/playground/fairseq-signals'.rstrip('/')

In [2]:
# Split percentages; they only feed the experiment directory name below.
train, validation, test = 80, 10, 10
data_split = f"{train}-{validation}-{test}"

# Inputs of the fine-tuning run.
PRETRAINED_MODEL = '/home/aa2650/playground/ECG-FM/ckpts/mimic_iv_ecg_physionet_pretrained.pt'
MANIFEST_DIR = "/home/aa2650/datasets/code_15/subset/manifests"
LABEL_DIR = "/home/aa2650/datasets/code_15/subset"

# Number of label columns, hard-coded. Shell-style alternatives kept for reference:
# NUM_LABELS=$(($(wc -l < "$/home/aa2650/playground/ECG-FM/data/code_15/labels/label_def.csv") - 1))
# POS_WEIGHT=$(cat $LABEL_DIR/pos_weight.txt)
NUM_LABELS = 6

# Checkpoint/log directory (passed as checkpoint.save_dir); created up front so
# the training log can be written into it.
OUTPUT_DIR = f'/home/aa2650/playground/ECG-FM/experiments/subset/{data_split}/validation'
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [6]:
# Launch fine-tuning with fairseq-hydra-train in the background under nohup.
#
# The command is assembled from an argument list and every piece is passed
# through shlex.quote. The previous version pasted a multi-line string inside
# `bash -lc "..."`: the inner "valid" quotes closed the outer quoting (it only
# worked by accident), and `[1e-06]` reached the inner shell unquoted, where it
# is a glob pattern.
# NOTE(review): the run reads {LABEL_DIR}/subset_y.npy, which is written by a
# cell that appears later in this notebook — make sure it exists before launching.
finetune_args = [
    "fairseq-hydra-train",
    f"task.data={MANIFEST_DIR}",
    f"model.model_path={PRETRAINED_MODEL}",
    # Taken from the config cell instead of repeating the literal 6.
    f"model.num_labels={NUM_LABELS}",
    "optimization.lr=[1e-06]",
    "optimization.max_epoch=100",
    "dataset.batch_size=128",
    "dataset.num_workers=5",
    "dataset.disable_validation=false",
    "dataset.valid_subset=valid",
    "distributed_training.distributed_world_size=1",
    "distributed_training.find_unused_parameters=True",
    f"checkpoint.save_dir={OUTPUT_DIR}",
    "checkpoint.save_interval=1",
    "checkpoint.keep_last_epochs=0",
    "common.log_format=csv",
    "common.memory_efficient_fp16=True",
    f"+task.label_file={LABEL_DIR}/subset_y.npy",
    "--config-dir",
    f"{FAIRSEQ_SIGNALS_ROOT}/examples/w2v_cmsc/config/finetuning/ecg_transformer",
    "--config-name",
    "diagnosis",
]
finetune_cmd = "export HYDRA_FULL_ERROR=1 && " + " ".join(
    shlex.quote(arg) for arg in finetune_args
)

# Wrap with nohup; stdout and stderr both go to train.log. Quoting the whole
# command once more keeps it a single argument to `bash -lc`.
train_log = f"{OUTPUT_DIR}/train.log"
nohup_cmd = f"nohup bash -lc {shlex.quote(finetune_cmd)} > {shlex.quote(train_log)} 2>&1 &"

# Launch it; the trailing `&` returns control immediately.
os.system(nohup_cmd)
print(f"Launched training under nohup → logs at {OUTPUT_DIR}/train.log")


Launched training under nohup → logs at /home/aa2650/playground/ECG-FM/experiments/subset/80-10-10/validation/train.log


In [54]:
# Convert the subset labels.csv into the float64 label matrix subset_y.npy
# (one row per labels.csv row, one column per label column).
csv_path = "/home/aa2650/datasets/code_15/subset/labels.csv"
output_path = "/home/aa2650/datasets/code_15/subset/subset_y.npy"

# Kept under its own name so the records `df` from the filtering cell is not clobbered.
labels_with_idx = pd.read_csv(csv_path)

# Drop 'idx' by name rather than by position: if the column order of labels.csv
# ever changes, slicing columns[1:] would silently drop a label and keep 'idx'.
labels_df = labels_with_idx.drop(columns=['idx'])

y_array = labels_df.astype(np.float64).to_numpy()
print(y_array)
np.save(output_path, y_array)

# Report the file that was actually written (the old message said "y.npy").
print(f"Saved {output_path} with shape {y_array.shape} and dtype {y_array.dtype}")


[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]]
Saved y.npy with shape (43079, 6) and dtype float64


In [62]:
# Generate test_labels.csv: the rows of labels.csv whose 'idx' belongs to the
# 'test' split according to meta_split.csv.
# (pandas is imported once in the first cell; the duplicate import was removed.)

# Load CSV files
labels_df = pd.read_csv('/home/aa2650/datasets/code_15/subset/labels.csv')
meta_df = pd.read_csv('/home/aa2650/datasets/code_15/subset/meta_split.csv')

# idx values assigned to the test split. The old name `train_valid_idxs` and its
# comment described the opposite of what the filter does.
test_idxs = meta_df.loc[meta_df['split'] == 'test', 'idx']

# Keep only the label rows that belong to the test split
filtered_labels_df = labels_df[labels_df['idx'].isin(test_idxs)]

# Save the test labels
test_labels_path = '/home/aa2650/datasets/code_15/subset/test_labels.csv'
filtered_labels_df.to_csv(test_labels_path, index=False)

# The old message named 'labels_filtered.csv', a file this cell never writes.
print(f"✅ Removed {len(labels_df) - len(filtered_labels_df)} rows. Saved to '{test_labels_path}'.")
print(filtered_labels_df.shape)


✅ Removed 38771 rows. Saved to 'labels_filtered.csv'.
(4308, 7)


In [None]:
# import pandas as pd
# import scipy.io
# import os

# # Paths
# csv_path = '/home/aa2650/datasets/code_15/subset/segmented_split.csv'  # Update with the actual path if needed
# output_dir = '/home/aa2650/datasets/code_15/subset/segmented'

# # Load CSV
# df = pd.read_csv(csv_path)

# # Iterate through each row
# for index, row in df.iterrows():
#     mat_path = row['path']
#     new_idx = int(row['idx'])

#     # Load .mat file
#     try:
#         mat_data = scipy.io.loadmat(mat_path)
#     except Exception as e:
#         print(f"❌ Failed to load {mat_path}: {e}")
#         continue

#     # Update 'idx' value
#     mat_data['idx'] = [[new_idx]]

#     # Get the original filename
#     mat_filename = os.path.basename(mat_path)

#     # Build the output path
#     output_path = os.path.join(output_dir, mat_filename)

#     # Save updated .mat file
#     try:
#         scipy.io.savemat(output_path, mat_data)
#         print(f"✅ Saved updated .mat to: {output_path}")
#     except Exception as e:
#         print(f"❌ Failed to save {output_path}: {e}")


✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subset/segmented/code_15_1000182_0.mat
✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subset/segmented/code_15_1000220_0.mat
✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subset/segmented/code_15_1000229_0.mat
✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subset/segmented/code_15_1000229_1.mat
✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subset/segmented/code_15_1000253_0.mat
✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subset/segmented/code_15_1000263_0.mat
✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subset/segmented/code_15_1000270_0.mat
✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subset/segmented/code_15_1000270_1.mat
✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subset/segmented/code_15_1000285_0.mat
✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subset/segmented/code_15_1000285_1.mat
✅ Saved updated .mat to: /home/aa2650/datasets/code_15/subse

In [None]:
# Expand the test labels so there is one label row per test segment:
# each labels row is repeated as many times as its 'idx' occurs in
# test_segmented_split.csv.

# Load both CSV files
labels_df = pd.read_csv("/home/aa2650/datasets/code_15/subset/test_labels.csv")
segments_df = pd.read_csv("/home/aa2650/datasets/code_15/subset/test_segmented_split.csv")

# Number of segments per idx
idx_counts = segments_df['idx'].value_counts()

# Collect the repeated rows in a list and concatenate once. Growing a DataFrame
# with concat inside the loop is quadratic, and seeding it with an empty
# object-dtype frame degrades the column dtypes.
# NOTE(review): output rows follow the value_counts order (most frequent idx
# first), not the row order of test_segmented_split.csv — verify that whatever
# consumes this file does not rely on positional alignment with the segments.
expanded_parts = []
for idx, count in idx_counts.items():
    matching_rows = labels_df[labels_df['idx'] == idx]
    if not matching_rows.empty:
        expanded_parts.extend([matching_rows] * count)

if expanded_parts:
    expanded_labels = pd.concat(expanded_parts, ignore_index=True)
else:
    # No idx matched: keep the header so the CSV still has the right columns.
    expanded_labels = labels_df.iloc[0:0].copy()

# Save the expanded labels to a new CSV
expanded_labels.to_csv("/home/aa2650/datasets/code_15/subset/ground_truth_test_labels.csv", index=False)

print("ground_truth_test_labels.csv")
