# Saves metadata

Checks that each .mat file has at least 19 channels and that age and sex (gender) are not missing. The CPR number is saved as well, but the code does not require it to be present. For every file that meets these requirements, the subject id and its metadata are appended to a list, and the collected metadata is saved to a CSV file called valid_metadata.csv.

In [None]:
import os
import scipy.io
import numpy as np
import pandas as pd

In [None]:
# Folder containing all the .mat files. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences
# (\C, \D, \e), which Python reports as invalid and plans to reject.
data_dir = r'E:\ChristianMusaeus\Data\eAsMat_1968-14999'
# Name of the CSV file used for the metadata output.
output_csv = 'valid_metadata.csv'

# Functions
def load_mat_file(filepath):
    """Read the .mat file at ``filepath`` and return its contents.

    The file is loaded with ``squeeze_me=True`` and
    ``struct_as_record=False``, so MATLAB structs are exposed as objects
    whose fields are read with attribute access.
    """
    contents = scipy.io.loadmat(
        filepath,
        squeeze_me=True,
        struct_as_record=False,
    )
    return contents

def get_num_channels(mat_data):
    """Return the number of EEG channels listed in ``EEG.chanlocs``.

    Args:
        mat_data: Mapping with an ``'EEG'`` entry, either the struct object
            itself or a NumPy array wrapping it.

    Returns:
        The number of entries in ``chanlocs``, or 0 when the field is absent.
    """
    eeg = mat_data['EEG']
    if isinstance(eeg, np.ndarray):
        eeg = eeg.item()
    chanlocs = getattr(eeg, 'chanlocs', None)
    if chanlocs is None:
        return 0
    # A single channel may arrive as one bare struct object rather than an
    # array (the loader squeezes size-1 dimensions), and a bare object has no
    # len(). atleast_1d makes that case count as one channel.
    return len(np.atleast_1d(chanlocs))

def _is_missing(value):
    """Return True when a metadata value should be treated as absent."""
    if value is None:
        return True
    if isinstance(value, str):
        return value in ('', 'NA')
    if isinstance(value, np.ndarray):
        # Empty MATLAB fields are expected to load as zero-size arrays,
        # which would otherwise pass an ``is not None`` check.
        return value.size == 0
    if isinstance(value, (float, np.floating)):
        return bool(np.isnan(value))
    return False


def extract_metadata(mat_data):
    """Extract age, sex, and CPR from ``EEG.patientID``.

    Args:
        mat_data: Mapping with an ``'EEG'`` entry, either the struct object
            itself or a NumPy array wrapping it.

    Returns:
        ``(age, sex, cpr)`` when both age and sex are present, otherwise
        ``(None, None, None)``. CPR is passed through as found (``None`` if
        the field does not exist) and is not required to be present.
    """
    eeg = mat_data['EEG']
    if isinstance(eeg, np.ndarray):
        eeg = eeg.item()

    # Attribute lookups with defaults replace the former blanket
    # try/except, so unrelated errors are no longer silently swallowed.
    patient = getattr(eeg, 'patientID', None)
    if patient is None:
        return None, None, None

    age = getattr(patient, 'age', None)
    sex = getattr(patient, 'sex', None)
    cpr = getattr(patient, 'CPR', None)

    if _is_missing(age) or _is_missing(sex):
        return None, None, None
    return age, sex, cpr

# Main Loop
# Scan every .mat file in data_dir, keep those with at least 19 channels and
# non-missing age/sex, and write their metadata to output_csv.
records = []
# Sorted so the processing order (and therefore the CSV row order) is
# reproducible; os.listdir returns entries in arbitrary order.
mat_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.mat'))
total_files = len(mat_files)

for i, file in enumerate(mat_files, start=1):
    print(f"[{i}/{total_files}] Processing file {file}")
    filepath = os.path.join(data_dir, file)
    try:
        mat = load_mat_file(filepath)
        n_channels = get_num_channels(mat)
        if n_channels < 19:
            continue
        age, sex, cpr = extract_metadata(mat)
    except Exception as e:
        # Best-effort batch run: one unreadable or malformed file must not
        # abort the whole scan, so report it and move on.
        print(f"Skipping {file}: {e}")
        continue

    if age is None or sex is None:
        continue

    records.append({
        'subject_id': os.path.splitext(file)[0],
        'age': age,
        'sex': sex,
        'cpr': cpr,
    })

# Save to CSV
# Explicit columns guarantee a header row even when no file qualified, so
# the CSV can always be read back with pd.read_csv.
df = pd.DataFrame(records, columns=['subject_id', 'age', 'sex', 'cpr'])
df.to_csv(output_csv, index=False)
print(f"\n Done. Saved {len(df)} valid entries to {output_csv}")


In [None]:
# Reload the metadata CSV from disk and report how many rows it holds.
df = pd.read_csv('valid_metadata.csv')
print(f" Number of valid files: {df.shape[0]}")

This saves the metadata of 8292 files. The duration of each recording is then checked: if it is between 10 and 60 minutes (inclusive), the subject id is appended to the list valid_time_files.

In [None]:

# Folder containing the .mat files. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences
# (\C, \D, \e), which Python reports as invalid and plans to reject.
data_dir = r'E:\ChristianMusaeus\Data\eAsMat_1968-14999'
# Name of the metadata CSV file.
metadata_csv = 'valid_metadata.csv'

def load_mat_file(filepath):
    """Read the .mat file at ``filepath`` and return its contents.

    The file is loaded with ``squeeze_me=True`` and
    ``struct_as_record=False``, so MATLAB structs are exposed as objects
    whose fields are read with attribute access.
    """
    contents = scipy.io.loadmat(
        filepath,
        squeeze_me=True,
        struct_as_record=False,
    )
    return contents

# Load Valid Metadata
df = pd.read_csv(metadata_csv)

# Initialize List
# Collects the subject ids whose recording lasts between 10 and 60 minutes.
valid_time_files = []

# Loop Through Each File
total_files = len(df)

# Iterate the subject_id column directly: iterrows() builds a mixed-dtype
# Series per row and does not preserve dtypes, so a numeric id could be
# turned into a float and produce a file name such as "1968.0.mat".
# enumerate also keeps the progress counter independent of the index labels.
for i, subject_id in enumerate(df['subject_id'], start=1):
    file_path = os.path.join(data_dir, f"{subject_id}.mat")
    print(f"[{i}/{total_files}] Processing {subject_id}.mat...")

    try:
        mat = load_mat_file(file_path)
        eeg = mat['EEG']
        if isinstance(eeg, np.ndarray):
            eeg = eeg.item()

        data = getattr(eeg, 'data', None)
        sr = getattr(eeg, 'sr', None)  # Use 'sr' field for sampling rate

        # The sampling rate must be a single positive number; anything else
        # cannot yield a meaningful duration.
        if (
            not isinstance(data, np.ndarray)
            or sr is None
            or np.ndim(sr) != 0
            or sr <= 0
        ):
            print(f"   ⚠ Missing or invalid 'data' or 'sr'")
            continue

        if data.ndim != 2:
            print(f"   ⚠ Unexpected data shape: {data.shape}")
            continue

        # The second axis of the 2-D data array is used as the sample count.
        n_samples = data.shape[1]
        duration_sec = n_samples / sr
        duration_min = duration_sec / 60
        print(f"   → Duration: {duration_min:.2f} minutes")

        if 10 <= duration_min <= 60:
            valid_time_files.append(subject_id)
    except Exception as e:
        # Best-effort batch run: report the failing file and continue.
        print(f" Error reading {subject_id}.mat: {e}")

print(f"\n Done. Files between 10 and 60 minutes: {len(valid_time_files)}")


5778 files meet the duration requirement. The metadata for these 5778 subjects is then written to a new CSV file called metadata_time_filtered.csv.

In [None]:
# Keep only the metadata rows whose subject id is in valid_time_files and
# write them to metadata_time_filtered.csv.
valid_time_set = set(valid_time_files)  # set gives fast membership tests
metadata_df = pd.read_csv('valid_metadata.csv')

filtered_df = metadata_df.loc[metadata_df['subject_id'].isin(valid_time_set)]
filtered_df.to_csv('metadata_time_filtered.csv', index=False)

print(f"Done. Saved {len(filtered_df)} entries to metadata_time_filtered.csv")