In [None]:
import os
import pandas as pd
import numpy as np
import gzip
import pickle
import pydicom

In [None]:
# INTERNAL USE: Load pkl into a single dataframe
def get_load_pkl(folder_path):
    """Load every ``<subfolder>/<subfolder>.pkl.gz`` under ``folder_path`` into one DataFrame.

    Each immediate subdirectory of ``folder_path`` is expected to hold a
    gzip-compressed pickle named after the subdirectory itself. Pickles whose
    payload is a ``dict`` or a ``list`` are converted to DataFrames and
    concatenated; any other payload type is skipped, as are plain files and
    subdirectories without the expected pickle.

    Parameters
    ----------
    folder_path : str or path-like
        Directory whose immediate subdirectories are scanned.

    Returns
    -------
    pandas.DataFrame
        All loaded frames stacked with a fresh 0..n-1 index. An empty
        DataFrame is returned when nothing could be loaded.
    """
    dfs = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the result reproducible across runs and machines.
    for subfolder in sorted(os.listdir(folder_path)):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        file_path = os.path.join(subfolder_path, f"{subfolder}.pkl.gz")
        if not os.path.exists(file_path):
            continue

        # SECURITY: pickle.load can execute arbitrary code. Only point this
        # function at pickles produced by a trusted pipeline.
        with gzip.open(file_path, "rb") as f:
            data = pickle.load(f)

        if isinstance(data, (dict, list)):
            dfs.append(pd.DataFrame(data))

    # pd.concat raises ValueError on an empty list; return an empty frame so
    # callers can handle "nothing found" without a try/except.
    if not dfs:
        return pd.DataFrame()

    return pd.concat(dfs, ignore_index=True)

In [None]:
# INTERNAL USE: Replace DICOM tag numbers with labels
def get_tag_name(tag):
    """Return the DICOM keyword for ``tag``, falling back to its string form.

    Parameters
    ----------
    tag : pydicom.tag.BaseTag or any
        A DICOM tag, or any other value (e.g. a column name that is already a
        string).

    Returns
    -------
    str or any
        For a ``BaseTag``: the keyword returned by
        ``pydicom.datadict.keyword_for_tag`` when it is non-empty, otherwise
        ``str(tag)``. Any value that is not a ``BaseTag`` is returned unchanged.
    """
    if not isinstance(tag, pydicom.tag.BaseTag):
        return tag

    try:
        keyword = pydicom.datadict.keyword_for_tag(tag)
    except Exception:
        # Best-effort lookup: fall back to the raw tag text on any lookup
        # error, but let KeyboardInterrupt/SystemExit propagate (a bare
        # ``except:`` would have swallowed those too).
        return str(tag)

    return keyword if keyword else str(tag)

In [None]:
# INTERNAL USE: Convert DICOM value to standard data types
def convert_dcm_value(x):
    """Unwrap a DICOM element and convert pydicom value types to plain Python.

    A ``DataElement`` is replaced by its ``.value``; anything else is used as
    given. A ``MultiValue`` becomes a tuple in which every ``DSfloat`` item is
    converted to ``float`` and other items are kept as they are. A lone
    ``DSfloat`` becomes a ``float``. Every other value is returned unchanged.
    """
    raw = x.value if isinstance(x, pydicom.dataelem.DataElement) else x

    if isinstance(raw, pydicom.multival.MultiValue):
        converted = []
        for item in raw:
            if isinstance(item, pydicom.valuerep.DSfloat):
                converted.append(float(item))
            else:
                converted.append(item)
        return tuple(converted)

    if isinstance(raw, pydicom.valuerep.DSfloat):
        return float(raw)

    return raw

In [None]:
# INTERNAL USE: Clean data types
def clean_dcm_column(colname, series):
    """Normalise the value types of selected DICOM metadata columns.

    Parameters
    ----------
    colname : str
        Name of the column being cleaned; selects the rule applied.
    series : pandas.Series
        The column's values.

    Returns
    -------
    pandas.Series
        * ``ScanningSequence`` / ``SequenceVariant``: every value that is not
          already a tuple is wrapped in a one-element tuple.
        * ``ImageType``: every value becomes a ``set``. Lists and tuples are
          converted directly, a string is split on the backslash that
          separates multi-valued DICOM strings, and anything else (including
          missing values) becomes an empty set.
        * ``InversionTime``: ``DSfloat`` values are converted to ``float`` and
          the column is coerced to numeric, with unparseable entries as NaN.
        * Any other column is returned unchanged.
    """
    def image_type_to_set(x):
        if isinstance(x, str):
            # set("ORIGINAL") would yield individual characters, which no
            # later keyword check can match; keep whole tokens instead.
            return {part.strip() for part in x.split("\\") if part.strip()}
        if isinstance(x, (list, tuple)):
            return set(x)
        return set()

    if colname in {"ScanningSequence", "SequenceVariant"}:
        return series.apply(lambda x: x if isinstance(x, tuple) else (x,))

    if colname == "ImageType":
        return series.apply(image_type_to_set)

    if colname == "InversionTime":
        series = series.apply(lambda x: float(x) if isinstance(x, pydicom.valuerep.DSfloat) else x)
        return pd.to_numeric(series, errors="coerce")

    return series

In [None]:
# INTERNAL USE: Remove potential report images
def remove_report_images(df):
    """Drop rows that look like scanned reports or screenshots rather than MR images.

    A row is removed when any of the following holds:

    * ``Sequence`` contains one of the report keywords (case-insensitive);
    * ``Manufacturer`` contains one of the listed vendor names
      (case-insensitive);
    * any ``ImageType`` item contains "screen", "scan" or "doc";
    * any ``ImageType`` item contains "secondary" while ``Manufacturer`` is
      missing and ``Modality`` equals "MR".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Sequence``, ``Manufacturer``, ``ImageType``
        and ``Modality``. ``ImageType`` values may be any iterable of items, a
        single string, or missing.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows that were not flagged; the original index labels
        are kept.
    """
    report_keywords = ["mr_brain", "screen", "docs", "document", "paperwork", "pdf", "report"]
    # NOTE(review): "Canon" also appears to be an MR scanner vendor name; confirm
    # that excluding it here is intended.
    report_manufacturers = ["PACS", "LEXMARK", "Hyland", "FUJITSU", "Viztek", "Canon", "Sorna", "Altamont", "INTELETAD", "Carestream"]
    image_type_keywords = ["screen", "scan", "doc"]

    def image_type_items(value):
        # A bare string is one item (iterating it would yield characters);
        # missing or otherwise non-iterable values contribute no items.
        if isinstance(value, str):
            return (value,)
        try:
            return tuple(value)
        except TypeError:
            return ()

    def image_type_contains(keywords):
        return df["ImageType"].apply(
            lambda value: any(
                kw in str(item).lower()
                for item in image_type_items(value)
                for kw in keywords
            )
        ).astype(bool)

    report_mask = (
        df["Sequence"].str.contains("|".join(report_keywords), case=False, na=False)
        | df["Manufacturer"].str.contains("|".join(report_manufacturers), case=False, na=False)
        | image_type_contains(image_type_keywords)
        | (
            image_type_contains(["secondary"])
            & df["Manufacturer"].isna()
            & (df["Modality"] == "MR")
        )
    )

    # Return an independent copy: later cleaning steps assign columns in place,
    # which on a filtered slice triggers pandas' SettingWithCopyWarning.
    return df[~report_mask].copy()

In [None]:
# INTERNAL USE: Clean sequence column
def clean_sequence_column(df):
    """Normalise the ``Sequence`` column of ``df`` in place and return ``df``.

    A leading run of digits followed by one or more underscores is removed
    from each name, then the name is lower-cased. The ``Sequence`` column of
    the passed frame is overwritten.
    """
    without_prefix = df["Sequence"].str.replace(r"^\d+_+", "", regex=True)
    df["Sequence"] = without_prefix.str.lower()
    return df

In [None]:
# Identify MRI sequence name based on DICOM metadata
def identify_mri_sequence(df):
    """Label every row of ``df`` with an MRI sequence name inferred from DICOM metadata.

    Classification runs in two stages. ``ImageType`` is first searched for
    tokens that directly name a derived or special-purpose series (diffusion
    maps, perfusion, SWI, angiography, fMRI, spectroscopy). If nothing
    matches, the label is derived from ``ScanningSequence`` /
    ``SequenceVariant`` together with the numeric acquisition parameters.
    Finally, rows are relabelled "Localizer" when their series number is 1 and
    the label contains "Unknown", or when ``Sequence`` matches one of the
    localizer name patterns.

    Parameters
    ----------
    df : pandas.DataFrame
        Required columns: ``ScanningSequence``, ``SequenceVariant``,
        ``ImageType``, ``SeriesNumber``, ``Sequence``.
        Optional columns (treated as missing when absent): ``EchoTime``,
        ``RepetitionTime``, ``FlipAngle``, ``InversionTime``,
        ``DiffusionBValue``, ``MRAcquisitionType``, ``EchoTrainLength``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with a new
        ``IdentifiedSequence`` column.
    """
    sequence_types = (list, tuple, set, frozenset, np.ndarray)

    def to_set(x):
        # Normalise a multi-valued field to a set of upper-case string tokens.
        if isinstance(x, sequence_types):
            flat = []
            for item in x:
                # Flatten one level of nesting (e.g. a tuple wrapped in a tuple).
                if isinstance(item, sequence_types):
                    flat.extend(item)
                else:
                    flat.append(item)
            return {str(item).upper() for item in flat}
        if pd.isna(x):
            return set()
        # Raw DICOM multi-valued strings separate values with a backslash.
        return {part.upper() for part in str(x).replace(" ", "").split("\\")}

    def to_number(row, key):
        # Missing, None or non-numeric values become NaN so that every range
        # comparison below simply evaluates to False instead of raising.
        try:
            return float(row.get(key, np.nan))
        except (TypeError, ValueError):
            return np.nan

    def label_from_image_type(img_type_set, bval):
        # Tokens are already upper-cased by to_set.
        if any("ADC" in item for item in img_type_set):
            return "DWI ADC"
        if "TRACEW" in img_type_set:
            return "DWI TRACEW"
        # "FA" must not match fat-related tokens.
        if any("FA" in item and "FAT" not in item for item in img_type_set):
            return "DWI FA"
        if any("ISO" in item for item in img_type_set):
            return "DWI ISO"
        if "EXP" in img_type_set:
            return "DWI EXP"
        if (
            any("DIFF" in item or "DWI" in item for item in img_type_set)
            or "DFC" in img_type_set
            or bval > 0
        ):
            # The previous test ``bval and bval == 0`` could never be true
            # (0 is falsy), which made the "DWI b0" label unreachable.
            return "DWI b0" if bval == 0 else "DWI"

        if any("ASL" in item for item in img_type_set):
            return "PWI ASL"
        if any("PERFUSION" in item for item in img_type_set) or "CBF" in img_type_set:
            return "PWI"

        if any("SW" in item for item in img_type_set):
            return "SWI"

        if "TOF" in img_type_set or "ANGIO" in img_type_set or any("MIP" in item for item in img_type_set):
            return "MRA TOF"

        if "FMRI" in img_type_set:
            return "fMRI"

        if any("SPECTR" in item for item in img_type_set):
            return "MRS"

        return None

    def label_from_pulse_sequence(scan_seq_set, seq_var_set, te, tr, fa, ti, dim, etl):
        # Echo-planar family.
        if "EP" in scan_seq_set:
            if 25 <= te <= 50 and 1000 <= tr <= 3000 and etl >= 30:
                return "fMRI"
            if te >= 40 and tr >= 1800:
                return "DWI"
            if te < 60 and tr > 1000 and etl > 1:
                return "PWI"
            return "EPI Unknown"

        # Spin-echo family (explicit SE/RM, or a >= 90 degree flip without GR).
        if "SE" in scan_seq_set or "RM" in scan_seq_set or ("GR" not in scan_seq_set and fa >= 90):
            if "IR" in scan_seq_set or ti > 0:
                if ti >= 1200 and tr >= 2000 and te >= 60:
                    return "T2 FLAIR"
                if 400 <= ti < 1800 and tr >= 450:
                    return "T1 FLAIR"
                if ti <= 400 and tr >= 1000:
                    return "STIR"
                if dim == "2D":
                    return "IR Unknown"
                # NOTE(review): non-2D inversion-recovery rows that match none
                # of the rules end up as plain "Unknown" - confirm intended.
                return "Unknown"
            if te <= 25 and tr <= 1500:
                return "T1 SE"
            if te >= 30 and tr >= 800:
                return "T2 SE"
            if te <= 25 and tr > 1500:
                return "PD SE"
            return "SE Unknown"

        # Gradient-echo family.
        if "GR" in scan_seq_set:
            if dim == "3D" and "MP" in seq_var_set:
                if 300 <= ti <= 3000 and 1000 <= tr <= 5000:
                    return "T1 MPRAGE"
                # NOTE(review): 3D MP rows outside these ranges skip the
                # remaining GRE rules and become "Unknown" - confirm intended.
                return "Unknown"
            if te < 10 and tr <= 500:
                # Parentheses make the original and/or precedence explicit.
                if ("SP" in seq_var_set and "IR" in scan_seq_set) or ti > 0:
                    return "T1 SPGR"
                if 30 < fa <= 90 and tr < 15:
                    return "bSSFP"
                if 2.5 <= te <= 9 and 20 <= tr <= 40:
                    return "MRA TOF"
                return "T1 GRE"
            if te >= 10 and 120 <= tr <= 1600:
                return "T2* GRE"
            if dim == "3D" and fa <= 40 and te < 55 and tr < 120:
                return "SWI"
            return "GRE Unknown"

        return "Unknown"

    def classify(row):
        scan_seq_set = to_set(row["ScanningSequence"])
        seq_var_set = to_set(row["SequenceVariant"])
        img_type_set = to_set(row["ImageType"])

        te = to_number(row, "EchoTime")
        tr = to_number(row, "RepetitionTime")
        fa = to_number(row, "FlipAngle")
        ti = to_number(row, "InversionTime")
        bval = to_number(row, "DiffusionBValue")
        etl = to_number(row, "EchoTrainLength")
        dim = row.get("MRAcquisitionType", np.nan)

        # Identify from common labels in ImageType
        label = label_from_image_type(img_type_set, bval)
        if label is not None:
            return label

        # Identify from pulse sequence
        return label_from_pulse_sequence(scan_seq_set, seq_var_set, te, tr, fa, ti, dim, etl)

    df["IdentifiedSequence"] = df.apply(classify, axis=1)
    # An unclassified first series is treated as a localizer.
    df.loc[(df["SeriesNumber"] == 1) & (df["IdentifiedSequence"].str.contains("Unknown")), "IdentifiedSequence"] = "Localizer"
    # A localizer-like series name overrides whatever the metadata suggested.
    df.loc[df["Sequence"].str.contains(r"loc|scout|aahead|cal|scano|3_pl|3pl|survey", case=False, na=False), "IdentifiedSequence"] = "Localizer"

    return df

In [None]:
# INTERNAL USE: Map identified sequence to BIDS standard
# Maps every label produced by identify_mri_sequence to the suffix stored in
# the BIDSModality column. Labels without an entry fall back to whatever
# default the caller supplies (the example usage below uses "OTHER").
sequence_to_bids = {
    # T1-weighted
    "T1 SE": "T1w",
    "T1 MPRAGE": "T1w",
    "T1 SPGR": "T1w",
    "T1 GRE": "T1w",
    "T1 FLAIR": "T1w",
    # T2-weighted
    "T2 SE": "T2w",
    "bSSFP": "T2w",
    "STIR": "T2w",
    "T2* GRE": "T2starw",
    "T2 FLAIR": "FLAIR",
    "PD SE": "PDw",
    "SWI": "swi",
    # Diffusion (raw and derived maps)
    "DWI": "dwi",
    # "DWI b0" is a label identify_mri_sequence names; without this entry it
    # would silently map to the fallback instead of "dwi".
    "DWI b0": "dwi",
    "DWI ADC": "dwi",
    "DWI TRACEW": "dwi",
    "DWI FA": "dwi",
    "DWI EXP": "dwi",
    "DWI ISO": "dwi",
    "MRA TOF": "angio",
    # Perfusion
    "PWI": "asl",
    "PWI ASL": "asl",
    "fMRI": "bold",
    "MRS": "mrs",
    # Non-diagnostic or unclassified series
    "Localizer": "OTHER localizer",
    "EPI Unknown": "OTHER unknown",
    "IR Unknown": "OTHER unknown",
    "SE Unknown": "OTHER unknown",
    "GRE Unknown": "OTHER unknown",
    "Unknown": "OTHER unknown"
}

In [None]:
# INTERNAL USE: Example usage of mapping identified sequence to BIDS standard
# NOTE(review): mri_mgb is not defined in the visible cells; presumably a
# DataFrame that already went through identify_mri_sequence - confirm.
mri_mgb["BIDSModality"] = (
    mri_mgb["IdentifiedSequence"]
    .map(sequence_to_bids)  # labels missing from the mapping become NaN
    .fillna("OTHER")        # ...and are then reported as "OTHER"
)