In [1]:
import os
import pandas as pd
from pathlib import Path

In [2]:
import torch

In [3]:
from pathlib import Path
import pandas as pd

DATA_ROOT = Path("./images_dataset/manifest/CBIS-DDSM")

folders = [p for p in DATA_ROOT.iterdir() if p.is_dir()]
len(folders)


6671

In [4]:
import re

def parse_folder_name(folder_name):
    """
    Ex: Calc-Test_P_00038_LEFT_CC_1
    """
    pattern = (
        r'(?P<lesion>Calc|Mass)-'
        r'(?P<split>Training|Test)_'
        r'P_(?P<patient>\d+)_'
        r'(?P<side>LEFT|RIGHT)_'
        r'(?P<view>CC|MLO)'
        r'(?P<suffix>_\d+)?'
    )

    match = re.match(pattern, folder_name)
    if not match:
        return None

    return match.groupdict()


In [5]:
records = []

for folder in folders:
    info = parse_folder_name(folder.name)
    if info is None:
        continue

    records.append({
        "patient_id": f"P_{info['patient']}",
        "split": info["split"],
        "lesion_type": info["lesion"],
        "laterality": info["side"],
        "view": info["view"],
        "series_variant": info["suffix"] if info["suffix"] else "_0",
        "folder_path": folder
    })

fs_df = pd.DataFrame(records)
fs_df.head()


Unnamed: 0,patient_id,split,lesion_type,laterality,view,series_variant,folder_path
0,P_00038,Test,Calc,LEFT,CC,_0,images_dataset\manifest\CBIS-DDSM\Calc-Test_P_...
1,P_00038,Test,Calc,LEFT,CC,_1,images_dataset\manifest\CBIS-DDSM\Calc-Test_P_...
2,P_00038,Test,Calc,LEFT,MLO,_0,images_dataset\manifest\CBIS-DDSM\Calc-Test_P_...
3,P_00038,Test,Calc,LEFT,MLO,_1,images_dataset\manifest\CBIS-DDSM\Calc-Test_P_...
4,P_00038,Test,Calc,RIGHT,CC,_0,images_dataset\manifest\CBIS-DDSM\Calc-Test_P_...


In [6]:
from pathlib import Path

def list_dicom_files(folder: Path):
    return sorted([
        f for f in folder.glob("**/*")
        if f.is_file() and f.suffix.lower() in [".dcm", ""]
    ])

fs_df["dicom_files"] = fs_df["folder_path"].apply(list_dicom_files)
fs_df["num_dicoms"] = fs_df["dicom_files"].apply(len)

fs_df[["num_dicoms"]].describe()


Unnamed: 0,num_dicoms
count,6671.0
mean,1.534852
std,0.498821
min,1.0
25%,1.0
50%,2.0
75%,2.0
max,2.0


In [7]:
fs_df = fs_df[fs_df["num_dicoms"] > 0].reset_index(drop=True)

In [8]:
fs_df.shape
fs_df["lesion_type"].value_counts()


lesion_type
Calc    3383
Mass    3288
Name: count, dtype: int64

In [9]:
import pydicom
import numpy as np

import numpy as np

def load_dicom_image(dcm_path):
    dcm = pydicom.dcmread(dcm_path)
    img = dcm.pixel_array.astype(np.float32)

    # Inverter se necessário
    if getattr(dcm, "PhotometricInterpretation", "") == "MONOCHROME1":
        img = img.max() - img

    # Normalização robusta
    img -= img.min()
    img /= (img.max() + 1e-6)

    return img


In [10]:
def mammography_window(img, low=0.5, high=99.5):
    vmin, vmax = np.percentile(img, [low, high])
    img = np.clip(img, vmin, vmax)
    img = (img - vmin) / (vmax - vmin + 1e-6)
    return img


In [11]:
import torch
import torch.nn.functional as F

def preprocess_image(img):
    x = torch.tensor(img, dtype=torch.float32)  # <-- fallback seguro
    x = x.unsqueeze(0).unsqueeze(0)              # [1,1,H,W]
    x = F.interpolate(
        x,
        size=(224, 224),
        mode="bilinear",
        align_corners=False
    )
    x = x.repeat(1, 3, 1, 1)
    return x




In [12]:
import torch
import torchvision.models as models
import torchvision.transforms as T

device = "cuda" if torch.cuda.is_available() else "cpu"

model = models.resnet50(weights="IMAGENET1K_V2")
model.fc = torch.nn.Identity()
model = model.to(device).eval()


In [13]:
to_tensor = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])


In [14]:
@torch.no_grad()
def embed_image(x):
    x = x.to(device)
    emb = model(x)
    return emb.squeeze(0).cpu()   # <-- SEM numpy


In [15]:
def embed_series(dicom_files):
    embeddings = []

    for dcm_path in dicom_files:
        img = load_dicom_image(dcm_path)
        x = preprocess_image(img)
        emb = embed_image(x)      # torch.Tensor
        embeddings.append(emb)
    if len(embeddings) == 0:
        return None

    return torch.stack(embeddings).mean(dim=0)



In [16]:
from tqdm import tqdm

series_embeddings = []

for idx, row in tqdm(fs_df.iterrows(), total=len(fs_df)):
    emb = embed_series(row["dicom_files"])

    if emb is None:
        continue

    series_embeddings.append({
        "patient_id": row["patient_id"],
        "split": row["split"],
        "lesion_type": row["lesion_type"],
        "laterality": row["laterality"],
        "view": row["view"],
        "series_variant": row["series_variant"],
        "embedding": emb
    })

emb_df = pd.DataFrame(series_embeddings)


100%|██████████████████████████████████████████████████████████████████████████████| 6671/6671 [24:09<00:00,  4.60it/s]


In [17]:
import pickle

with open("mammo_embeddings.pkl", "wb") as f:
    pickle.dump(emb_df, f)


In [18]:
print(emb_df.shape)
print(emb_df.columns.tolist())
emb_df.head()


(6671, 7)
['patient_id', 'split', 'lesion_type', 'laterality', 'view', 'series_variant', 'embedding']


Unnamed: 0,patient_id,split,lesion_type,laterality,view,series_variant,embedding
0,P_00038,Test,Calc,LEFT,CC,_0,"[tensor(0.0068), tensor(0.1932), tensor(0.), t..."
1,P_00038,Test,Calc,LEFT,CC,_1,"[tensor(0.), tensor(0.2228), tensor(0.), tenso..."
2,P_00038,Test,Calc,LEFT,MLO,_0,"[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
3,P_00038,Test,Calc,LEFT,MLO,_1,"[tensor(0.), tensor(0.1231), tensor(0.), tenso..."
4,P_00038,Test,Calc,RIGHT,CC,_0,"[tensor(0.0029), tensor(0.2364), tensor(0.0077..."


In [22]:
np.save("mammo_embeddings.npy", np.stack(emb_df["embedding"].values))
emb_df.drop(columns=["embedding"]).to_csv("mammo_metadata.csv", index=False)

ImportError: Can't determine version for zstandard

In [24]:
import zstandard
print("MODULE:", zstandard)
print("FILE:", getattr(zstandard, "__file__", None))
print("VERSION:", getattr(zstandard, "__version__", "SEM __version__"))


MODULE: <module 'zstandard' (<_frozen_importlib_external.NamespaceLoader object at 0x000001488026B0D0>)>
FILE: None
VERSION: SEM __version__
