In [35]:
import os
import glob
import pandas as pd

In [None]:
# Build dictionaries and process all CSV files

csv_folder = "./cbis-ddsm-breast-cancer-image-dataset/csv"
indir = "./cbis-ddsm-breast-cancer-image-dataset/jpeg"

# Robust solution derived from Midterms Prototype to solve file error
# File error persisted in calc_case_description_test_set.csv
# Load dicom_info and build UID->path dictionaries
print("Loading dicom_info and building dictionaries...")
dicom_info = pd.read_csv(os.path.join(csv_folder, "dicom_info.csv"))

# Categorize the image paths
full_mammo = dicom_info[dicom_info['SeriesDescription'] == 'full mammogram images']['image_path'].astype(str)
cropped_images = dicom_info[dicom_info['SeriesDescription'] == 'cropped images']['image_path'].astype(str)
roi_img = dicom_info[dicom_info['SeriesDescription'] == 'ROI mask images']['image_path'].astype(str)

# Replace CBIS-DDSM/jpeg with local JPEG directory
full_mammo = full_mammo.str.replace("CBIS-DDSM/jpeg", indir, regex=False)
cropped_images = cropped_images.str.replace("CBIS-DDSM/jpeg", indir, regex=False)
roi_img = roi_img.str.replace("CBIS-DDSM/jpeg", indir, regex=False)

# Build dictionaries
full_mammo_dict = {path.split("/")[3]: path for path in full_mammo}
cropped_images_dict = {path.split("/")[3]: path for path in cropped_images}
roi_img_dict = {path.split("/")[3]: path for path in roi_img}

print(f"  full_mammo_dict: {len(full_mammo_dict)} entries")
print(f"  cropped_images_dict: {len(cropped_images_dict)} entries")
print(f"  roi_img_dict: {len(roi_img_dict)} entries")

# Function to normalize any CBIS base to local JPEG directory (manually creating path)
# The pattern is: ./cbis-ddsm-breast-cancer-image-dataset/jpeg/{UID}/{filename}
def _normalize_path(p):
    try:
        return str(p).replace("CBIS-DDSM/jpeg", indir)
    except Exception:
        return p

# Fallback finder that looks in dicom_info if UID not in dicts
_series_map = {
    "full": "full mammogram images",
    "cropped": "cropped images",
    "roi": "ROI mask images",
}

def find_image_path(original_path: str, series_type: str) -> str:
    try:
        uid = str(original_path).split("/")[2]
        if series_type == "full" and uid in full_mammo_dict:
            return full_mammo_dict[uid]
        if series_type == "cropped" and uid in cropped_images_dict:
            return cropped_images_dict[uid]
        if series_type == "roi" and uid in roi_img_dict:
            return roi_img_dict[uid]

        series_desc = _series_map.get(series_type)
        if series_desc is not None:
            matches = dicom_info[
                (dicom_info['SeriesDescription'] == series_desc) &
                (dicom_info['image_path'].astype(str).str.contains(uid, na=False))
            ]
            if not matches.empty:
                return _normalize_path(matches.iloc[0]['image_path'])

        return _normalize_path(original_path)
    except Exception:
        return _normalize_path(original_path)

# Gather target case_description CSVs
csv_files = glob.glob(os.path.join(csv_folder, "*.csv"))
case_files = [f for f in csv_files if 'case_description' in f and '_cleaned' not in f]

print("Found case description files to process:")
for file in case_files:
    print(f"  - {os.path.basename(file)}")

# Process each case description file
for csv_file in case_files:
    print(f"\nProcessing {os.path.basename(csv_file)}...")

    df = pd.read_csv(csv_file)
    print(f"  Original shape: {df.shape}")

    required_columns = ['image file path', 'cropped image file path', 'ROI mask file path']
    if not all(col in df.columns for col in required_columns):
        print(f"  Skipping {os.path.basename(csv_file)} - missing required columns")
        continue

    failed_rows = []

    # Apply path fixing
    for index, row in df.iterrows():
        try:
            df.at[index, 'image file path'] = find_image_path(row['image file path'], "full")
            df.at[index, 'cropped image file path'] = find_image_path(row['cropped image file path'], "cropped")
            df.at[index, 'ROI mask file path'] = find_image_path(row['ROI mask file path'], "roi")
        except Exception as e:
            failed_rows.append((index, str(e)))

    # Add label column if pathology exists
    if 'pathology' in df.columns:
        label_map = {
            'MALIGNANT': 1,
            'BENIGN': 0,
            'BENIGN_WITHOUT_CALLBACK': 0,
        }
        df['label'] = df['pathology'].replace(label_map)
        print("  Added label column")

    # Save the cleaned file
    output_file = csv_file.replace('.csv', '_cleaned.csv')
    df.to_csv(output_file, index=False)
    print(f"  Saved cleaned file: {os.path.basename(output_file)}")
    print(f"  Final shape: {df.shape}")

print("\nAll files processed!")

Loading dicom_info and building dictionaries...
  full_mammo_dict: 2857 entries
  cropped_images_dict: 3567 entries
  roi_img_dict: 3247 entries
Found case description files to process:
  - calc_case_description_test_set.csv
  - calc_case_description_train_set.csv
  - mass_case_description_test_set.csv
  - mass_case_description_train_set.csv

Processing calc_case_description_test_set.csv...
  Original shape: (326, 14)
  Added label column
  Saved cleaned file: calc_case_description_test_set_cleaned.csv
  Final shape: (326, 15)

Processing calc_case_description_train_set.csv...
  Original shape: (1546, 14)
  Added label column
  Saved cleaned file: calc_case_description_train_set_cleaned.csv
  Final shape: (1546, 15)

Processing mass_case_description_test_set.csv...
  Original shape: (378, 14)
  Added label column
  Saved cleaned file: mass_case_description_test_set_cleaned.csv
  Final shape: (378, 15)

Processing mass_case_description_train_set.csv...
  Original shape: (1318, 14)
  Add

  df['label'] = df['pathology'].replace(label_map)
  df['label'] = df['pathology'].replace(label_map)
  df['label'] = df['pathology'].replace(label_map)
  df['label'] = df['pathology'].replace(label_map)


In [None]:
# Merge all cleaned CSV files into a single combined file
import pandas as pd
import glob
import os

print("=== MERGING ALL CLEANED CSV FILES ===")

# Get all cleaned CSV files
csv_folder = "./cbis-ddsm-breast-cancer-image-dataset/csv"
cleaned_files = glob.glob(os.path.join(csv_folder, "*_cleaned.csv"))

print("Found cleaned files:")
for file in cleaned_files:
    print(f"  - {os.path.basename(file)}")

# Load and combine all cleaned files
combined_data = []

for file in cleaned_files:
    df = pd.read_csv(file)
    print(f"  Shape: {df.shape}")
    
    # Add a source column to identify which file each row came from
    source_name = os.path.basename(file).replace("_cleaned.csv", "")
    df['source_file'] = source_name
    
    combined_data.append(df)

# Combine all dataframes
if combined_data:
    combined_df = pd.concat(combined_data, ignore_index=True)
    
    print(f"\n=== COMBINED DATASET ===")
    print(f"Total rows: {len(combined_df)}")
    print(f"Total columns: {len(combined_df.columns)}")
    
    
    # Save the combined dataset
    output_path = os.path.join(csv_folder, "combined_cleaned.csv")
    combined_df.to_csv(output_path, index=False)
    print(f"\n Saved combined dataset to: {output_path}")
    
    # Show first few rows
    print(f"\nFirst 3 rows of combined dataset:")
    print(combined_df.head(3))
    
else:
    print(" No cleaned files found!")


=== MERGING ALL CLEANED CSV FILES ===
Found cleaned files:
  - calc_case_description_test_set_cleaned.csv
  - calc_case_description_train_set_cleaned.csv
  - mass_case_description_test_set_cleaned.csv
  - mass_case_description_train_set_cleaned.csv
  Shape: (326, 15)
  Shape: (1546, 15)
  Shape: (378, 15)
  Shape: (1318, 15)

=== COMBINED DATASET ===
Total rows: 3568
Total columns: 19

 Saved combined dataset to: ./cbis-ddsm-breast-cancer-image-dataset/csv\combined_cleaned.csv

First 3 rows of combined dataset:
  patient_id  breast density left or right breast image view  abnormality id  \
0    P_00038             2.0                 LEFT         CC               1   
1    P_00038             2.0                 LEFT        MLO               1   
2    P_00038             2.0                RIGHT         CC               1   

  abnormality type             calc type calc distribution  assessment  \
0    calcification  PUNCTATE-PLEOMORPHIC         CLUSTERED           4   
1    calcifica