## Comparing to the ground truth file
Compare our 650 `.svs` files against the ground-truth file and remove any file that is invalid or not listed in it.

In [1]:
def is_valid_svs_file(filename, excel_path="PatientsSheet.xlsx", columnNumb=0):
    """
    Check whether a .svs filename belongs to a patient with a usable label.

    A filename is accepted when:
    - it ends with ``.svs`` (case-insensitive) and has at least 12 characters;
    - its first 12 characters equal a patient ID in column 0 of the sheet;
    - that patient's cell in column ``columnNumb`` is not NaN, empty,
      ``'nan'`` or ``'none'``.

    The sheet is read without a header row; if the first cell of the first
    row is "PATIENTS" that row is treated as a header and skipped. When
    several rows match the patient ID, the first one is used. Every problem
    is reported with ``print`` and signalled through the return value; no
    exception from reading the sheet is propagated.

    Args:
        filename (str): Filename to check (should end in .svs).
        excel_path (str): Path to the Excel sheet.
        columnNumb (int): 0-based index of the column holding the label.
            Defaults to 0, which is the patient-ID column itself; the
            processing loop in this file passes 1.

    Returns:
        tuple: (bool, value) where bool is True if valid, False otherwise,
               and value is the stripped string label from the specified
               column, or None if invalid.
    """
    # Check if filename ends with .svs
    if not filename.lower().endswith('.svs'):
        print(f"Error: Filename '{filename}' does not end with .svs")
        return False, None

    # Check if filename is long enough to extract patient ID
    if len(filename) < 12:
        print(f"Error: Filename '{filename}' is too short (less than 12 characters)")
        return False, None

    # Extract patient ID from first 12 characters
    patient_id = filename[:12]

    # Imported here because this cell does not import pandas at module level;
    # without it the function fails with NameError on a fresh kernel.
    import pandas as pd

    try:
        # header=None: the header row (if any) is detected manually below
        df = pd.read_excel(excel_path, header=None)
    except FileNotFoundError:
        print(f"Error: Excel file '{excel_path}' not found.")
        return False, None
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return False, None

    # Check if DataFrame is empty
    if df.empty:
        print("Error: Excel file is empty.")
        return False, None

    # Handle headers - skip first row if it contains "PATIENTS"
    if str(df.iloc[0, 0]).strip().upper() == "PATIENTS":
        df = df.iloc[1:]

    # Check if we have data after potential header removal
    if df.empty:
        print("Error: No data found after header processing.")
        return False, None

    # Ensure we have enough columns (at least columnNumb + 1 since it's 0-indexed)
    required_columns = max(1, columnNumb + 1)
    if df.shape[1] < required_columns:
        print(f"Error: Excel file must have at least {required_columns} columns. "
              f"Found {df.shape[1]} columns. Cannot access column {columnNumb}.")
        return False, None

    # Work on cleaned copies of the two columns instead of assigning the
    # string versions back into the frame: writing strings into a column of
    # another dtype through .iloc is a dtype-changing in-place assignment.
    patient_ids = df.iloc[:, 0].astype(str).str.strip()
    labels = df.iloc[:, columnNumb].astype(str).str.strip()

    # Labels of the rows belonging to this patient
    patient_labels = labels[patient_ids == patient_id]

    if patient_labels.empty:
        print(f"Patient ID '{patient_id}' not found in Excel file.")
        return False, None

    # Check label validity in specified column (first matching row wins)
    label = patient_labels.iloc[0]
    if pd.isna(label) or str(label).strip().lower() in ('nan', '', 'none'):
        print(f"Patient '{patient_id}' has invalid or empty label in column {columnNumb}: '{label}'")
        return False, None

    print(f"Valid file: Patient '{patient_id}' found with label in column {columnNumb}: '{label}'")
    return True, label

## Magnification

In [2]:
import os
import csv
from pathlib import Path
from collections import Counter
import openslide

def get_slide_metadata(svs_file):
    """
    Print and collect every OpenSlide property of a slide file.

    Args:
        svs_file (str): Path to the slide file.

    Returns:
        tuple: (metadata, file_size_mb, error).
            On success ``metadata`` is a dict of the slide's properties,
            ``file_size_mb`` is the file size in bytes divided by 1024*1024,
            and ``error`` is None.
            On any failure the result is ({}, None, <exception message>).
    """
    metadata = {}
    filename = os.path.basename(svs_file)

    try:
        # Context manager guarantees the slide handle is released even when
        # reading or printing a property raises.
        with openslide.OpenSlide(svs_file) as slide:
            print(f"\n--- Metadata for: {filename} ---")
            for key, value in slide.properties.items():
                print(f"{key}: {value}")
                metadata[key] = value

        file_size_mb = os.path.getsize(svs_file) / (1024 * 1024)
        return metadata, file_size_mb, None
    except Exception as e:
        # Best-effort helper: report the problem and return an empty result
        # rather than propagating.
        print(f"\n!!! Failed to open {filename}: {e}")
        return {}, None, str(e)

def check_and_log_svs_metadata(svs_file, label, writer, counts):
    """
    Classify a slide by its magnification metadata and log one CSV row.

    Reads 'openslide.objective-power', 'openslide.mpp-x' and
    'openslide.mpp-y' from the slide, prints them with the file size, and
    writes the row
    [filename, size, status, objective power, mpp x, mpp y, label, new name]
    where the new name is the first 12 characters of the filename joined to
    ``label`` with an underscore.

    Args:
        svs_file (str): Path to the slide file.
        label: Label for the slide; written to the row and used in the new name.
        writer: Object with a ``writerow(list)`` method (e.g. ``csv.writer``).
        counts: Mapping of counters that supports ``counts[key] += 1`` for
            missing keys (e.g. ``collections.Counter``). Updated keys:
            "objective_power", "only_mpp", "no_magnification_info",
            "small_files", "failed_to_open".

    Returns:
        None. Any exception raised while processing is caught, counted under
        "failed_to_open" and logged as a "Failed to open: ..." row.
    """
    filename = os.path.basename(svs_file)

    # Patient prefix = first 12 characters of the filename, e.g. 'TCGA-AA-3524'
    new_name = f"{filename[:12]}_{label}"

    try:
        # Only the property lookups need the slide open; the context manager
        # closes it even if one of them raises.
        with openslide.OpenSlide(svs_file) as slide:
            props = slide.properties
            objective_power = props.get('openslide.objective-power', 'N/A')
            mpp_x = props.get('openslide.mpp-x', 'N/A')
            mpp_y = props.get('openslide.mpp-y', 'N/A')

        file_size_mb = os.path.getsize(svs_file) / (1024 * 1024)

        # Print key metadata
        print(f"--- Metadata for: {filename} ---")
        print(f"Objective Power: {objective_power}")
        print(f"MPP X: {mpp_x}")
        print(f"MPP Y: {mpp_y}")
        print(f"Size: {file_size_mb:.2f} MB")

        # Objective power takes precedence; MPP (either axis) is the fallback.
        has_magnification = True
        if objective_power != 'N/A':
            status = "Has Objective Power"
            counts["objective_power"] += 1
        elif mpp_x != 'N/A' or mpp_y != 'N/A':
            status = "Has Only MPP"
            counts["only_mpp"] += 1
        else:
            has_magnification = False
            status = "Missing Magnification Info"
            counts["no_magnification_info"] += 1

        # Small files are only flagged for slides that do have magnification
        # info; slides without it are not counted as small.
        if has_magnification and file_size_mb < 50:
            status += " | <50MB"
            counts["small_files"] += 1

        writer.writerow([
            filename,
            f"{file_size_mb:.2f}",
            status,
            objective_power,
            mpp_x,
            mpp_y,
            label,
            new_name
        ])

    except Exception as e:
        print(f"\n!!! Failed to open {filename}: {e}")
        counts["failed_to_open"] += 1
        writer.writerow([filename, "N/A", f"Failed to open: {e}", "N/A", "N/A", "N/A", label, new_name])



## stats

In [None]:
# -------------------- MAIN SCRIPT --------------------
# Walk every sub-directory of wsi_root, keep the .svs files that
# is_valid_svs_file accepts against column 1 of PatientsSheet.xlsx, and log
# each accepted slide's magnification metadata to output_csv.

wsi_root = "D:/Aamir Gulzar/WSI_Raw_data"
flat_dir = "./FLAT_DIRECTORY"
output_csv = "slide_metadata_log.csv"

counts = Counter()  # per-status tallies filled by check_and_log_svs_metadata
count = 0           # number of files that passed validation

with open(output_csv, mode='w', newline='') as log_file:
    writer = csv.writer(log_file)
    writer.writerow(["Filename", "Size (MB)", "Metadata Status", "Objective Power", "MPP X", "MPP Y", "Label", "renamed"])

    # Sorted so the log has the same row order on every run
    # (os.listdir returns entries in arbitrary order).
    for subdir in sorted(os.listdir(wsi_root)):
        full_subdir_path = os.path.join(wsi_root, subdir)
        if not os.path.isdir(full_subdir_path):
            continue

        # Case-insensitive match, consistent with is_valid_svs_file, so files
        # named '*.SVS' are not silently skipped.
        svs_files = sorted(
            f for f in os.listdir(full_subdir_path) if f.lower().endswith('.svs')
        )
        if not svs_files:
            print(f"❌ No .svs file found in: {full_subdir_path}")
            continue

        for svs_file in svs_files:
            svs_path = os.path.join(full_subdir_path, svs_file)

            bool_valid, label = is_valid_svs_file(svs_file, "PatientsSheet.xlsx", columnNumb=1)
            if not bool_valid:
                print("\n")
                continue

            file_size_mb = os.path.getsize(svs_path) / (1024 * 1024)
            print(f"{svs_file}: {file_size_mb:.2f} MB")

            check_and_log_svs_metadata(svs_path, label, writer, counts)
            count += 1

            print("\n")

# -------------------- SUMMARY --------------------
print("\n=== SUMMARY ===")
print(f"Total valid SVS files processed: {count}")
for k, v in counts.items():
    print(f"{k}: {v}")


## Filtering Files

In [None]:
import csv

# Copy the metadata log to a filtered CSV, dropping rows that have an
# unparseable size, no magnification info, or a size below MIN_SIZE_MB.
input_csv = "slide_metadata_log.csv"
output_csv = "slide_metadata_filtered.csv"

# Rows whose "Size (MB)" value is below this threshold are dropped.
MIN_SIZE_MB = 50

with open(input_csv, newline='') as infile, open(output_csv, mode='w', newline='') as outfile:
    reader = csv.DictReader(infile)

    # An empty input has no header row; fail with a clear message instead of
    # an opaque TypeError from writeheader().
    if reader.fieldnames is None:
        raise ValueError(f"Input CSV '{input_csv}' is empty: no header row found")

    writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
    writer.writeheader()

    for row in reader:
        try:
            size_mb = float(row["Size (MB)"])
        except (TypeError, ValueError):
            # ValueError: non-numeric size such as "N/A" (slides that failed to open).
            # TypeError: short row, where DictReader fills the field with None.
            continue

        if row["Metadata Status"] == "Missing Magnification Info":
            continue
        if size_mb < MIN_SIZE_MB:
            continue

        writer.writerow(row)

print(f"✅ Filtered CSV saved as: {output_csv}")