In [4]:
import os
from pathlib import Path


def pprint_folders(directory, indent=0):
    """Print the sub-directory tree rooted at ``directory`` as an indented list.

    Only directories are listed; regular files are skipped. Sibling
    directories are visited in sorted path order so the output is identical
    on every run (``Path.iterdir`` itself yields entries in arbitrary order).

    Args:
        directory: Root folder to print, given as a ``str`` or ``Path``.
        indent: Nesting depth of ``directory``; every level adds two spaces.
            Callers normally leave this at 0 - the recursion increments it.

    Returns:
        None. The tree is written to stdout. When ``directory`` is not an
        existing directory, a message is printed instead and nothing else
        happens.
    """
    # Accept plain strings as well as Path objects
    directory = Path(directory)

    if not directory.is_dir():
        print("The provided path is not a directory.")
        return

    # Print the current directory name with indentation
    print("  " * indent + f"- {directory.name}/")

    # Sort the children: iterdir() gives no ordering guarantee
    for item in sorted(directory.iterdir()):
        if item.is_dir():
            # Recursively print the structure of subdirectories
            pprint_folders(item, indent + 1)


# Example usage: print the folder layout under the dataset root
data_dir = Path("/mnt/nfs/open_datasets/autopet2024/")
pprint_folders(directory=data_dir)

- autopet2024/
  - labelsTr/
  - imagesTr/


In [29]:
import os
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Define the directory path
data_dir = Path("/mnt/nfs/open_datasets/autopet2024/")

# One seed for both the split and the final shuffle, so re-running this cell
# reproduces exactly the same train/test frames
SEED = 42

# List the label files. os.listdir() returns names in arbitrary order, so sort
# them; otherwise the seeded split below can still change between runs/machines.
subject_list = sorted(os.listdir(data_dir / "labelsTr"))

# Create a DataFrame with the contents and set the column name to "Subjects"
# (note: despite the variable name, the entries are listed from labelsTr)
imagesTr_df = pd.DataFrame(subject_list, columns=["Subjects"])

# Subject ID = file name up to the first ".nii"; the group is decided by the
# case-insensitive file-name prefix ("fdg" or "psma")
file_names = imagesTr_df["Subjects"]
subject_ids = file_names.map(lambda name: name.split(".nii")[0])
lowered_names = file_names.str.lower()
FDG = subject_ids[lowered_names.str.startswith("fdg")]
PSMA = subject_ids[lowered_names.str.startswith("psma")]

# Files matching neither prefix are excluded from both groups; report them
# instead of dropping them silently
n_ignored = len(file_names) - len(FDG) - len(PSMA)
if n_ignored:
    print(f"Ignored {n_ignored} file(s) with neither an FDG nor a PSMA prefix")

# Print lengths
print(f"FDG: {len(FDG)}")
print(f"PSMA: {len(PSMA)}")

# Split FDG and PSMA separately (90% train, 10% test) so both groups keep the
# same train/test proportion
FDG_train, FDG_test = train_test_split(FDG, test_size=0.1, random_state=SEED)
PSMA_train, PSMA_test = train_test_split(PSMA, test_size=0.1, random_state=SEED)

# Combine train sets and test sets
train_df = pd.DataFrame({"Subjects": pd.concat([FDG_train, PSMA_train])})
test_df = pd.DataFrame({"Subjects": pd.concat([FDG_test, PSMA_test])})

# Shuffle the rows. The shuffle is seeded too: an unseeded sample() would give
# a different row order on every run.
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Display the sizes of the resulting DataFrames
print(f"Training Set: {len(train_df)}")
print(f"\nTesting Set:{len(test_df)}")


# Function to save DataFrame to CSV without overwriting
def save_csv(df, file_path):
    """Write ``df`` to ``file_path`` as CSV unless that file already exists.

    Args:
        df: DataFrame to save; it is written without its index.
        file_path: Destination, given as a ``str`` or ``Path``. Missing parent
            directories are created before writing.

    Returns:
        None. A message saying whether the file was saved or skipped is
        printed to stdout.
    """
    # Accept plain strings as well as Path objects
    file_path = Path(file_path)

    if file_path.exists():
        print(f"File already exists and will not be overwritten: {file_path}")
        return

    # to_csv() does not create directories; without this a fresh checkout
    # lacking the target folder fails with OSError
    file_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(file_path, index=False)
    print(f"File saved: {file_path}")


# Persist both splits; a CSV that is already on disk is left untouched
for split_df, csv_name in ((train_df, "data/train.csv"), (test_df, "data/test.csv")):
    save_csv(split_df, Path(csv_name))

#

1014
597
Training Set: 1449

Testing Set:162
File already exists and will not be overwritten: data/train.csv
File already exists and will not be overwritten: data/test.csv
