In [2]:
import os
import glob
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Build train/val manifest CSVs for the dataset under `root_dir`.
#
# Each immediate subdirectory of `root_dir` is treated as one sample and must
# contain rgb.jpg, pc.npy, bbox3d.npy and mask.npy. Complete samples are
# listed (one row of file paths per sample), split 80/20 with a fixed seed,
# and written to <root_dir>/train.csv and <root_dir>/val.csv.

# Root directory
root_dir = "/home/as2114/code/3DBB/data/dl_challenge"

# Collect all subdirectories.
# glob returns matches in arbitrary (filesystem-dependent) order; sorting
# gives a stable row order so that random_state=42 below always produces the
# same train/val partition, regardless of machine or filesystem.
sample_dirs = sorted(
    d for d in glob.glob(os.path.join(root_dir, "*")) if os.path.isdir(d)
)

data = []
for sd in tqdm(sample_dirs, desc="Processing samples"):
    rgb = os.path.join(sd, "rgb.jpg")
    pc = os.path.join(sd, "pc.npy")
    bbox3d = os.path.join(sd, "bbox3d.npy")
    mask = os.path.join(sd, "mask.npy")

    # Ensure all files exist. isfile (rather than exists) also rejects a
    # directory that happens to carry one of the expected names.
    if all(os.path.isfile(f) for f in [rgb, pc, bbox3d, mask]):
        data.append({
            "image": rgb,
            "pc": pc,
            "bbox3d": bbox3d,
            "mask": mask
        })
    else:
        # tqdm.write keeps the message from corrupting the progress bar.
        tqdm.write(f"⚠️ Skipping {sd}, missing one or more files")

# train_test_split cannot produce a non-empty train set from fewer than two
# rows; fail early with an actionable message instead of its generic error.
if len(data) < 2:
    raise ValueError(
        f"Found {len(data)} complete sample(s) under {root_dir!r}; "
        "at least 2 are required for a train/val split. "
        "Check that root_dir is correct and the samples are complete."
    )

# Convert to DataFrame
df = pd.DataFrame(data)

# 80-20 split (deterministic: fixed seed + sorted input order)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# Save CSVs
train_csv = os.path.join(root_dir, "train.csv")
val_csv = os.path.join(root_dir, "val.csv")

train_df.to_csv(train_csv, index=False)
val_df.to_csv(val_csv, index=False)

print(f"Train samples: {len(train_df)}, Val samples: {len(val_df)}")
print(f" Saved {train_csv} and {val_csv}")


Processing samples: 100%|██████████| 200/200 [00:00<00:00, 62695.13it/s]

Train samples: 160, Val samples: 40
 Saved /home/as2114/code/3DBB/data/dl_challenge/train.csv and /home/as2114/code/3DBB/data/dl_challenge/val.csv



