In [17]:
import sys
from pathlib import Path
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [18]:
# Directory layout, anchored at the parent of the notebook's working directory
project_root = Path("..").resolve()
data_dir = project_root.joinpath("data")
dataset_dir = data_dir.joinpath("fashion_dataset")
deepfashion_dir = dataset_dir.joinpath("DeepFashion2")
fashion_dataset_dir = dataset_dir.joinpath("FashionDataset")
csv_dir = deepfashion_dir.joinpath("img_info_dataframes")

# Output file for the combined label table
labels_path = data_dir.joinpath("labels.csv")

# Read the DeepFashion2 image-info tables for the train and validation splits
train_df = pd.read_csv(csv_dir.joinpath("train.csv"))
val_df = pd.read_csv(csv_dir.joinpath("validation.csv"))

In [19]:
# `csv_dir`, `train_df` and `val_df` are already defined by the previous cell,
# so the two CSVs are not read from disk a second time here.
image_root = deepfashion_dir / "deepfashion2_original_images"

# Column of the image-info tables that holds the subcategory name
subcategory_col = "category_name"

# The 7 subcategories to keep, each mapped to its top-level group.
# This mapping is the single source of truth for both the row filter and the
# group lookup; a subcategory missing from it would otherwise get a NaN group.
subcat_to_group = {
    # Dresses
    'long sleeve dress': 'Clothing',
    'short sleeve dress': 'Clothing',
    'sling dress': 'Clothing',
    'vest dress': 'Clothing',
    # Skirt
    'skirt': 'Clothing',
    # Outerwear
    'long sleeve outwear': 'Clothing',
    'short sleeve outwear': 'Clothing',
}

# Derived from the mapping so the filter and the lookup cannot drift apart
target_subcategories = set(subcat_to_group)

# Keep only the rows whose subcategory is one of the targets
filtered_train = train_df[train_df[subcategory_col].isin(target_subcategories)]
filtered_val = val_df[val_df[subcategory_col].isin(target_subcategories)]

def process_split(df, split):
    """Build the label table for one DeepFashion2 split.

    Args:
        df: Image-info rows of the split. Must contain the column named by
            ``subcategory_col`` and a ``path`` column.
        split: Split name; stored in the ``split`` column and used as the
            sub-directory below the DeepFashion2 image folder.

    Returns:
        A new DataFrame with the columns ``filename``, ``group``,
        ``subcategory``, ``split`` and ``filepath``. ``df`` is left unmodified.
    """
    def locate(name):
        # Resolved location of one image file inside this split's image folder
        return str(
            dataset_dir.joinpath(
                "DeepFashion2", "deepfashion2_original_images", split, "image", name
            ).resolve()
        )

    # assign() works on a copy and evaluates the callables in order, so
    # `filepath` can be computed from the freshly added `filename` column.
    labelled = df.assign(
        group=lambda d: d[subcategory_col].map(subcat_to_group),
        subcategory=lambda d: d[subcategory_col],
        split=split,
        filename=lambda d: d["path"].apply(lambda p: Path(p).name),
        filepath=lambda d: d["filename"].apply(locate),
    )
    return labelled[["filename", "group", "subcategory", "split", "filepath"]]

# Build one label table per split, then stack them with a fresh 0..n-1 index
split_frames = [
    process_split(frame, split_name)
    for frame, split_name in (
        (filtered_train, "train"),
        (filtered_val, "validation"),
    )
]
final_df = pd.concat(split_frames, ignore_index=True)

# Persist the table and show its size plus a few random rows
final_df.to_csv(labels_path, index=False)
print("labels.csv created:", final_df.shape)
print(final_df.sample(5))

labels.csv created: (112174, 5)
         filename     group         subcategory  split  \
38666  073252.jpg  Clothing               skirt  train   
87916  034737.jpg  Clothing               skirt  train   
51018  175592.jpg  Clothing  short sleeve dress  train   
18958  079150.jpg  Clothing               skirt  train   
76346  110583.jpg  Clothing          vest dress  train   

                                                filepath  
38666  /Users/hossein/Desktop/School/deep_learning_TD...  
87916  /Users/hossein/Desktop/School/deep_learning_TD...  
51018  /Users/hossein/Desktop/School/deep_learning_TD...  
18958  /Users/hossein/Desktop/School/deep_learning_TD...  
76346  /Users/hossein/Desktop/School/deep_learning_TD...  


### Add Bags & Shoes from the fashion dataset


In [20]:
# FashionDataset layout: a styles.csv metadata table and an images/ folder
images_dir = fashion_dataset_dir / "images"
styles_csv_path = fashion_dataset_dir / "styles.csv"

# Read `id` as text: if it were parsed numerically and any row lacked an id,
# the column would become float and the filenames built below would turn into
# "<id>.0.jpg". Malformed CSV lines are skipped instead of aborting the load.
styles_df = pd.read_csv(styles_csv_path, on_bad_lines="skip", dtype={"id": str})
styles_df = styles_df.dropna(subset=["articleType", "id"])

# articleType -> (group, subcategory) used in the combined label table
articletype_to_slowfashion = {
    "Heels": ("Shoes", "High Heels"),
    "Boots": ("Shoes", "Boots"),
    "Flats": ("Shoes", "Flats"),
    "Clutches": ("Bags", "Clutches"),
    "Handbags": ("Bags", "Shoulder Bags"),
}
# Split the pairs into two plain lookups so the columns can be mapped directly
group_by_articletype = {
    article: group for article, (group, _) in articletype_to_slowfashion.items()
}
subcategory_by_articletype = {
    article: subcategory
    for article, (_, subcategory) in articletype_to_slowfashion.items()
}

# Keep only the article types that have a mapping
filtered_styles = styles_df[
    styles_df["articleType"].isin(list(articletype_to_slowfashion))
].copy()

filtered_styles["group"] = filtered_styles["articleType"].map(group_by_articletype)
filtered_styles["subcategory"] = filtered_styles["articleType"].map(subcategory_by_articletype)
filtered_styles["filename"] = filtered_styles["id"].astype(str) + ".jpg"
filtered_styles["split"] = "fashion_extra"
# Images are looked up directly under `images_dir`
filtered_styles["filepath"] = filtered_styles["filename"].map(
    lambda name: str(images_dir / name)
)

fashion_df = filtered_styles[["filename", "group", "subcategory", "split", "filepath"]]

# Combine with DeepFashion2 clothing data
final_combined_df = pd.concat([final_df, fashion_df], ignore_index=True)
final_combined_df.to_csv(labels_path, index=False)

In [21]:
# Remove rows with missing files
from pathlib import Path

def file_exists(path, root=None):
    """Return True if ``path`` exists on disk, resolved against a base directory.

    Args:
        path: File path to check. A relative path is joined onto the base
            directory; an absolute path is used as given, because pathlib
            discards the left operand when the right one is absolute.
        root: Base directory for relative paths. Defaults to the notebook-level
            ``project_root``.

    Returns:
        bool: True when the resolved path exists. False when it does not exist,
        when ``path`` is not path-like (e.g. NaN or None coming from a
        DataFrame column), or when the path cannot be resolved.
    """
    # Looked up outside the try block on purpose: an undefined `project_root`
    # must fail loudly instead of marking every file as missing.
    base = project_root if root is None else Path(root)
    try:
        return (base / path).resolve().exists()
    except (TypeError, ValueError, OSError, RuntimeError):
        # Only path-related failures mean "not there". Unlike a bare `except`,
        # this lets KeyboardInterrupt stop a long DataFrame.apply run.
        return False

# Keep only the rows whose image file is present on disk
is_present = final_combined_df["filepath"].apply(file_exists)
final_combined_df = final_combined_df[is_present]

# Write the cleaned table to both output files
for out_path in (labels_path, data_dir / "df_encoded.csv"):
    final_combined_df.to_csv(out_path, index=False)

print("Final cleaned labels.csv and df_encoded.csv saved:", final_combined_df.shape)

Final cleaned labels.csv and df_encoded.csv saved: (116046, 5)


In [22]:
# Number of rows left after dropping entries without an image file
len(final_combined_df)

116046

In [23]:
# Rows per subcategory, most frequent first
subcategory_counts = final_combined_df["subcategory"].value_counts()
print("Class distribution:\n", subcategory_counts)

Class distribution:
 subcategory
skirt                   37357
vest dress              21301
short sleeve dress      20338
long sleeve outwear     15468
long sleeve dress        9384
sling dress              7641
Shoulder Bags            1759
High Heels               1323
short sleeve outwear      685
Flats                     500
Clutches                  290
Name: count, dtype: int64


In [None]:
# Load original full dataset
df = pd.read_csv(data_dir / "df_encoded.csv")

# Determine max per class
max_per_class = 1000

# Sample max_per_class rows per class (or all if fewer exist)
balanced_df = df.groupby("subcategory").apply(
    lambda x: x.sample(n=min(len(x), max_per_class), random_state=42)
).reset_index(drop=True)

# Save new balanced version
balanced_path = data_dir / "df_balanced.csv"
balanced_df.to_csv(balanced_path, index=False)

print(f"Balanced dataset saved: {balanced_path}")
print("Class distribution:\n", balanced_df["subcategory"].value_counts())

Balanced dataset saved: /Users/hossein/Desktop/School/deep_learning_TDIS22/hierarchical-image-classification/data/df_balanced_1000.csv
Class distribution:
 subcategory
High Heels              1000
Shoulder Bags           1000
long sleeve dress       1000
long sleeve outwear     1000
short sleeve dress      1000
skirt                   1000
sling dress             1000
vest dress              1000
short sleeve outwear     685
Flats                    500
Clutches                 290
Name: count, dtype: int64


  balanced_df = df.groupby("subcategory").apply(
