In [None]:
# requirements
# pip install pandas numpy pillow requests scikit-learn

In [10]:
import os
import json
import pandas as pd
import numpy as np
from PIL import Image
from io import BytesIO
import requests
from sklearn.model_selection import train_test_split
import os
import json
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm

In [None]:
DATA_DIR = '/Users/shameekphukan/Documents/LSE/ST456/2025-projects-the-gradient-descendants/data'
FASHION_JSON_PATH = os.path.join(DATA_DIR, 'fashion.json')

def convert_to_url(signature):
    '''
    Converts a Pinterest image signature to a URL.
    The signature is a 6-character string that represents the image.
    The URL format is:
    http://i.pinimg.com/400x/{first_two_chars}/{third_fourth_chars}/{fifth_sixth_chars}/{signature}.jpg
    For example, if the signature is 'abcdef', the URL would be:
    http://i.pinimg.com/400x/ab/cd/ef/abcdef.jpg
    
    param:
        signature: str, 6-character string
    return: 
        str, URL of the image
    '''
    return f"http://i.pinimg.com/400x/{signature[0:2]}/{signature[2:4]}/{signature[4:6]}/{signature}.jpg"

def generate_metadata(fashion_list):
    '''
    Generates metadata from the fashion list.
    The metadata includes:
    - product_id: ID of the product
    - scene_id: ID of the scene
    - bbox: bounding box of the product in the scene
    - category: category of the product
    - scene_url: URL of the scene image
    - product_url: URL of the product image
    The metadata is saved as a CSV file.
    param:
        fashion_list: list of dictionaries, each dictionary contains the metadata
    return:
        df: pandas DataFrame, metadata
    '''
    records = []
    for item in tqdm(fashion_list, desc="Generating metadata"):
        row = {
            'product_id': item['product'],
            'scene_id': item['scene'],
            'bbox': item['bbox'],
            'category': item.get('category', 'unknown'),
            'scene_url': convert_to_url(item['scene']),
            'product_url': convert_to_url(item['product'])
        }
        records.append(row)

    df = pd.DataFrame(records)
    df.to_csv(os.path.join(DATA_DIR, 'meta_data.csv'), index=False)
    print("✅ Metadata saved.")
    return df

def is_image_accessible(url):
    '''
    Checks if an image is accessible by sending a GET request to the URL
    '''
    try:
        response = requests.get(url, timeout=3)
        if response.status_code == 200:
            Image.open(BytesIO(response.content))
            return True
    except:
        return False
    return False

def filter_valid_entries(df):
    '''
    Filters the DataFrame to keep only valid entries.
    An entry is valid if both the scene and product images are accessible.
    The filtered DataFrame is saved as a CSV file.
    param:
        df: pandas DataFrame, metadata
    return:
        filtered_df: pandas DataFrame, filtered metadata'''
    print("🔍 Filtering invalid images...")
    valid_flags = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Validating images"):
        valid = is_image_accessible(row['scene_url']) and is_image_accessible(row['product_url'])
        valid_flags.append(valid)

    df['valid'] = valid_flags
    filtered_df = df[df['valid']]
    filtered_df.to_csv(os.path.join(DATA_DIR, 'meta_data_final.csv'), index=False)
    print(f"✅ Filtered: {len(filtered_df)} valid entries saved.")
    return filtered_df

def split_and_save(df):
    '''
    Splits the DataFrame into train, validation, and test sets.
    The splits are saved as CSV files.
    param:
        df: pandas DataFrame, metadata
    return:
        csv_files: list of str, paths to the saved CSV files
    '''
    train_df, val_df, test_df = np.split(df.sample(frac=1, random_state=42), [int(0.8 * len(df)), int(0.9 * len(df))])
    train_df.to_csv(os.path.join(DATA_DIR, 'train_data.csv'), index=False)
    val_df.to_csv(os.path.join(DATA_DIR, 'validate_data.csv'), index=False)
    test_df.to_csv(os.path.join(DATA_DIR, 'test_data.csv'), index=False)
    print("✅ Data split and saved.")

if __name__ == '__main__':
    print("📥 Reading fashion JSON...")
    with open(FASHION_JSON_PATH, 'r') as f:
        fashion_data = [json.loads(line) for line in tqdm(f, desc="Loading JSON")]

    df = generate_metadata(fashion_data)
    valid_df = filter_valid_entries(df)
    split_and_save(valid_df)


📥 Reading fashion JSON...


Loading JSON: 72198it [00:00, 250540.35it/s]
Generating metadata: 100%|██████████| 72198/72198 [00:00<00:00, 421085.04it/s]


✅ Metadata saved.
🔍 Filtering invalid images...


Validating images: 100%|██████████| 72198/72198 [2:22:21<00:00,  8.45it/s]      


✅ Filtered: 72179 valid entries saved.


  return bound(*args, **kwds)


✅ Data split and saved.


In [None]:
def download_valid_images(df, output_dir='/Users/shameekphukan/Documents/LSE/ST456/2025-projects-the-gradient-descendants/data/images'):
    '''
    download images from URLs in the DataFrame and save them locally.
    Args:
        df (pd.DataFrame): DataFrame containing URLs.
        output_dir (str): Directory to save downloaded images.
    '''
    os.makedirs(output_dir, exist_ok=True)

    all_urls = set(df['scene_url']).union(set(df['product_url']))
    url_to_path = {}

    print(f"Downloading {len(all_urls)} images...")

    for url in tqdm(all_urls, desc="Downloading images"):
        try:
            filename = url.split("/")[-1]
            local_path = os.path.join(output_dir, filename)

            if not os.path.exists(local_path):
                response = requests.get(url, timeout=5)
                with open(local_path, 'wb') as f:
                    f.write(response.content)

            url_to_path[url] = local_path

        except Exception as e:
            print(f"[ERROR] Could not download {url}: {e}")
            url_to_path[url] = None

    df['scene_path'] = df['scene_url'].map(url_to_path)
    df['product_path'] = df['product_url'].map(url_to_path)

    df = df.dropna(subset=['scene_path', 'product_path'])
    df.to_csv(os.path.join(DATA_DIR, 'meta_data_local.csv'), index=False)
    print("✅ Local image paths added and saved.")
    return df

In [13]:
valid_df = download_valid_images(valid_df, output_dir=os.path.join(DATA_DIR, 'images'))

Downloading 67530 images...


Downloading images:  73%|███████▎  | 49433/67530 [1:11:43<1228:42:47, 244.43s/it]

[ERROR] Could not download http://i.pinimg.com/400x/83/29/7b/83297ba1318a7eb5c430c6005c1169aa.jpg: HTTPConnectionPool(host='i.pinimg.com', port=80): Read timed out. (read timeout=5)


Downloading images:  83%|████████▎ | 56366/67530 [1:39:37<3:05:45,  1.00it/s]    

[ERROR] Could not download http://i.pinimg.com/400x/26/c8/aa/26c8aa4dbb66f9e32ca6c4bb6c680eab.jpg: HTTPConnectionPool(host='i.pinimg.com', port=80): Read timed out. (read timeout=5)


Downloading images: 100%|██████████| 67530/67530 [1:45:32<00:00, 10.66it/s]  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['scene_path'] = df['scene_url'].map(url_to_path)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['product_path'] = df['product_url'].map(url_to_path)


✅ Local image paths added and saved.


In [14]:
HOME_JSON_PATH = os.path.join(DATA_DIR, 'home.json')

with open(HOME_JSON_PATH, 'r') as f:
    home_data = [json.loads(line) for line in tqdm(f, desc="Loading Home")]

home_df = generate_metadata(home_data)
valid_home = filter_valid_entries(home_df)
split_and_save(valid_home)

Loading Home: 93274it [00:00, 266805.09it/s]
Generating metadata: 100%|██████████| 93274/93274 [00:00<00:00, 424894.07it/s]


✅ Metadata saved.
🔍 Filtering invalid images...


Validating images: 100%|██████████| 93274/93274 [10:36:28<00:00,  2.44it/s]     


✅ Filtered: 93179 valid entries saved.


  return bound(*args, **kwds)


✅ Data split and saved.


In [15]:
home_image_dir = os.path.join(DATA_DIR, 'images/home')
valid_home_df = download_valid_images(valid_home, output_dir=home_image_dir)

Downloading 59198 images...


Downloading images:  40%|███▉      | 23550/59198 [14:48<56:36:33,  5.72s/it]

[ERROR] Could not download http://i.pinimg.com/400x/72/7a/ec/727aec2f3091b24ed89317319bd7873b.jpg: HTTPConnectionPool(host='i.pinimg.com', port=80): Read timed out. (read timeout=5)


Downloading images:  68%|██████▊   | 40401/59198 [24:37<7:43:16,  1.48s/it] 

[ERROR] Could not download http://i.pinimg.com/400x/61/a1/ac/61a1acf1e3dda3bd594ecd0d4d7edc4b.jpg: HTTPConnectionPool(host='i.pinimg.com', port=80): Read timed out. (read timeout=5)


Downloading images: 100%|██████████| 59198/59198 [35:25<00:00, 27.86it/s]  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['scene_path'] = df['scene_url'].map(url_to_path)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['product_path'] = df['product_url'].map(url_to_path)


✅ Local image paths added and saved.


#### All downloaded images are organized as follows:

- /images/ → for fashion

- /images/home/ → for home

In [61]:
import pandas as pd
import numpy as np
import os

# Path to the combined dataset with local paths
local_csv_path = "/Users/shameekphukan/Documents/LSE/ST456/2025-projects-the-gradient-descendants/data/working_data/meta_data_local.csv"
df = pd.read_csv(local_csv_path)

# Clean separation based on 'home' in scene path
fashion_df = df[~df['scene_path'].str.contains('/home/')]

# Split and save helper
def split_and_save(df, name_prefix):
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    train, val, test = np.split(df, [int(0.8 * len(df)), int(0.9 * len(df))])

    DATA_DIR = "/Users/shameekphukan/Documents/LSE/ST456/2025-projects-the-gradient-descendants/data"
    train.to_csv(os.path.join(DATA_DIR, f"{name_prefix}_train_data.csv"), index=False)
    val.to_csv(os.path.join(DATA_DIR, f"{name_prefix}_validate_data.csv"), index=False)
    test.to_csv(os.path.join(DATA_DIR, f"{name_prefix}_test_data.csv"), index=False)

    print(f"✅ Saved: {name_prefix}_train/validate/test_data.csv")

In [62]:
local_csv_path = "/Users/shameekphukan/Documents/LSE/ST456/2025-projects-the-gradient-descendants/data/working_data/home_meta_data_local.csv"
df = pd.read_csv(local_csv_path)
home_df = df[df['scene_path'].str.contains('/home/')]

In [None]:
valid_df.to_csv(os.path.join(DATA_DIR, 'meta_data_local.csv'), index=False)

Unnamed: 0,product_id,scene_id,bbox,category,scene_url,product_url,valid,scene_path,product_path,category_v1
0,0027e30879ce3d87f82f699f148bff7e,cdab9160072dd1800038227960ff6467,"[0.434097, 0.859363, 0.560254, 1.0]",Shoes,http://i.pinimg.com/400x/cd/ab/91/cdab9160072d...,http://i.pinimg.com/400x/00/27/e3/0027e30879ce...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,foootwear
1,0027e30879ce3d87f82f699f148bff7e,14f59334af4539132981b1324a731067,"[0.175269, 0.527773, 0.621485, 0.924899]",Shoes,http://i.pinimg.com/400x/14/f5/93/14f59334af45...,http://i.pinimg.com/400x/00/27/e3/0027e30879ce...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,foootwear
2,0027e30879ce3d87f82f699f148bff7e,e7d32df9f45b691afc580808750f73ca,"[0.588666, 0.638503, 0.750647, 0.761368]",Shoes,http://i.pinimg.com/400x/e7/d3/2d/e7d32df9f45b...,http://i.pinimg.com/400x/00/27/e3/0027e30879ce...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,foootwear
3,0027e30879ce3d87f82f699f148bff7e,c0be585ed21b1a6c6dc9559ebe007ede,"[0.276699, 0.757741, 0.400485, 0.876138]",Shoes,http://i.pinimg.com/400x/c0/be/58/c0be585ed21b...,http://i.pinimg.com/400x/00/27/e3/0027e30879ce...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,foootwear
4,002a6586b8381b5efd39410657630b44,67ed2a06be8a26dc63d7a04d4e1a135f,"[0.154545, 0.144809, 0.809091, 0.784153]","Handbags, Wallets & Cases",http://i.pinimg.com/400x/67/ed/2a/67ed2a06be8a...,http://i.pinimg.com/400x/00/2a/65/002a6586b838...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,accessories
...,...,...,...,...,...,...,...,...,...,...
72193,f47eacd512acfdcfaf5f402809acb333,0a8d63baa456499a3dd2c4886385cf13,"[0.456818, 0.343636, 0.75, 0.629091]",Clothing|Shirts & Tops,http://i.pinimg.com/400x/0a/8d/63/0a8d63baa456...,http://i.pinimg.com/400x/f4/7e/ac/f47eacd512ac...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,topwear
72194,f47eacd512acfdcfaf5f402809acb333,31e3620642773bfeac3e9d1a4976205c,"[0.378297, 0.6209, 0.790087, 0.982056]",Clothing|Shirts & Tops,http://i.pinimg.com/400x/31/e3/62/31e362064277...,http://i.pinimg.com/400x/f4/7e/ac/f47eacd512ac...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,topwear
72195,f940d17f294fec78c913e9536735bea1,cc764ccff4d6e1abb3ea1ecc1503f8bf,"[0.627266, 0.456562, 0.823383, 0.654216]",Clothing Accessories|Sunglasses,http://i.pinimg.com/400x/cc/76/4c/cc764ccff4d6...,http://i.pinimg.com/400x/f9/40/d1/f940d17f294f...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,accessories
72196,fd8de790fe45ef01ebc8fe47a6e80138,a1f27a961c577230d0cea68f6e59b36a,"[0.235259, 0.824805, 0.39719, 0.974153]",Shoes,http://i.pinimg.com/400x/a1/f2/7a/a1f27a961c57...,http://i.pinimg.com/400x/fd/8d/e7/fd8de790fe45...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,foootwear


In [20]:
stl_dataset = "/Users/shameekphukan/Documents/LSE/ST456/2025-projects-the-gradient-descendants/data"
files = [os.path.join(stl_dataset, x) for x in os.listdir(stl_dataset) if x.endswith(".json")]

In [22]:
with open(files[0], "r") as f:
    fashion_category_map = json.load(f)
    
with open(files[1], "r") as f:
    home_category_map = json.load(f)

In [None]:
def categorize(x):
    if x == 'Shoes':
        return 'footwear'
    elif x in ['Clothing|Pants', 'Clothing|Shorts', 'Clothing|Skirts']:
        return 'bottomwear'
    else:
        return 'topwear'

In [23]:
fashion_category_map

{'8870b0d384138a81cfdc91f98d1dde9b': 'Apparel & Accessories|Shoes',
 '70d392390fa258ba9b5d6b83ea29857e': 'Apparel & Accessories|Shoes',
 '2c05db9ff1abbcfc4bfd50dd003aac80': 'Apparel & Accessories|Clothing|Pants',
 '2b2cc3a8b588b7282be0511568b51ee7': 'Apparel & Accessories|Clothing|Shirts & Tops',
 'a141ed588bdaeccf1f695889a5aa7085': 'Apparel & Accessories|Clothing|Shorts',
 '17577154cfadd83e63d9d496b898e46b': 'Apparel & Accessories|Clothing|Pants',
 'd78116e7035b66180c953118edaca68f': 'Apparel & Accessories|Clothing|Shirts & Tops',
 'bfeab05381703b6b29365f7d1c1e8181': 'Apparel & Accessories|Shoes',
 'affb0234f39c54df9f490be0ed6e9ffa': 'Apparel & Accessories|Shoes',
 '46179a069d2546692bb5c31201a9c7ff': 'Apparel & Accessories|Shoes',
 '96f55744b378bea6ee93a080d0eb4be8': 'Apparel & Accessories|Shoes',
 '0d9fcfcd20e8bd29cc2739538bfa18ea': 'Apparel & Accessories|Clothing|Shirts & Tops',
 '77d80bac509c05b8c339afeb7a08e7ca': 'Apparel & Accessories|Clothing|Outerwear|Coats & Jackets',
 '420192

In [43]:
def categorize(category_string):
    """
    Map fine-grained product categories into broad groups:
    topwear, bottomwear, shoes, jewellery, accessories
    """
    if "Shoes" in category_string:
        return "foootwear"
    elif any(sub in category_string for sub in ["Clothing|Pants", "Clothing|Shorts", "Clothing|Skirts"]):
        return "bottomwear"
    elif any(sub in category_string for sub in ["Jewelry", "Earrings", "Necklaces"]):
        return "jewellery"
    elif any(sub in category_string for sub in ["Sunglasses", "Handbags", "Wallets", "Clothing Accessories"]):
        return "accessories"
    else:
        return "topwear"
    


In [44]:
valid_df['category'] = valid_df['product_id'].apply(lambda x: fashion_category_map[x].replace('Apparel & Accessories|', ''))

In [45]:
valid_df

Unnamed: 0,product_id,scene_id,bbox,category,scene_url,product_url,valid,scene_path,product_path
0,0027e30879ce3d87f82f699f148bff7e,cdab9160072dd1800038227960ff6467,"[0.434097, 0.859363, 0.560254, 1.0]",Shoes,http://i.pinimg.com/400x/cd/ab/91/cdab9160072d...,http://i.pinimg.com/400x/00/27/e3/0027e30879ce...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...
1,0027e30879ce3d87f82f699f148bff7e,14f59334af4539132981b1324a731067,"[0.175269, 0.527773, 0.621485, 0.924899]",Shoes,http://i.pinimg.com/400x/14/f5/93/14f59334af45...,http://i.pinimg.com/400x/00/27/e3/0027e30879ce...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...
2,0027e30879ce3d87f82f699f148bff7e,e7d32df9f45b691afc580808750f73ca,"[0.588666, 0.638503, 0.750647, 0.761368]",Shoes,http://i.pinimg.com/400x/e7/d3/2d/e7d32df9f45b...,http://i.pinimg.com/400x/00/27/e3/0027e30879ce...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...
3,0027e30879ce3d87f82f699f148bff7e,c0be585ed21b1a6c6dc9559ebe007ede,"[0.276699, 0.757741, 0.400485, 0.876138]",Shoes,http://i.pinimg.com/400x/c0/be/58/c0be585ed21b...,http://i.pinimg.com/400x/00/27/e3/0027e30879ce...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...
4,002a6586b8381b5efd39410657630b44,67ed2a06be8a26dc63d7a04d4e1a135f,"[0.154545, 0.144809, 0.809091, 0.784153]","Handbags, Wallets & Cases",http://i.pinimg.com/400x/67/ed/2a/67ed2a06be8a...,http://i.pinimg.com/400x/00/2a/65/002a6586b838...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...
...,...,...,...,...,...,...,...,...,...
72193,f47eacd512acfdcfaf5f402809acb333,0a8d63baa456499a3dd2c4886385cf13,"[0.456818, 0.343636, 0.75, 0.629091]",Clothing|Shirts & Tops,http://i.pinimg.com/400x/0a/8d/63/0a8d63baa456...,http://i.pinimg.com/400x/f4/7e/ac/f47eacd512ac...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...
72194,f47eacd512acfdcfaf5f402809acb333,31e3620642773bfeac3e9d1a4976205c,"[0.378297, 0.6209, 0.790087, 0.982056]",Clothing|Shirts & Tops,http://i.pinimg.com/400x/31/e3/62/31e362064277...,http://i.pinimg.com/400x/f4/7e/ac/f47eacd512ac...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...
72195,f940d17f294fec78c913e9536735bea1,cc764ccff4d6e1abb3ea1ecc1503f8bf,"[0.627266, 0.456562, 0.823383, 0.654216]",Clothing Accessories|Sunglasses,http://i.pinimg.com/400x/cc/76/4c/cc764ccff4d6...,http://i.pinimg.com/400x/f9/40/d1/f940d17f294f...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...
72196,fd8de790fe45ef01ebc8fe47a6e80138,a1f27a961c577230d0cea68f6e59b36a,"[0.235259, 0.824805, 0.39719, 0.974153]",Shoes,http://i.pinimg.com/400x/a1/f2/7a/a1f27a961c57...,http://i.pinimg.com/400x/fd/8d/e7/fd8de790fe45...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...


In [46]:
valid_df['category_v1'] = valid_df['category'].apply(lambda x: categorize(x))


In [49]:
valid_df['category_v1'].value_counts()

category_v1
foootwear      22700
bottomwear     18903
topwear        16741
accessories    10895
jewellery       2934
Name: count, dtype: int64

In [50]:
valid_df['category'].value_counts()

category
Shoes                                 22700
Clothing|Pants                        14281
Clothing|Shirts & Tops                11953
Handbags, Wallets & Cases              6319
Clothing|Outerwear|Coats & Jackets     4788
Clothing Accessories|Sunglasses        4576
Clothing|Shorts                        2750
Clothing|Skirts                        1872
Jewelry|Earrings                       1506
Jewelry|Necklaces                      1428
Name: count, dtype: int64

In [26]:
import json
from collections import Counter


categories = list(fashion_category_map.values())
category_counts = Counter(categories)

category_df = pd.DataFrame(category_counts.items(), columns=["Category", "Count"]).sort_values

In [34]:
category_counts

Counter({'Apparel & Accessories|Shoes': 11780,
         'Apparel & Accessories|Clothing|Shirts & Tops': 7506,
         'Apparel & Accessories|Clothing|Pants': 6553,
         'Apparel & Accessories|Handbags, Wallets & Cases': 3255,
         'Apparel & Accessories|Clothing|Outerwear|Coats & Jackets': 2485,
         'Apparel & Accessories|Clothing Accessories|Sunglasses': 1772,
         'Apparel & Accessories|Clothing|Shorts': 1533,
         'Apparel & Accessories|Clothing|Skirts': 1392,
         'Apparel & Accessories|Jewelry|Earrings': 926,
         'Apparel & Accessories|Jewelry|Necklaces': 909})

In [24]:
home_category_map

{'7c0fe47650954ab860cc9ead602f7a94': 'Hardware|Plumbing|Plumbing Fixtures|Faucets',
 'ed09a82ee84b24ac4be3035b252028d2': 'Home & Garden|Decor|Rugs',
 'ef79a9a8dcd167a3c9b703545966c8b4': 'Hardware|Plumbing|Plumbing Fixtures|Faucets',
 'd26d678405be58bae7ca5d973234b56c': 'Furniture|Chairs',
 '4240527929efc886a352b9a53c01b614': 'Home & Garden|Decor|Rugs',
 'ac56098e63a60675952183ac13508921': 'Furniture|Chairs|Table & Bar Stools',
 '96c30487f1e605e52d08aa2f156db092': 'Home & Garden|Lighting|Lamps',
 '4307fe25f0bcbe6f8b4b10deb7231816': 'Home & Garden|Lighting|Lighting Fixtures|Ceiling Light Fixtures',
 '7b3a68fecd9726c2d8c8383e8b9f1b31': 'Furniture|Chairs',
 'c0aa3d44c3a26dd4a7cf380790d115a3': 'Home & Garden|Decor|Rugs',
 '9bfabee4c9ff434ae3b727a39f570277': 'Furniture|Sofas',
 'a73582afa01c46a36bd80da0016b6218': 'Furniture|Chairs',
 'e48935d89c9268f8ba5fcb7b1a434930': 'Home & Garden|Decor|Throw Pillows',
 'cdc7e6da85ec6d55ef2f82c0d6041a16': 'Home & Garden|Decor|Rugs',
 '2402f5a749df126334ed

In [29]:
home_categories = list(home_category_map.values())
home_category_counts = Counter(home_categories)

home_category_df = pd.DataFrame(home_category_counts.items(), columns=["Category", "Count"]).sort_values
home_category_df

<bound method DataFrame.sort_values of                                              Category  Count
0         Hardware|Plumbing|Plumbing Fixtures|Faucets   3522
1                            Home & Garden|Decor|Rugs   6638
2                                    Furniture|Chairs   5796
3                 Furniture|Chairs|Table & Bar Stools   2413
4                        Home & Garden|Lighting|Lamps   2857
5   Home & Garden|Lighting|Lighting Fixtures|Ceili...   5271
6                                     Furniture|Sofas   2453
7                   Home & Garden|Decor|Throw Pillows   4548
8                                 Home & Garden|Decor   2215
9                         Home & Garden|Decor|Mirrors   3627
10  Home & Garden|Decor|Window Treatments|Curtains...   1966>

In [51]:
home_category_counts

Counter({'Home & Garden|Decor|Rugs': 6638,
         'Furniture|Chairs': 5796,
         'Home & Garden|Lighting|Lighting Fixtures|Ceiling Light Fixtures': 5271,
         'Home & Garden|Decor|Throw Pillows': 4548,
         'Home & Garden|Decor|Mirrors': 3627,
         'Hardware|Plumbing|Plumbing Fixtures|Faucets': 3522,
         'Home & Garden|Lighting|Lamps': 2857,
         'Furniture|Sofas': 2453,
         'Furniture|Chairs|Table & Bar Stools': 2413,
         'Home & Garden|Decor': 2215,
         'Home & Garden|Decor|Window Treatments|Curtains & Drapes': 1966})

In [52]:
def categorize_home(category_string):
    """
    Map fine-grained home categories into broad groups:
    furniture, lighting, decor, plumbing
    """
    if "Furniture" in category_string:
        return "furniture"
    elif "Lighting" in category_string or "Lamps" in category_string:
        return "lighting"
    elif "Decor" in category_string or "Mirrors" in category_string or "Rugs" in category_string or "Pillows" in category_string or "Curtains" in category_string:
        return "decor"
    elif "Plumbing" in category_string or "Faucets" in category_string:
        return "plumbing"
    else:
        return "other"


In [55]:
valid_home['category'] = valid_home['product_id'].apply(lambda x: home_category_map[x])
valid_home['category_v1'] = valid_home['category'].apply(lambda x: categorize_home(x))

In [58]:
valid_home['category_v1'].value_counts()

category_v1
decor        41353
furniture    26209
lighting     16414
plumbing      9203
Name: count, dtype: int64

In [59]:
valid_home

Unnamed: 0,product_id,scene_id,bbox,category,scene_url,product_url,valid,scene_path,product_path,category_v1
0,01af7c73a89bfaa01e50024fe251fdb8,3a3fc37d027d1ccfd1170f823c3b0f0e,"[0.457992, 0.423582, 0.639517, 0.599737]",Home & Garden|Decor|Mirrors,http://i.pinimg.com/400x/3a/3f/c3/3a3fc37d027d...,http://i.pinimg.com/400x/01/af/7c/01af7c73a89b...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,decor
1,01d4d2f30b36b3f6077c3f69ebb8eef8,2c4f93b348efbaa2c8c01296b996af91,"[0.122273, 0.625033, 0.636163, 0.905199]",Furniture|Sofas,http://i.pinimg.com/400x/2c/4f/93/2c4f93b348ef...,http://i.pinimg.com/400x/01/d4/d2/01d4d2f30b36...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,furniture
2,01d4d2f30b36b3f6077c3f69ebb8eef8,c5bd944bfb21e21287ed8dab85d68c33,"[0.298888, 0.357771, 0.798124, 0.660031]",Furniture|Sofas,http://i.pinimg.com/400x/c5/bd/94/c5bd944bfb21...,http://i.pinimg.com/400x/01/d4/d2/01d4d2f30b36...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,furniture
3,01d4d2f30b36b3f6077c3f69ebb8eef8,e70e8fd67c576aa00679a5dc6f018c9a,"[0.667065, 0.429842, 1.0, 0.639504]",Furniture|Sofas,http://i.pinimg.com/400x/e7/0e/8f/e70e8fd67c57...,http://i.pinimg.com/400x/01/d4/d2/01d4d2f30b36...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,furniture
4,0421ad3a3ae80cc6cebf6de3e5b38185,b92e37c20732d3f50834a24579818ad4,"[0.053267, 0.58595, 0.310944, 0.979194]",Furniture|Chairs,http://i.pinimg.com/400x/b9/2e/37/b92e37c20732...,http://i.pinimg.com/400x/04/21/ad/0421ad3a3ae8...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,furniture
...,...,...,...,...,...,...,...,...,...,...
93269,f055533bbb1a59258ba91e00d62888ed,0421b074714fde192a981b15b4da096c,"[0.5491, 0.662709, 0.911747, 1.0]",Furniture|Chairs,http://i.pinimg.com/400x/04/21/b0/0421b074714f...,http://i.pinimg.com/400x/f0/55/53/f055533bbb1a...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,furniture
93270,f16bda2d921d2c15074396f056be6ea9,8567df44289efa2073a621434008f6a9,"[0.286751, 0.309449, 0.554109, 0.50488]",Home & Garden|Decor|Mirrors,http://i.pinimg.com/400x/85/67/df/8567df44289e...,http://i.pinimg.com/400x/f1/6b/da/f16bda2d921d...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,decor
93271,f97baad25da8e4366a49ce3a91128b11,553165a07025dadbc0f6e3e10a15ce0c,"[0.630176, 0.80536, 0.849355, 0.956813]",Furniture|Chairs,http://i.pinimg.com/400x/55/31/65/553165a07025...,http://i.pinimg.com/400x/f9/7b/aa/f97baad25da8...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,furniture
93272,fd53dab374909996ed024a010490d3fa,dfd2f0e2592958f9654f2b5f0db3390d,"[0.278464, 0.704051, 0.487682, 0.99249]",Furniture|Chairs|Table & Bar Stools,http://i.pinimg.com/400x/df/d2/f0/dfd2f0e25929...,http://i.pinimg.com/400x/fd/53/da/fd53dab37490...,True,/Users/shameekphukan/Documents/LSE/ST456/2025-...,/Users/shameekphukan/Documents/LSE/ST456/2025-...,furniture


In [60]:
split_and_save(valid_df, "fashion")
split_and_save(valid_home, "home")

  return bound(*args, **kwds)


✅ Saved: fashion_train/validate/test_data.csv
✅ Saved: home_train/validate/test_data.csv


In [None]:
def update_local_paths(csv_path, new_base_path, save_to=None):
    """
    Update the local file paths in a metadata CSV to match a new base path.

    Parameters:
    - csv_path (str): Path to the metadata CSV file.
    - new_base_path (str): The new base directory on the user's system.
    - save_to (str): Optional path to save the updated CSV.

    Returns:
    - pd.DataFrame: The updated DataFrame.
    """
    df = pd.read_csv(csv_path)

    # Update scene_path and product_path if they exist
    for col in ['scene_path', 'product_path']:
        if col in df.columns:
            df[col] = df[col].apply(lambda x: os.path.join(new_base_path, os.path.relpath(x, start=os.path.commonpath([x]))))

    # Optionally save
    if save_to:
        df.to_csv(save_to, index=False)
        print(f"Updated metadata saved to: {save_to}")
    
    return df