# SKU Generator Beta Testing Playground

Read an Apple Numbers file with multiple sheets into pandas DataFrames, one DataFrame per table.

In [184]:
import pandas as pd
from numbers_parser import Document

In [185]:
# Open the Numbers workbook; each table found in any sheet becomes a
# DataFrame stored under its table name (spaces turned into underscores).
doc = Document("CHARTS/catalog.numbers")
sheets = doc.sheets
tables = {}

for sheet in sheets:
    for table in sheet.tables:
        table_name = table.name.replace(" ", "_")
        data = table.rows(values_only=True)
        # The first row supplies the column labels, the remaining rows the records
        header, body = data[0], data[1:]
        tables[table_name] = pd.DataFrame(body, columns=header)

# Preview the first rows of every table that was loaded
for name, df in tables.items():
    print(f"\n{name}:")
    print(df.head())

# Product_Catalog only: column names use underscores instead of spaces
catalog_columns = tables['Product_Catalog'].columns
tables['Product_Catalog'].columns = catalog_columns.str.replace(' ', '_')


Product_Catalog:
   Index Main Category           Sub Category                       Name  \
0    0.0         Shirt              Tee Shirt         Bandit Banquet Tee   
1    1.0         Shirt              Tee Shirt    Bandit Truck Wreath Tee   
2    2.0         Shirt              Tee Shirt      Bandit Truck Logo Tee   
3    3.0         Shirt              Tee Shirt              Legendary Tee   
4    4.0         Shirt  Long Sleeve Tee Shirt  Legendary Long Sleeve Tee   

                       Size     Fit                     Color  \
0  S,M,L,XL,2XL,3XL,4XL,5XL  Unisex                    Yellow   
1  S,M,L,XL,2XL,3XL,4XL,5XL  Unisex                     Black   
2  S,M,L,XL,2XL,3XL,4XL,5XL  Unisex                     Black   
3  S,M,L,XL,2XL,3XL,4XL,5XL  Unisex  White, Blue, Gray, Black   
4  S,M,L,XL,2XL,3XL,4XL,5XL  Unisex                      Blue   

                                    Design Material           Scent  
0                           Bandit Banquet   Cotton  Not Applica

# Prepare Data for parsing

In [186]:
# Code sheets: every table name after the first two entries of `tables`
code_sheet_list = list(tables)[2:]

# Catalog features: all Product_Catalog columns except the first one ...
catalog_feature_list = list(tables['Product_Catalog'].columns[1:])
# ... and without "Name", when that column is present
if "Name" in catalog_feature_list:
    catalog_feature_list.remove("Name")

# Sanity check: both lists must contain exactly the same set of names
if set(code_sheet_list) != set(catalog_feature_list):
    raise AssertionError("Sheet names and catalog features DO NOT match!")
print("Sheet names and catalog features MATCH exactly!")

# Side-by-side table of sheet names and catalog features, for visual reference
matching_columns = {
    'Sheet_Names': code_sheet_list,
    'Catalog_Features': catalog_feature_list
}
matching_df = pd.DataFrame(matching_columns)
matching_df



Sheet names and catalog features MATCH exactly!


Unnamed: 0,Sheet_Names,Catalog_Features
0,Main_Category,Main_Category
1,Sub_Category,Sub_Category
2,Size,Size
3,Fit,Fit
4,Color,Color
5,Design,Design
6,Material,Material
7,Scent,Scent


## Permutation logic
Generate a dictionary for every possible permutation of a given product model, recording each feature's value name, prefix, and index.

In [187]:
from itertools import product
import re

# Size abbreviation -> full size name (the spelling that is looked up in the Size code sheet)
_SIZE_NAMES = {
    'CS': 'Child Small',
    'CM': 'Child Medium',
    'CL': 'Child Large',
    'XS': 'Extra Small',
    'S': 'Small',
    'M': 'Medium',
    'L': 'Large',
    'XL': 'Extra Large',
    '2XL': 'Double Extra Large',
    '3XL': 'Triple Extra Large',
    '4XL': 'Quadruple Extra Large',
    '5XL': 'Quintuple Extra Large',
    'S/M': 'Small to Medium',
    'L/XL': 'Large to Extra Large',
    'NA': 'Not Applicable'
}


def _parse_feature_cell(feature, cell):
    """Classify one catalog cell and split it into its individual values.

    Returns a ``(kind, values)`` tuple where ``kind`` is 'OR' (comma-separated
    alternatives), 'AND' (semicolon-separated values that appear together) or
    'SINGLE'. The comma is checked first, so a cell containing both
    separators is treated as 'OR'.
    """
    if not isinstance(cell, str):
        # NaN/None become an empty value; any other non-string (e.g. a number) is kept as is
        return 'SINGLE', ([''] if pd.isna(cell) else [cell])

    if ',' in cell:
        kind, raw_values = 'OR', cell.split(',')
    elif ';' in cell:
        kind, raw_values = 'AND', cell.split(';')
    else:
        kind, raw_values = 'SINGLE', [cell]

    # Every value is stripped -- single values included -- so stray whitespace
    # in the catalog cannot break the exact-match lookup in the code sheet.
    values = [value.strip() for value in raw_values]

    if feature == 'Size':
        # Sizes are written as abbreviations in the catalog; unknown ones pass through unchanged
        values = [_SIZE_NAMES.get(value, value) for value in values]
    return kind, values


def _lookup_value_info(code_tables, feature, value_name):
    """Return ``[value_name, prefix, index]`` for one value of ``feature``.

    The value is matched against the "Name" column of the feature's code
    sheet; the row label of the first match is the index. An unmatched value
    yields an empty prefix and index, and an empty Prefix cell yields ''.
    """
    code_sheet = code_tables[feature]
    matching_rows = code_sheet.loc[code_sheet["Name"] == value_name]
    if matching_rows.empty:
        return [value_name, "", ""]

    code_index = matching_rows.index[0]
    prefix = code_sheet.at[code_index, "Prefix"]
    # An empty Prefix cell arrives as None/NaN; normalise it so later string operations are safe
    prefix = "" if pd.isna(prefix) else str(prefix)
    return [value_name, prefix, str(code_index)]


def _is_not_applicable(value_info):
    """Return True for a 'Not Applicable' entry (index '0' or prefix ending in 'NAN')."""
    return value_info[2] == '0' or value_info[1].endswith('NAN')


def _simplify_and_values(value_names):
    """Merge several value names by factoring out their shared leading words.

    E.g. ['Bandit Truck Icon', 'Bandit Truck Wordmark'] -> 'Bandit Truck Icon AND Wordmark'.
    The comparison is word-based so a name is never cut in the middle of a
    word (['Black', 'Blue'] stays 'Black AND Blue'). Without shared leading
    words, or when nothing is left after removing them, the full names are
    joined with ' AND '.
    """
    if len(value_names) == 1:
        return value_names[0]

    token_lists = [name.split() for name in value_names]

    # Collect leading words that are identical in every name
    shared_words = []
    for words_at_position in zip(*token_lists):
        if any(word != words_at_position[0] for word in words_at_position):
            break
        shared_words.append(words_at_position[0])

    shared_count = len(shared_words)
    unique_parts = [
        ' '.join(tokens[shared_count:]) for tokens in token_lists if tokens[shared_count:]
    ]

    if shared_words and unique_parts:
        return f"{' '.join(shared_words)} {' AND '.join(unique_parts)}"
    return ' AND '.join(value_names)


def generate_permutations(row, feature_list=None, code_tables=None):
    """
    For a given product model row, generate all possible permutations of feature combinations.

    Parameters:
    - row: mapping-like catalog row (e.g. a pandas Series) holding one cell per
      feature plus a 'Name' entry. Commas separate alternatives (OR), semicolons
      separate values that appear together (AND).
    - feature_list: feature names to read from the row; defaults to the
      notebook-level ``catalog_feature_list``.
    - code_tables: dict mapping each feature name to its code sheet DataFrame
      (columns "Name" and "Prefix"); defaults to the notebook-level ``tables``.

    Returns a list of dictionaries, where each dictionary represents one SKU permutation.
    Each feature value in the dict is [value_name, prefix, index] (as strings), and
    the 'Name' key holds the product name.
    For AND cases (semicolon), indexes are concatenated and value names are simplified.
    Features with index '0' or prefix ending in 'NAN' are excluded (not applicable values).

    Key order in every dictionary is: single-valued features, then OR features,
    then AND features, then 'Name'.
    """
    if feature_list is None:
        feature_list = catalog_feature_list
    if code_tables is None:
        code_tables = tables

    # Classify every feature cell of this product model
    feature_options = {
        feature: _parse_feature_cell(feature, row[feature]) for feature in feature_list
    }

    def values_of_kind(wanted_kind):
        return {
            feature: values
            for feature, (kind, values) in feature_options.items()
            if kind == wanted_kind
        }

    # Single features do not multiply permutations, OR features do; both take
    # part in the cartesian product. Singles come first to keep the key order.
    product_features = {**values_of_kind('SINGLE'), **values_of_kind('OR')}
    feature_names = list(product_features)

    # Resolve each distinct value once instead of once per combination
    info_lists = [
        [_lookup_value_info(code_tables, feature, value) for value in product_features[feature]]
        for feature in feature_names
    ]

    # AND features are identical in every permutation, so resolve them once as well
    and_entries = {}
    for feature, values in values_of_kind('AND').items():
        value_infos = [_lookup_value_info(code_tables, feature, value) for value in values]

        # One "Not Applicable" member drops the whole AND feature
        if any(_is_not_applicable(info) for info in value_infos):
            continue

        simplified_name = _simplify_and_values([info[0] for info in value_infos])
        # Use first prefix (assuming they're all the same)
        prefix = value_infos[0][1] if value_infos else ""
        # Concatenate indexes (no padding)
        concatenated_index = "".join(info[2] for info in value_infos)
        and_entries[feature] = [simplified_name, prefix, concatenated_index]

    permutations = []
    for combo in product(*info_lists):
        # Copy each info list so permutations never share mutable state
        perm = {
            feature: list(value_info)
            for feature, value_info in zip(feature_names, combo)
            if not _is_not_applicable(value_info)
        }
        for feature, entry in and_entries.items():
            perm[feature] = list(entry)

        # Add the product name
        perm['Name'] = row['Name']
        permutations.append(perm)

    return permutations


In [188]:

# Test permutation generation with arbitrary row
# (the row at position 2 of the Product_Catalog table)
test_row = tables['Product_Catalog'].iloc[2]
permutations = generate_permutations(test_row)

# Report the product name and how many permutation dictionaries were produced
print(f"Product: {test_row['Name']}")
print(f"Number of permutations: {len(permutations)}\n")


Product: Bandit Truck Logo Tee
Number of permutations: 8



## SKU number generation
Generate unique SKUs based on the feature prefixes and indices. Use a millisecond-precision timestamp as a contingency in case of duplicate SKUs.

In [189]:
# generate a unique sku based on prefixes and indexes
from datetime import datetime
import base36

def generate_sku(dict, add_timestamp=False):
    """
    Generate a SKU from a permutation dictionary.

    The SKU is a '-'-joined list of segments. The first segment is the
    Main_Category prefix followed by the Sub_Category prefix and index; every
    other feature contributes one segment (prefix + index) in dict order.

    Parameters:
    - dict: permutation dictionary with feature info. Each feature maps to
      [value_name, prefix, index]; 'Name' maps to the product name. Must
      contain 'Name', 'Main_Category' and 'Sub_Category' (KeyError otherwise).
    - add_timestamp: if True, appends a compact timestamp suffix: the last six
      characters of the base-36 encoded Unix time in milliseconds, upper-cased.
      Two SKUs generated within the same millisecond get the same suffix.

    Returns:
    - (model_name, sku) tuple. The SKU is also printed.
    """
    # The parameter name shadows the builtin `dict`; it is kept so existing
    # keyword callers keep working, and aliased to a clearer local name.
    perm = dict
    model_name = perm['Name']

    # Leading segment: main prefix, then sub prefix and sub index
    main_category = perm['Main_Category']
    sub_category = perm['Sub_Category']
    part_list = [main_category[1] + sub_category[1] + sub_category[2]]

    # Remaining features in dict order. Keys are skipped by name rather than
    # by position, so the category keys need not be the first two entries
    # (a multi-valued category is placed after the single-valued features).
    leading_keys = ('Main_Category', 'Sub_Category', 'Name')
    part_list.extend(
        info[1] + info[2] for key, info in perm.items() if key not in leading_keys
    )

    # Base SKU
    sku = "-".join(part_list)

    # Add timestamp suffix if requested
    if add_timestamp:
        unix_ms = int(datetime.now().timestamp() * 1000)
        compact_time = base36.dumps(unix_ms)[-6:].upper()
        sku = f"{sku}-{compact_time}"

    print(f"Sku generated for {model_name} variant: {sku}")

    return (model_name, sku)

In [190]:
# test on an arbitrary permutation
# (the second positional argument is add_timestamp, so a timestamp suffix is appended)
model, sku = generate_sku(permutations[0],True)

# show the length of the generated SKU as the cell output
len(sku)

Sku generated for Bandit Truck Logo Tee variant: MAUS1-FT1-CL2-MAT1-SZ5-D45-8G0CC5


33

# SKU number persistence
Generate SKUs and write them to a CSV file along with the other product details.
Prevent duplicate SKUs by checking the existing CSV; when a duplicate is found, regenerate the SKU with a timestamp-based unique suffix.

In [197]:
import os

def initialize_output_csv(output_path='Results/catalog_skus.csv', catalog_columns=None):
    """
    Create a CSV template with the same structure as Product_Catalog if it doesn't exist.
    If the file already exists, do nothing.
    Adds a leading 'SKU' column intended for unique identification; it is
    written as an ordinary column (no DataFrame index is saved).

    Parameters:
    - output_path: path where the CSV should be created. Missing parent
      directories are created; a bare filename is written to the current directory.
    - catalog_columns: column names to place after 'SKU'. Defaults to the
      columns of the notebook-level ``tables['Product_Catalog']``.

    Returns:
    - True if file was created, False if it already existed
    """

    # Check if file already exists
    if os.path.exists(output_path):
        print(f"CSV already exists at: {output_path}")
        return False

    # Get the column names from Product_Catalog (unless supplied) and add SKU column
    if catalog_columns is None:
        catalog_columns = tables['Product_Catalog'].columns.tolist()
    template_columns = ['SKU'] + list(catalog_columns)

    # Create an empty DataFrame with these columns
    template_df = pd.DataFrame(columns=template_columns)

    # Ensure the directory exists. dirname is '' for a bare filename, and
    # os.makedirs('') raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the header-only CSV; index=False keeps the row index out of the file
    template_df.to_csv(output_path, index=False)

    print(f"Created new CSV template with columns: {template_columns}")
    print(f"Saved to: {output_path}")

    return True

# Test the function with its default output path; the cell output is True
# when the template was created and False when the CSV already existed
initialize_output_csv()


CSV already exists at: Results/catalog_skus.csv


False

In [198]:
# Check the CSV structure
# (nothing is printed when the file does not exist)
if os.path.exists('Results/catalog_skus.csv'):
    # Read the file back and show its column names and first rows
    test_df = pd.read_csv('Results/catalog_skus.csv')
    print(f"CSV columns: {test_df.columns.tolist()}")
    print(f"\nFirst few rows:")
    print(test_df.head())


CSV columns: ['SKU', 'Index', 'Main_Category', 'Sub_Category', 'Name', 'Size', 'Fit', 'Color', 'Design', 'Material', 'Scent']

First few rows:
Empty DataFrame
Columns: [SKU, Index, Main_Category, Sub_Category, Name, Size, Fit, Color, Design, Material, Scent]
Index: []
