In [2]:
import kagglehub
import pandas as pd
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download latest version
path = kagglehub.dataset_download("polartech/nike-sportwear-product-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\16155\.cache\kagglehub\datasets\polartech\nike-sportwear-product-dataset\versions\1


In [4]:
# Load dataset file(s) into a pandas DataFrame
# Assuming there's a CSV file in the folder
for file in os.listdir(path):
    if file.endswith('.csv'):
        nike_df = pd.read_csv(os.path.join(path, file))
        print(f"Loaded {file} into DataFrame.")
        break

Loaded Nike_UK_2022-09-01.csv into DataFrame.


In [5]:
# Filter to only clothes
nike_df = nike_df[nike_df["PRODUCT_TYPE"] == "APPAREL"]
# Pull item functions from last word of item
nike_df.loc[:, "Item function"] = nike_df["TITLE"].str.split().str[-1]
nike_df = nike_df[nike_df["Item function"] == "Shirt"]
# nike_df.head()

############## Relevant cols for final DF
# # SKU, BRAND, Item function, PRICE_CURRENT, vibe (None!)
##############################

In [6]:
# Download latest version
path = kagglehub.dataset_download("joyshil0599/h-and-m-sports-apparel-data-set9k")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\16155\.cache\kagglehub\datasets\joyshil0599\h-and-m-sports-apparel-data-set9k\versions\1


In [7]:
# Load dataset file(s) into a pandas DataFrame
# Assuming there's a CSV file in the folder
for file in os.listdir(path):
    if file.endswith('.csv'):
        hm_df = pd.read_csv(os.path.join(path, file))
        print(f"Loaded {file} into DataFrame.")
        break

Loaded Sports_H_and_M.csv into DataFrame.


In [8]:
# Pull item functions from last word of item
hm_df.loc[:, "Item function"] = hm_df["Name_of_product"].str.split().str[-1]
# hm_df.head()

############## Relevant cols for final DF
# # SKU (None!), brand_name, Item function, price_of_product(in dollar), vibe (None!)
##############################

In [9]:
# Download latest version
path = kagglehub.dataset_download("whenamancodes/adidas-us-retail-products-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\16155\.cache\kagglehub\datasets\whenamancodes\adidas-us-retail-products-dataset\versions\1


In [10]:
# Load dataset file(s) into a pandas DataFrame
# Assuming there's a CSV file in the folder
for file in os.listdir(path):
    if file.endswith('.csv'):
        adidas_df = pd.read_csv(os.path.join(path, file))
        print(f"Loaded {file} into DataFrame.")
        break

Loaded adidas.csv into DataFrame.


In [11]:
# Pull item functions from last word of item
adidas_df.loc[:, "Item function"] = adidas_df["name"].str.split().str[-1]
# adidas_df.head()

############## Relevant cols for final DF
# # sku, brand, Item function, selling_price, vibe (None!)
##############################

In [12]:
# Step 1: Map columns in each DataFrame to a common column name
nike_df.rename(columns={'SKU': 'sku', 'BRAND': 'brand', 'Item function': 'function', 'PRICE_CURRENT': 'price'}, inplace=True)
hm_df.rename(columns={'brand_name': 'brand', 'Item function': 'function', 'price_of_product(in dollar)': 'price'}, inplace=True)
adidas_df.rename(columns={'sku': 'sku', 'brand': 'brand', 'function': 'Item function', 'selling_price': 'price'}, inplace=True)

# Step 2: Ensure each DataFrame has all the necessary columns ('sku', 'brand', 'Item function', 'price')
columns_of_interest = ['sku', 'brand', 'function', 'price']

# Manually add missing columns with None if they do not exist in the DataFrame
for df in [nike_df, hm_df, adidas_df]:
    for column in columns_of_interest:
        if column not in df.columns:
            df[column] = None

# Step 3: Select only the relevant columns and concatenate the DataFrames
df1 = nike_df[columns_of_interest]
df2 = hm_df[columns_of_interest]
df3 = adidas_df[columns_of_interest]

apparel_data = pd.concat([df2, df3], ignore_index=True)

# Step 4: Store the final DataFrame as a CSV with index
apparel_data.to_csv('final_merged_data.csv', index=False)
apparel_data['descriptive_tokens'] = None

# Preview the final DataFrame
apparel_data.head()

Unnamed: 0,sku,brand,function,price,descriptive_tokens
0,,H&M,Shorts,12.99,
1,,H&M,Joggers,39.99,
2,,H&M,Joggers,39.99,
3,,H&M,Hoodie,64.99,
4,,H&M,Shorts,12.99,


In [13]:
# # Step 4: Store the final DataFrame as a CSV with index
# apparel_data.to_csv('apparel_data.csv', index=False)

# # Preview the final DataFrame
# print(apparel_data)

In [14]:
apparel_data["function"].unique()

array(['Shorts', 'Joggers', 'Hoodie', 'Jacket', 'Vest', 'Shirt', 'Pants',
       'Parka', 'Windbreaker', 'Belt', 'Gaiters', 'DryMove', None],
      dtype=object)

In [15]:
combos = [['Jacket', 'Shirt', 'Pants'],
          ['Jacket', 'Shirt', 'Shorts'],
          ['Shirt', 'Joggers']]

In [25]:
import random
from collections import defaultdict
from tqdm import tqdm 

In [23]:
# Use mini dataset
feature_data = apparel_data[apparel_data["function"].isin(["Shorts", "Joggers", "Tank", "Hoodie", "Jacket", "Pants", "Shirt"])]
feature_data = feature_data.reset_index()

In [26]:
# Precompute indices for each item in 'function' column
function_to_indices = {func: feature_data[feature_data['function'] == func].index.tolist() 
                       for func in feature_data['function'].unique()}

# Initialize the combo_counts hashmap (use defaultdict to simplify counting)
combo_counts = defaultdict(int)

# Iterate over the combos and generate the list of indexes
num_repeats = 10000000
for _ in tqdm(range(num_repeats)):
    for combo in combos:
        combo_indexes = []
        for item in combo:
            # Get the precomputed indices for the current item
            indices = function_to_indices.get(item, [])
            if indices:
                selected_index = random.choice(indices)
                combo_indexes.append(selected_index)
            else:
                combo_indexes.append(None)  # If no item is found, add None
        
        # Convert the combo indexes to a tuple and update the count
        combo_tuple = tuple(combo_indexes)
        combo_counts[combo_tuple] += 1

# Output the hashmap with combo counts
print(len(combo_counts))


100%|██████████| 10000000/10000000 [01:14<00:00, 134856.95it/s]

22749310





In [27]:
buckets = [x for x in combo_counts.keys()]

In [28]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.sparse import lil_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from scipy.sparse import coo_matrix

In [29]:
# Step 1: Initialize lists to hold the row indices, column indices, and values
rows = []
cols = []
values = []

# Step 2: Build the co-occurrence matrix from the "buckets" data
for bucket in tqdm(buckets, desc="Building Co-Occurrence Matrix"):
    # Iterate over each pair of items in the bucket
    for i in range(len(bucket)):
        for j in range(i + 1, len(bucket)):
            item_i, item_j = bucket[i], bucket[j]
            # Append the indices and value (1 for co-occurrence) to the lists
            rows.append(item_i)
            cols.append(item_j)
            values.append(1)  # Co-occurrence happens once for each pair
            
            rows.append(item_j)
            cols.append(item_i)
            values.append(1)  # Symmetric co-occurrence

# Step 3: Create the sparse co-occurrence matrix using COO format
num_items = len(feature_data)  # Number of items in the dataset
co_occurrence_matrix = coo_matrix((values, (rows, cols)), shape=(num_items, num_items), dtype=np.float32)

# Output the matrix shape and some statistics (optional)
print(f"Co-occurrence matrix shape: {co_occurrence_matrix.shape}")
print(f"Non-zero entries: {co_occurrence_matrix.nnz}")


Building Co-Occurrence Matrix: 100%|██████████| 22749310/22749310 [00:48<00:00, 466890.78it/s]


Co-occurrence matrix shape: (6994, 6994)
Non-zero entries: 125285048


In [81]:
num_items

6994

In [30]:
# Step 1: Build the co-occurrence matrix (could be sparse)
# Assuming you already have a sparse matrix `co_occurrence_matrix`
svd = TruncatedSVD(n_components=50)  # Reduce to 50 dimensions
reduced_matrix = svd.fit_transform(co_occurrence_matrix)

# Step 2: Apply K-means clustering in reduced space
kmeans = KMeans(n_clusters=20)  # Set number of clusters
kmeans.fit(reduced_matrix)

# Step 3: Get the cluster labels
labels = kmeans.labels_

k_menas_labels = kmeans.fit_predict(reduced_matrix)


In [31]:
# Make an example closet
closet = set()
for i, (key, val) in enumerate(combo_counts.items()):
    if i == 50: 
        break
    
    for item in key:
        closet.add(k_menas_labels[item])
    
closet

{np.int32(0),
 np.int32(1),
 np.int32(2),
 np.int32(3),
 np.int32(4),
 np.int32(6),
 np.int32(7),
 np.int32(8),
 np.int32(9),
 np.int32(10),
 np.int32(11),
 np.int32(12),
 np.int32(13),
 np.int32(14),
 np.int32(15),
 np.int32(16),
 np.int32(17),
 np.int32(18),
 np.int32(19)}

In [32]:
# Calculate the number of fits the current closet can make
num_fits_init = 0
init_fits = []
for key, val in combo_counts.items():
    if set(key).issubset(closet):
        num_fits_init += 1
        init_fits.append(key)

print(num_fits_init)

num_fits_new = 0
closet_plus = closet.copy()
closet_plus.add(40)
for key, val in combo_counts.items():
    if set(key).issubset(closet_plus):
        num_fits_new += 1

print(num_fits_new)

25
28


In [65]:
# Step 1: Build inverted index: item → set of combo keys (tuples)
item_to_combos = defaultdict(set)
for combo in combo_counts:
    for item in combo:
        item_to_combos[item].add(combo)

random_integers = random.sample(range(len(feature_data)), 500)
samples_scores = []

for r in random_integers:
    curr_closet = closet.copy()
    curr_closet.add(r)

    # Step 2: Get candidate combos (those that share any item in closet)
    candidate_combos = set()
    for item in curr_closet:
        candidate_combos.update(item_to_combos.get(item, []))
    
    # Step 3: Check only those combos
    curr_score = 0
    for combo in candidate_combos:
        if set(combo).issubset(curr_closet):
            curr_score += combo_counts[combo]
    
    samples_scores.append(curr_score)

In [76]:
def calculate_percentile(data, quantity):
    sorted_data = np.sort(data)  # Sort the data
    num_values = len(sorted_data)
    
    # Count how many values are <= the quantity
    count = np.sum(sorted_data <= quantity)
    
    # Calculate the percentile rank
    percentile_rank = (count / num_values) * 100
    
    return percentile_rank

calculate_percentile(samples_scores, 30)

np.float64(82.8)

In [80]:
# Convert to NumPy array for sorting
arr_np = np.array(samples_scores)

# Get sorted indices by descending value
sorted_indices = np.argsort(-arr_np)

# Get sorted array using the indices
sorted_arr = arr_np[sorted_indices]

print("Sorted array:", sorted_arr.tolist())
print("Original indices:", sorted_indices.tolist())

Sorted array: [31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 2

In [None]:
## Find similar items -- Hard maybe