In [None]:
import pandas as pd
import random
from faker import Faker
import os
from dotenv import load_dotenv
from ibm_watsonx_ai import APIClient, Credentials
from ibm_watsonx_ai.foundation_models import Embeddings
import csv

In [None]:
# Faker instance for generating fake values.
# NOTE(review): `fake` is not referenced by any other cell visible here — confirm it is still needed.
fake = Faker()

# Cities sampled for the CITY column
ont_cities = ["Toronto", "Ottawa"]

# Attribute vocabularies sampled when building each synthetic shoe record
brands = [
    'Zentrax',
    'FootFlex',
    'StrideOne',
    'Loopic',
    'RunXpress',
]
types = ['Running', 'Walking']
classes = ['Men', 'Women']
materials = ['Synthetic', 'Knit']
colors = ['Black', 'White']
arch_supports = ['High', 'Flat']
weather_resistances = ['Waterproof', 'Resistant']

# Whole sizes 6..12 (ints) followed by half sizes 6.5..12.5 (floats)
sizes = [*range(6, 13), *(whole + 0.5 for whole in range(6, 13))]

# Store identifiers 1..20
store_ids = range(1, 21)

# Helper: build a product name of the form "<brand> <series> <shoe type>"
def create_product_name(brand, shoe_type):
    """Return the brand, a randomly picked series label and the shoe type, space-separated."""
    series_labels = ['Ultra', 'Flex', 'Pro', 'X', 'Max']
    series = random.choice(series_labels)
    return "{} {} {}".format(brand, series, shoe_type)

# Helper: build a comma-separated keyword string
def generate_keywords(shoe_type, material):
    """Return the lower-cased shoe type and material followed by three randomly sampled descriptors.

    The five keywords are joined with ", ".
    """
    leading = (shoe_type.lower(), material.lower())
    descriptor_pool = ['lightweight', 'durable', 'breathable', 'cushioned', 'supportive', 'flexible']
    extras = random.sample(descriptor_pool, k=3)
    return ', '.join((*leading, *extras))

def generate_shoe_data(n=500, seed=None):
    """Build a synthetic shoe inventory as a pandas DataFrame.

    Every row gets attributes drawn at random from the module-level option
    lists (``brands``, ``types``, ``classes``, ``materials``, ``sizes``,
    ``colors``, ``arch_supports``, ``weather_resistances``, ``store_ids``,
    ``ont_cities``), a price in [29.99, 149.99] rounded to 2 decimals, a
    rating in [3.0, 5.0] rounded to 1 decimal, and a SKU of the form
    ``<first three brand letters, upper-cased>-<number 1000..9999>`` that is
    unique within the returned frame.

    Args:
        n: Number of rows to generate. ``n <= 0`` yields an empty frame that
            still carries the full set of columns.
        seed: Optional seed. When not ``None`` the global ``random``
            generator is re-seeded so the output is reproducible; the default
            leaves the generator state untouched.

    Returns:
        pandas.DataFrame with columns SKU, PRODUCT_NAME, BRAND, CLASS, TYPE,
        MATERIAL, COLOR, WEATHER_RESISTANCE, ARCH_SUPPORT, SIZE, PRICE,
        RATING, STORE_ID, CITY.

    Raises:
        ValueError: If a brand prefix has no unused SKU numbers left (the
            original retry loop would never terminate in that situation).
    """
    columns = [
        'SKU', 'PRODUCT_NAME', 'BRAND', 'CLASS', 'TYPE', 'MATERIAL', 'COLOR',
        'WEATHER_RESISTANCE', 'ARCH_SUPPORT', 'SIZE', 'PRICE', 'RATING',
        'STORE_ID', 'CITY',
    ]

    if seed is not None:
        # Seed the shared generator so helpers using `random` are reproducible too.
        random.seed(seed)

    sku_low, sku_high = 1000, 9999
    skus_per_prefix = sku_high - sku_low + 1

    data = []
    used_skus = set()
    used_per_prefix = {}  # SKU prefix -> how many numbers are already taken

    for _ in range(n):
        brand = random.choice(brands)
        shoe_type = random.choice(types)
        shoe_class = random.choice(classes)
        material = random.choice(materials)
        size = random.choice(sizes)
        color = random.choice(colors)
        arch = random.choice(arch_supports)
        weather = random.choice(weather_resistances)
        store_id = random.choice(store_ids)
        city = random.choice(ont_cities)

        price = round(random.uniform(29.99, 149.99), 2)
        rating = round(random.uniform(3.0, 5.0), 1)
        product_name = create_product_name(brand, shoe_type)

        # Ensure SKU uniqueness; bail out instead of looping forever when
        # every number for this prefix is already in use.
        prefix = brand[:3].upper()
        if used_per_prefix.get(prefix, 0) >= skus_per_prefix:
            raise ValueError(
                f"No unused SKU numbers left for prefix '{prefix}' "
                f"(at most {skus_per_prefix} per prefix); reduce n."
            )
        while True:
            sku = f"{prefix}-{random.randint(sku_low, sku_high)}"
            if sku not in used_skus:
                used_skus.add(sku)
                used_per_prefix[prefix] = used_per_prefix.get(prefix, 0) + 1
                break

        data.append({
            'SKU': sku,
            'PRODUCT_NAME': product_name,
            'BRAND': brand,
            'CLASS': shoe_class,
            'TYPE': shoe_type,
            'MATERIAL': material,
            'COLOR': color,
            'WEATHER_RESISTANCE': weather,
            'ARCH_SUPPORT': arch,
            'SIZE': size,
            'PRICE': price,
            'RATING': rating,
            'STORE_ID': store_id,
            'CITY': city
        })

    # Passing `columns` keeps the schema stable even when no rows were generated.
    return pd.DataFrame(data, columns=columns)

# Generate the synthetic inventory (500 rows)
df_shoes = generate_shoe_data(n=500)

# Optional: persist the raw, pre-embedding dataset (disabled)
# df_shoes.to_csv("shoes.csv", index=False)
# print("Dataset saved as 'shoes.csv'")

In [None]:
# Preview the first five generated rows
df_shoes.head(5)

In [None]:
# Attribute columns whose values are folded into the text that gets embedded
embedding_cols = [
    'TYPE',
    'MATERIAL',
    'COLOR',
    'WEATHER_RESISTANCE',
    'ARCH_SUPPORT',
]

In [None]:
# Preview only the columns that feed the embedding text
df_shoes.loc[:, embedding_cols].head()

# Generating embedding vectors for the shoes

Combine all embedding columns into a single string for each row, including column names

In [None]:
def _combined_text(row):
    """Render one row as "COLUMN: value" pairs for the embedding columns, joined by ' [SEP] '."""
    labelled = []
    for col_name in embedding_cols:
        labelled.append(f"{col_name}: {row[col_name]}")
    return ' [SEP] '.join(labelled)


# Build the text that will be sent to the embedding model, one string per row
df_shoes['COMBINED'] = df_shoes.apply(_combined_text, axis=1)

In [None]:
# Show the source attributes next to the combined text built from them
cols_to_show = embedding_cols + ['COMBINED']
df_shoes[cols_to_show].head()

In [None]:
# Full combined text of the first row
df_shoes['COMBINED'].iloc[0]

Setting up the watsonx.ai (`wx.ai`) embedding API connection

In [None]:
# Read connection settings from a .env file in the current working directory.
# override=True: values from the file replace variables already set in the environment.
load_dotenv(os.path.join(os.getcwd(), ".env"), override=True)

watsonx_url = os.getenv("WATSONX_URL", "https://us-south.ml.cloud.ibm.com")
watsonx_api_key = os.getenv("WATSONX_APIKEY", "")
project_id = os.getenv("WATSONX_PROJECT", "")

# Fail fast with an actionable message instead of sending empty credentials to the service.
missing_settings = [
    name
    for name, value in (("WATSONX_APIKEY", watsonx_api_key), ("WATSONX_PROJECT", project_id))
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required setting(s): " + ", ".join(missing_settings)
        + ". Define them in the environment or in a .env file in the working directory."
    )

credentials = Credentials(url=watsonx_url, api_key=watsonx_api_key)

client = APIClient(credentials)
client.set.default_project(project_id)

# Embedding model handle used to vectorize the combined row text
embeddings = Embeddings(
    model_id=client.foundation_models.EmbeddingModels.MULTILINGUAL_E5_LARGE,
    credentials=credentials,
    project_id=project_id,
)

Generating the embedding vector for each row and storing it in an `EMBEDDING` column; a few sample rows with their embedding vectors are shown afterwards

In [None]:
# Embed every combined description in one call.
# NOTE: this cell removes COMBINED at the end, so re-running it requires
# re-running the cell that builds COMBINED first.
row_combined = df_shoes['COMBINED'].to_list()
shoe_vectors = embeddings.embed_documents(texts=row_combined)

# Store each vector serialized as a "[v1, v2, ...]" string
df_shoes['EMBEDDING'] = [
    '[' + ', '.join(str(value) for value in vector) + ']'
    for vector in shoe_vectors
]

# The intermediate text column is no longer needed
df_shoes.drop(columns=['COMBINED'], inplace=True)

In [None]:
# Show the source attributes next to the serialized embedding
cols_to_show = embedding_cols + ['EMBEDDING']
df_shoes[cols_to_show].head()

In [None]:
# Disabled: uncomment to display the full serialized embedding string of the first row.
# df_shoes.iloc[0]['EMBEDDING']

In [None]:
# List the final column names (COMBINED was dropped, EMBEDDING was added)
df_shoes.columns

Save the shoes dataframe into a .csv file

In [None]:
# Write the final dataset without the index; QUOTE_NONNUMERIC quotes every non-numeric field
output_path = 'shoes-vectors.csv'
df_shoes.to_csv(output_path, index=False, quoting=csv.QUOTE_NONNUMERIC)