In [1]:
import pandas as pd

# Location of the first Parquet chunk of the user ratings dataset on GCS.
user_ratings_chunk_uri = (
    "gs://recomviz_home_and_kitchen/datasets/converted_user_ratings/chunk_00000.parquet"
)

# Load that one chunk with the pyarrow engine; the {"token": "cloud"} storage
# option is what the notebook relies on for GCS access inside Vertex AI.
df_user_ratings = pd.read_parquet(
    user_ratings_chunk_uri,
    engine="pyarrow",
    storage_options={"token": "cloud"},
)

# Show the first rows as the cell's rich output.
df_user_ratings.head()


Unnamed: 0,rating,product_id,user_id
0,1.0,B09XWYG6X1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ
1,5.0,B0BXDLF8TW,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ
2,2.0,B09G2PW8ZG,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ
3,5.0,B08CSZDXZY,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ
4,5.0,B0C6V27S6N,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ


In [2]:
# Location of the first Parquet chunk of the item metadata dataset on GCS.
item_metadata_chunk_uri = (
    "gs://recomviz_home_and_kitchen/datasets/converted_item_metadata/chunk_00000.parquet"
)

# Load that one chunk with the same engine / storage options used for the
# user ratings chunk.
df_item_metadata = pd.read_parquet(
    item_metadata_chunk_uri,
    engine="pyarrow",
    storage_options={"token": "cloud"},
)

# Show the first rows as the cell's rich output.
df_item_metadata.head()


Unnamed: 0,title,product_id,description,images
0,Set of 4 Irish Coffee Glass Mugs Footed 10.5 o...,B07R3DYMH6,[Set of 12 Footed 10.5 oz. Irish coffee mug th...,{'hi_res': ['https://m.media-amazon.com/images...
1,Foaming Soap Dispenser Thick Ceramic Foam Hand...,B0BNZ8Q7YT,[],{'hi_res': ['https://m.media-amazon.com/images...
2,Tapestry Trading 558W90 90 in. European Lace T...,B01508WQC6,[Features. European Lace Tablecloth. 100 Polye...,"{'hi_res': [None], 'large': ['https://m.media-..."
3,jersey seating 2 x Vinyl Air Lift Adjustable S...,B00KKU8HTG,[],"{'hi_res': [None, 'https://m.media-amazon.com/..."
4,Chisander 20 Inches Grey with White Super Soft...,B0B61RJ848,[],{'hi_res': ['https://m.media-amazon.com/images...


In [7]:
import pandas as pd

#######################################
# Create sample from user ratings data
#######################################

# Tunable knobs for this cell, gathered in one place so they are easy to adjust.
N_RATINGS_CHUNKS = 10          # how many user-ratings chunks to load (chunk_00000, chunk_00001, ...)
N_ITEM_CHUNKS = 10             # how many item-metadata chunks to load
RATINGS_SAMPLE_SIZE = 100_000  # target number of rating rows in the sample
ITEM_SAMPLE_CAP = 20_000       # upper bound on item rows kept in the item sample
SAMPLE_SEED = 42               # random_state for both samples, so re-runs are reproducible

# List of chunks to load for user ratings
chunk_paths = [
    f"gs://recomviz_home_and_kitchen/datasets/converted_user_ratings/chunk_{i:05}.parquet"
    for i in range(N_RATINGS_CHUNKS)
]

# Load and concatenate user ratings chunks
dfs = [
    pd.read_parquet(path, engine="pyarrow", storage_options={"token": "cloud"})
    for path in chunk_paths
]

full_df_ratings = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(full_df_ratings):,} rows of user ratings")

# Draw the ratings sample. DataFrame.sample raises ValueError when n exceeds
# the number of rows (sampling without replacement), so clamp the request to
# what was actually loaded; with enough rows this is exactly n=RATINGS_SAMPLE_SIZE.
ratings_sample_size = min(RATINGS_SAMPLE_SIZE, len(full_df_ratings))
sample_ratings_df = full_df_ratings.sample(n=ratings_sample_size, random_state=SAMPLE_SEED)
sample_ratings_df.to_parquet("large_sample_user_ratings.parquet", index=False)

# Extract unique product_ids from the user ratings sample
unique_product_ids = sample_ratings_df['product_id'].unique()
print(f"Number of unique product_ids in sample: {len(unique_product_ids):,}")

###########################################################
# Create a consistent sample from item metadata data
###########################################################

# List of chunks to load for item metadata
chunk_paths = [
    f"gs://recomviz_home_and_kitchen/datasets/converted_item_metadata/chunk_{i:05}.parquet"
    for i in range(N_ITEM_CHUNKS)
]

# Load and concatenate item metadata chunks
dfs = [
    pd.read_parquet(path, engine="pyarrow", storage_options={"token": "cloud"})
    for path in chunk_paths
]

full_df_item = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(full_df_item):,} rows of item metadata")

# Filter item metadata so that only items found in the user ratings sample are kept
filtered_item_df = full_df_item[full_df_item['product_id'].isin(unique_product_ids)]
print(f"Filtered item metadata contains {len(filtered_item_df):,} rows")

# NOTE(review): only the first N_ITEM_CHUNKS metadata chunks are searched and the
# result is capped at ITEM_SAMPLE_CAP rows, so some product_ids in the saved
# ratings sample may have no row in the saved item sample -- confirm that
# downstream code tolerates ratings without metadata.
if len(filtered_item_df) > ITEM_SAMPLE_CAP:
    sample_item_df = filtered_item_df.sample(n=ITEM_SAMPLE_CAP, random_state=SAMPLE_SEED)
else:
    sample_item_df = filtered_item_df

sample_item_df.to_parquet("large_sample_item_info.parquet", index=False)


Loaded 1,000,000 rows


In [None]:
import pandas as pd

#######################################
# Use Existing Sample User Ratings Data
#######################################

# Start from the ratings sample that is already saved as a local Parquet file.
sample_ratings_df = pd.read_parquet("sample_user_ratings.parquet")
print(f"Loaded sample user ratings: {len(sample_ratings_df):,} rows")

# Distinct products referenced by the sampled ratings.
unique_product_ids = sample_ratings_df['product_id'].unique()
print(f"Unique product_ids in sample: {len(unique_product_ids):,}")

######################################################
# Create a Consistent Sample from Item Metadata Data
######################################################

# Reader settings shared by every item-metadata chunk.
parquet_read_options = {"engine": "pyarrow", "storage_options": {"token": "cloud"}}

# GCS locations of the first ten item-metadata chunks (widen the range to load more).
chunk_paths = [
    f"gs://recomviz_home_and_kitchen/datasets/converted_item_metadata/chunk_{i:05}.parquet"
    for i in range(10)
]

# Read each chunk, then stack them into a single frame with a fresh 0..n-1 index.
dfs = [pd.read_parquet(chunk_uri, **parquet_read_options) for chunk_uri in chunk_paths]
full_df_item = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(full_df_item):,} rows of item metadata")

# Keep only metadata rows whose product_id occurs in the ratings sample.
in_ratings_sample = full_df_item['product_id'].isin(unique_product_ids)
filtered_item_df = full_df_item.loc[in_ratings_sample]
print(f"Filtered item metadata contains {len(filtered_item_df):,} rows")

# Cap the item sample at 20,000 rows (seeded draw); smaller results pass through as-is.
sample_item_df = (
    filtered_item_df.sample(n=20_000, random_state=42)
    if len(filtered_item_df) > 20_000
    else filtered_item_df
)

# Persist the item sample without writing the DataFrame index.
sample_item_df.to_parquet("updated_sample_item_info.parquet", index=False)
print("Saved filtered item info sample as 'updated_sample_item_info.parquet'")
