In [None]:
from PIL import Image
import pandas as pd
import re

import requests
from io import BytesIO
import numpy as np

df = pd.read_csv("data/item_metadata.csv")


images = df["images"][0]
match = re.search(r"https.*?\.jpg", images)

image = match.group(0)

response = requests.get(image)
img = Image.open(BytesIO(response.content))

img_np = np.array(img)
red_total = np.sum(img_np[:, :, 0])
green_total = np.sum(img_np[:, :, 1])
blue_total = np.sum(img_np[:, :, 2])

rgb_totals = np.array([red_total, green_total, blue_total])
print(rgb_totals)

In [19]:
from PIL import Image
import pandas as pd
import re
import requests
from io import BytesIO
import numpy as np

def extract_first_image_rgb(image_field):
    try:
        # Extract the first .jpg URL using regex
        match = re.search(r"https.*?\.jpg", image_field)
        if not match:
            return np.array([0, 0, 0])  # Return zeros if no match found

        url = match.group(0)
        response = requests.get(url, timeout=5)
        img = Image.open(BytesIO(response.content)).convert("RGB")

        img_np = np.array(img)
        red_total = np.sum(img_np[:, :, 0])
        green_total = np.sum(img_np[:, :, 1])
        blue_total = np.sum(img_np[:, :, 2])

        return np.array([red_total, green_total, blue_total])

    except Exception as e:
        # In case of error (e.g. bad URL), return zeros
        return np.array([0, 0, 0])

# Load your data
df = pd.read_csv("data/item_metadata.csv")

# Create a new column with RGB totals
df["image_rgb"] = df["images"].apply(extract_first_image_rgb)

print(df[["images", "image_rgb"]].head())
df.to_csv("item_metadata_with_rgb_array.csv")


                                              images  \
0  {'hi_res': array([None,\n       'https://m.med...   
1  {'hi_res': array(['https://m.media-amazon.com/...   
2  {'hi_res': array([None, None], dtype=object), ...   
3  {'hi_res': array(['https://m.media-amazon.com/...   
4  {'hi_res': array([None], dtype=object), 'large...   

                           image_rgb  
0  [530016400, 518969601, 485087894]  
1  [134428382, 107504677, 102061010]  
2     [12304681, 12211711, 12372245]  
3  [533596856, 517239700, 499358849]  
4     [63019212, 62955333, 62874180]  


In [6]:
import pandas as pd

df = pd.read_csv("data/item_metadata_with_rgb_array.csv")
df2 = pd.read_csv("data/item_metadata_filtered.csv")

print(df.columns)
print(df2.columns)
# Make sure the column exists
if "parent_asin" in df.columns and "parent_asin" in df2.columns:
    user_ids_df1 = set(df["parent_asin"].dropna().unique())
    user_ids_df2 = set(df2["parent_asin"].dropna().unique())

    # In df but not in df2
    only_in_df1 = user_ids_df1 - user_ids_df2
    # In df2 but not in df
    only_in_df2 = user_ids_df2 - user_ids_df1
    # In both
    in_both = user_ids_df1 & user_ids_df2

    print(f"User IDs only in df: {len(only_in_df1)}")
    print(f"User IDs only in df2: {len(only_in_df2)}")
    print(f"User IDs in both: {len(in_both)}")

    # Optional: see examples
    print("\nExamples only in df:", list(only_in_df1)[:5])
    print("Examples only in df2:", list(only_in_df2)[:5])
else:
    print("One or both DataFrames do not contain a 'parent_asin' column.")


Index(['Unnamed: 0', 'parent_asin', 'main_category', 'title', 'average_rating',
       'rating_number', 'price', 'store', 'features', 'description', 'images',
       'categories', 'image_count', 'has_images', 'image_urls', 'category',
       'image_rgb'],
      dtype='object')
Index(['parent_asin', 'main_category', 'title', 'average_rating',
       'rating_number', 'price', 'store', 'features', 'description', 'images',
       'categories', 'image_count', 'has_images', 'image_urls', 'category'],
      dtype='object')
User IDs only in df: 184301
User IDs only in df2: 0
User IDs in both: 77833

Examples only in df: ['B000B8K7V8', 'B06XK5T4YQ', 'B018DT51BC', 'B016356PF4', 'B01MS08Z2N']
Examples only in df2: []


In [7]:
# Filter df to only keep rows with parent_asin present in df2
filtered_df = df[df["parent_asin"].isin(df2["parent_asin"])]

# Save to CSV
filtered_df.to_csv("filtered_item_metadata.csv", index=False)

print("Filtered DataFrame saved as 'filtered_item_metadata.csv'")

Filtered DataFrame saved as 'filtered_item_metadata.csv'


In [9]:
df3 = pd.read_csv("filtered_item_metadata.csv")
user_ids_df1 = set(df3["parent_asin"].dropna().unique())
user_ids_df2 = set(df2["parent_asin"].dropna().unique())

# In df but not in df2
only_in_df1 = user_ids_df1 - user_ids_df2
# In df2 but not in df
only_in_df2 = user_ids_df2 - user_ids_df1
# In both
in_both = user_ids_df1 & user_ids_df2

print(f"User IDs only in df: {len(only_in_df1)}")
print(f"User IDs only in df2: {len(only_in_df2)}")
print(f"User IDs in both: {len(in_both)}")

df3.head()




User IDs only in df: 0
User IDs only in df2: 0
User IDs in both: 77833


Unnamed: 0.1,Unnamed: 0,parent_asin,main_category,title,average_rating,rating_number,price,store,features,description,images,categories,image_count,has_images,image_urls,category,image_rgb
0,17,B07WFSQXL5,All Beauty,PPY Eyelash Growth Serum – Natural Ingredients...,3.9,114.0,,PPY,[],[],{'hi_res': array(['https://m.media-amazon.com/...,[],27,True,['https://m.media-amazon.com/images/I/615N6fkc...,All_Beauty,[539961533 539595036 543002570]
1,21,B08BV6F6BC,All Beauty,Wixar Natural Sea Moss Soap - (2 PACK) - Laven...,4.4,41.0,,WIXAR NATURALS,[],[],{'hi_res': array(['https://m.media-amazon.com/...,[],21,True,['https://m.media-amazon.com/images/I/81ugegqe...,All_Beauty,[222213156 452693709 345014317]
2,28,B07Z818MLY,All Beauty,7 Packs Deep Wave Crochet Hair 22 Inch Deep wa...,3.4,10.0,,Yun Mei Hair,[],[],{'hi_res': array(['https://m.media-amazon.com/...,[],21,True,['https://m.media-amazon.com/images/I/71aVcpK8...,All_Beauty,[341718395 318613809 294284849]
3,30,B071DY8Z4B,All Beauty,BEWAVE Hair Brush Sponge Twist With Comb Hair ...,4.2,24.0,,BEWAVE,[],[],{'hi_res': array(['https://m.media-amazon.com/...,[],17,True,['https://m.media-amazon.com/images/I/61or2jYp...,All_Beauty,[173263892 164677464 165747569]
4,31,B0BTLTVR1X,All Beauty,"Zydeco Chop Chop Cajun Seasoning Base, 8 Ounce...",4.7,21.0,,BORELTH,"['All Natural blend of Dehydrated Onion, Dehyd...",['Zydeco Chop Chop is a blend of Dehydrated On...,{'hi_res': array(['https://m.media-amazon.com/...,[],3,True,['https://m.media-amazon.com/images/I/71707mY6...,All_Beauty,[273693544 266679573 244964869]
