In [None]:
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
import numpy as np
from numpy.linalg import norm
from sklearn.decomposition import PCA
from PIL import Image as PILImage


# Step 1: Load Pre-trained CNN
# Full ImageNet-trained ResNet50, including its classification head.
base_model = ResNet50(weights='imagenet')
# Cut the network at 'avg_pool' (the layer feeding the final dense layer in ResNet50),
# so the truncated model emits a feature vector instead of class scores.
_avg_pool_output = base_model.get_layer('avg_pool').output
model = Model(inputs=base_model.input, outputs=_avg_pool_output)

# Step 2: Preprocess the Image
def preprocess_image_pillow(img_path):
    """Load an image file and turn it into a single-image batch for ResNet50.

    Parameters
    ----------
    img_path : str or path-like
        Location of the image file; anything ``PIL.Image.open`` accepts.

    Returns
    -------
    numpy.ndarray
        A float32 array built from a (1, 224, 224, 3) RGB batch and passed
        through ``preprocess_input``.
    """
    # The context manager releases the underlying file handle once the pixel
    # data has been read (the original left the file open).
    with PILImage.open(img_path) as source_img:
        # Grayscale, palette, CMYK, LA, ... images do not decode to an
        # (H, W, 3|4) array, so indexing shape[2] on them used to fail or
        # produce the wrong channel layout. Bring them to RGB up front.
        if source_img.mode not in ('RGB', 'RGBA'):
            source_img = source_img.convert('RGB')

        resized = source_img.resize((224, 224))  # Resize image to 224x224

        # For RGBA this drops the alpha channel (same as slicing [:, :, :3]);
        # for RGB it is a plain copy.
        rgb_img = resized.convert('RGB')

        # Convert to float32; no rescaling happens here, the model-specific
        # normalisation is done by preprocess_input below.
        img_array = np.array(rgb_img, dtype=np.float32)

    # Add the leading batch axis the model expects, then apply the
    # preprocessing that matches the pre-trained model (ResNet50 here).
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)  # Ensure you use the correct preprocess_input function for your model

    return img_array

# Step 3: Extract Features
def extract_features(img_path, model):
    """Run one image through ``model`` and return its output as a 1-D vector.

    The image at ``img_path`` is prepared with ``preprocess_image_pillow``,
    fed to ``model.predict``, and the prediction is flattened.
    """
    batch = preprocess_image_pillow(img_path)
    return model.predict(batch).flatten()

# Step 4: Normalize Features
def normalize_features(features):
    """Scale a feature vector to unit L2 norm.

    Parameters
    ----------
    features : array-like
        The vector to normalise. Lists and tuples are accepted and converted
        to a NumPy array.

    Returns
    -------
    numpy.ndarray
        ``features`` divided by its L2 norm. An all-zero input is returned as
        zeros (in floating point) rather than NaNs.
    """
    features = np.asarray(features)
    magnitude = np.linalg.norm(features)
    if magnitude == 0:
        # A zero vector has no direction; dividing by 1 keeps the result at
        # zero (with the usual float dtype) instead of producing 0/0 = NaN,
        # which would also serialise to invalid JSON downstream.
        magnitude = 1.0
    return features / magnitude

# Step 5: Reduce Dimensionality
def reduce_dimensionality(features, n_components=300):
    """Project a set of feature vectors onto their leading principal components.

    Parameters
    ----------
    features : array-like
        Collection of feature vectors; converted with ``np.array`` and
        expected by PCA to be 2-D (one row per sample).
    n_components : int, default 300
        Requested number of components. An integer larger than the number of
        samples or features is reduced to the largest value PCA can fit, so
        small datasets no longer fail on the default. Non-integer values are
        passed to ``PCA`` unchanged.

    Returns
    -------
    numpy.ndarray
        The PCA-transformed features, one row per input row.
    """
    features = np.array(features)

    # PCA rejects an integer n_components above min(n_samples, n_features);
    # clamp it so the default of 300 works on small product pools too.
    if isinstance(n_components, (int, np.integer)) and features.ndim == 2:
        n_components = int(min(n_components, *features.shape))

    pca = PCA(n_components=n_components)
    pca.fit(features)
    reduced_features = pca.transform(features)
    return reduced_features



# Function to calculate cosine similarity
def cosine_similarity(vec_a, vec_b):
    """Return the cosine of the angle between two vectors.

    Parameters
    ----------
    vec_a, vec_b : array-like
        1-D vectors of equal length.

    Returns
    -------
    float
        ``dot(vec_a, vec_b) / (|vec_a| * |vec_b|)``. If either vector has
        zero length the similarity is undefined; ``0.0`` is returned instead
        of NaN.
    """
    denominator = norm(vec_a) * norm(vec_b)
    if denominator == 0:
        # At least one vector is all zeros: avoid 0/0 and report "no similarity".
        return 0.0
    return np.dot(vec_a, vec_b) / denominator






In [2]:
import json
import os
from urllib.request import urlretrieve

# Load JSON data. JSON is UTF-8 by specification, so read it as such instead
# of relying on the platform's default text encoding.
with open('product_pool.json', 'r', encoding='utf-8') as file:
    products = json.load(file)

# Create a directory for downloaded images if it doesn't exist
os.makedirs('downloaded_images', exist_ok=True)

# Iterate over products and process each image
for product in products:
    img_url = product['img_url']

    # Download the image from img_url; the local file name is the last
    # '/'-separated segment of the URL.
    filename = os.path.join('downloaded_images', img_url.split('/')[-1])
    urlretrieve(img_url, filename)

    # CNN features for this image, scaled to unit L2 norm
    features = extract_features(filename, model)
    normalized_features = normalize_features(features)

    # Add the image vector directly to the product entry
    product['img_vector'] = normalized_features.tolist()  # Convert to list for JSON serialization



# `product` is still bound to the last entry of the loop, so this shows the
# vector of the final product only.
print(product['img_vector'])

[0.00314744608476758, 0.1705256551504135, 0.0028325780294835567, 0.005225354805588722, 0.0, 0.0005623740726150572, 0.0224715955555439, 0.024812158197164536, 0.0009043111931532621, 0.007053734268993139, 0.0, 0.0011445643613114953, 0.0, 0.0030989653896540403, 0.006764921825379133, 0.005059754475951195, 0.0, 0.0020971233025193214, 0.0017232147511094809, 0.0, 2.6297348085790873e-05, 0.033984363079071045, 0.004052627366036177, 0.0008597808191552758, 0.021524565294384956, 0.003606569254770875, 0.0, 0.009103907272219658, 0.0007194935460574925, 0.0, 0.0003335608635097742, 0.0666627511382103, 0.003559183096513152, 0.028118887916207314, 0.024043701589107513, 0.010524602606892586, 0.012767955660820007, 0.015029543079435825, 0.0034331975039094687, 0.005458774510771036, 0.005047268234193325, 0.007959816604852676, 0.0, 0.005059977062046528, 0.03526782989501953, 0.0004503423406276852, 0.005506130401045084, 0.045415982604026794, 0.019665397703647614, 0.0016216229414567351, 0.0015268935821950436, 0.006

In [4]:
# Persist the product list, now carrying an 'img_vector' per entry, as
# indented JSON.
with open('updated_product_pool.json', 'w') as out_fh:
    json.dump(products, out_fh, indent=4)

In [5]:
# Sanity check on the vector size. `product` is the variable left over from
# the `for product in products` loop, i.e. the last product processed.
len(product["img_vector"])

2048