In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import cv2
from PIL import Image
from io import BytesIO
from sklearn.metrics.pairwise import cosine_similarity
import requests
from io import BytesIO
from PIL import Image
import time
from tensorflow import keras
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Flatten, InputLayer, BatchNormalization, Dropout
from tensorflow.keras.models import Sequential
#pd.set_option('display.max_colwidth', None)



In [2]:
def retrieve_and_resize_images(df1, target_size=(256, 256)):
    # Assuming 'image_link' and 'id' are the column names in your dataframes
    links1 = df1['image_link'].tolist()
    ids1 = df1['id'].tolist()
    site1 = df1['link'].tolist()

    all_images = []

    # Function to download and resize an image given a URL
    def download_and_resize_image(url, retry_count=2):
        for _ in range(retry_count):
            try:
                response = requests.get(url, timeout=30)
                original_image = Image.open(BytesIO(response.content))
                resized_image = original_image.resize(target_size)
                return resized_image
            except requests.exceptions.RequestException as e:
                print(f"Error downloading image: {e}")
                time.sleep(2) 
        print(f"Failed to download image from {url}")
        return None

    # Retrieve and resize images from the dataframe
    for site, link, image_id in zip(site1,links1, ids1):
        resized_image = download_and_resize_image(link)
        all_images.append({'id': image_id,'link': site ,'resized_image': resized_image})

    return all_images

In [3]:
file_path = '/kaggle/input/new-data/dfn_amazon_id.csv'
amazon_df = pd.read_csv(file_path)
amazon_df = amazon_df.dropna(subset=['name','image_link'])

file_path = '/kaggle/input/new-data/new_aliexpress_hk_products.csv'
ali_df = pd.read_csv(file_path)
ali_df = ali_df.dropna(subset=['name','image_link'])

In [4]:
file_path = '/kaggle/input/matchi/amazon_hk.csv'
amazon_df_2 = pd.read_csv(file_path)
amazon_df_2 = amazon_df_2.dropna(subset=['name','image_link'])

file_path = '/kaggle/input/matchi/aliexpress_hk.csv'
ali_df_2 = pd.read_csv(file_path)
ali_df_2 = ali_df_2.dropna(subset=['name','image_link'])

In [5]:
ali_df_2['id'] = ['aliexpress_hk_' + str(i) for i in range(715, 715 + len(ali_df_2))]
ali_df = pd.concat([ali_df, ali_df_2], ignore_index=True)

In [6]:
amazon_df_2['id'] = ['amazon_hk_' + str(i) for i in range(881, 881 + len(amazon_df_2))]
amazon_df = pd.concat([amazon_df, amazon_df_2], ignore_index=True)

In [7]:
amazon_df.to_csv('amazon_HK_new.csv', encoding='utf-8', index=False)
ali_df.to_csv('aliexpress_HK_new.csv', encoding='utf-8', index=False)

# Code

In [8]:
result_images_amazon = retrieve_and_resize_images(amazon_df)
result_images_ali = retrieve_and_resize_images(ali_df)

In [9]:
# Load the pre-trained EfficientNetV2 model
base_model = keras.applications.EfficientNetV2B0(input_shape=(256, 256, 3), include_top=False, weights='imagenet')
base_model.trainable = False
model = Sequential([
    InputLayer(input_shape=(256, 256, 3)),
    base_model,
    BatchNormalization(),
    Dropout(0.4),
    Flatten(),
    Dense(512, activation="relu"),
    BatchNormalization(),
    Dropout(0.4),
    Dense(256, activation="relu"),
    BatchNormalization(),
    Dropout(0.4),
    Dense(200, activation="linear")
])

# Function to preprocess and predict the features of an image using the model
def get_image_features(image):
    try:
        image = image.convert('RGB')
        image_array = keras.preprocessing.image.img_to_array(image)
        image_array = keras.applications.efficientnet_v2.preprocess_input(image_array)
        image_array = np.expand_dims(image_array, axis=0)
        features = model.predict(image_array)
        features = features.flatten()
        return features
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

# Extract features for each image in result_images_amazon and result_images_ali
amazon_features = []
for img in result_images_amazon:
    features = get_image_features(img['resized_image'])
    if features is not None:
        amazon_features.append(features)

ali_features = []
for img in result_images_ali:
    features = get_image_features(img['resized_image'])
    if features is not None:
        ali_features.append(features)

if not amazon_features or not ali_features:
    print("No valid features to calculate similarity.")
else:
    # Calculate cosine similarity between every pair of images
    similarity_matrix = cosine_similarity(amazon_features, ali_features)

    # Create a DataFrame to store the results for all pairs
    result_df = pd.DataFrame(columns=['Amazon_ID','Amazon_link','Ali_ID','Ali_link','Similarity_Score'])
    
    all_pairs_data = []

    # Add all pairs and their similarity scores to the DataFrame
    for amazon_idx, amazon_id in enumerate(result_images_amazon):
        for ali_idx, ali_id in enumerate(result_images_ali):
            similarity_score = similarity_matrix[amazon_idx, ali_idx]
            all_pairs_data.append({'Amazon_ID': amazon_id['id'],'Amazon_link': amazon_id['link'],'Ali_ID': ali_id['id'],'Ali_link': ali_id['link'], 'Similarity_Score': similarity_score})
    
    result_df = pd.concat([result_df, pd.DataFrame(all_pairs_data)], ignore_index=True)            

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/efficientnet_v2/efficientnetv2-b0_notop.h5






  result_df = pd.concat([result_df, pd.DataFrame(all_pairs_data)], ignore_index=True)


In [10]:
result_df = result_df.sort_values(by='Similarity_Score', ascending=False)

In [11]:
result_df.head(15)

Unnamed: 0,Amazon_ID,Amazon_link,Ali_ID,Ali_link,Similarity_Score
140285,amazon_hk_24,https://www.amazon.com/-/ar/dp/B0BNTMRRSL/ref=...,aliexpress_hk_3341,https://ar.aliexpress.com/item/100500560670924...,0.981062
2327565,amazon_hk_409,https://www.amazon.com/-/ar/dp/B08NFFR587/ref=...,aliexpress_hk_5223,https://ar.aliexpress.com/item/100500529761292...,0.96448
434144,amazon_hk_76,https://www.amazon.com/-/ar/dp/B0BLV8SW11/ref=...,aliexpress_hk_488,https://www.aliexpress.us/item/100500551447331...,0.909154
1124490,amazon_hk_199,https://www.amazon.com/-/ar/dp/B00WHUILD4/ref=...,aliexpress_hk_408,https://www.aliexpress.us/item/100500330973976...,0.890938
1124311,amazon_hk_199,https://www.amazon.com/-/ar/dp/B00WHUILD4/ref=...,aliexpress_hk_229,https://www.aliexpress.us/item/100500626915059...,0.886975
1124616,amazon_hk_199,https://www.amazon.com/-/ar/dp/B00WHUILD4/ref=...,aliexpress_hk_534,https://www.aliexpress.us/item/100500472741729...,0.886291
1198779,amazon_hk_212,https://www.amazon.com/-/ar/dp/B0C1GTV1ZB/ref=...,aliexpress_hk_519,https://www.aliexpress.us/item/100500621601564...,0.81949
28763,amazon_hk_5,https://www.amazon.com/-/ar/dp/B06XKVR8BF/ref=...,aliexpress_hk_233,https://www.aliexpress.us/item/100500531423396...,0.807642
1483345,amazon_hk_261,https://www.amazon.com/-/ar/dp/B0B4X3H4MZ/ref=...,aliexpress_hk_5491,https://ar.aliexpress.com/item/100500378324608...,0.801669
2803418,amazon_hk_493,https://www.amazon.com/-/ar/dp/B0BNMVMCK1/ref=...,aliexpress_hk_1772,https://ar.aliexpress.com/item/100500498557488...,0.799924


In [12]:
result_df.to_csv('matched_products_dl.csv', index=False, encoding='utf-8')

# --------------------------------------------------------------------------------------

In [13]:
import os

def retrieve_resize_and_save_images(df1, target_size=(256, 256), save_path='resized_images'):
    import requests
    from io import BytesIO
    from PIL import Image
    import time
    
    # Create a directory to save the resized images
    os.makedirs(save_path, exist_ok=True)

    # Assuming 'image_link' and 'id' are the column names in your dataframes
    links1 = df1['image_link'].tolist()
    ids1 = df1['id'].tolist()

    all_images = []

    # Function to download, resize, and save an image given a URL
    def download_resize_and_save_image(url, image_id, retry_count=2):
        for _ in range(retry_count):
            try:
                response = requests.get(url, timeout=30)
                original_image = Image.open(BytesIO(response.content))
                
                # Convert the image to RGB mode (remove alpha channel)
                rgb_image = original_image.convert('RGB')
                
                # Resize the image
                resized_image = rgb_image.resize(target_size)
                
                # Save the resized image
                save_filename = os.path.join(save_path, f"{image_id}.jpg")
                resized_image.save(save_filename)
                
                return save_filename
            except requests.exceptions.RequestException as e:
                print(f"Error downloading image: {e}")
                time.sleep(2) 
        print(f"Failed to download image from {url}")
        return None

    # Retrieve, resize, and save images from the dataframe
    for link, image_id in zip(links1, ids1):
        saved_image_path = download_resize_and_save_image(link, image_id)
        if saved_image_path:
            all_images.append({'id': image_id, 'saved_image_path': saved_image_path})

    return all_images

In [14]:
amazon_images = retrieve_resize_and_save_images(amazon_df,save_path='amazon_images')
ali_images = retrieve_resize_and_save_images(ali_df,save_path='ali_images')



In [15]:
pd.DataFrame(amazon_images).to_csv('amazon_img_path.csv')
pd.DataFrame(ali_images).to_csv('ali_img_path.csv')