In [None]:
import glob
import os
import pickle
import uuid

import numpy as np
import pandas as pd
import tensorflow
import tqdm
from numpy.linalg import norm
from sklearn.metrics.pairwise import pairwise_distances
from tensorflow.keras.applications import VGG19
from tensorflow.keras.applications.resnet import ResNet152, preprocess_input
from tensorflow.keras.layers import GlobalMaxPooling2D
from tensorflow.keras.preprocessing import image
from tensorflow.keras.utils import get_file


In [None]:
# get image path and image name (article_id) from folders
# `path` is a glob pattern, not a directory: it matches every entry directly
# inside each immediate sub-folder of `hm_data_path/images`.
path = 'hm_data_path/images/*/*'
def get_img_path_from_folder(folder_path):
    """Collect the ``.jpg`` files matched by a glob pattern.

    Args:
        folder_path: glob pattern handed to ``glob.glob``.

    Returns:
        A ``(file_paths, file_names)`` tuple of two parallel lists: the
        matched paths ending in ``.jpg`` and, for each of them, the base
        file name with its extension removed.
    """
    # Only names ending in lower-case ".jpg" are kept; everything else the
    # pattern matches is ignored.
    file_paths = [match for match in glob.glob(folder_path) if match.endswith(".jpg")]
    file_names = [os.path.splitext(os.path.basename(match))[0] for match in file_paths]
    return file_paths, file_names

# img_paths: every matched .jpg path; img_names: the same files' base names
# without the extension (parallel lists, same order).
img_paths, img_names = get_img_path_from_folder(path)

In [None]:
# extract image embeddings using pre-trained weights from ResNet-152 / VGG19
# Building a backbone loads its ImageNet weights, which is far too expensive
# to repeat for every image, so each feature extractor is built once and
# cached here, keyed by the `algo` name.
_EMBEDDING_MODELS = {}


def _get_embedding_model(algo):
    """Return the cached (frozen backbone + global max pooling) model for ``algo``."""
    if algo not in _EMBEDDING_MODELS:
        if algo == 'ResNet-152':
            backbone = ResNet152(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
        elif algo == 'VGG19':
            backbone = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
        else:
            raise ValueError(
                f"Unsupported algo {algo!r}; expected 'ResNet-152' or 'VGG19'"
            )
        backbone.trainable = False
        _EMBEDDING_MODELS[algo] = tensorflow.keras.Sequential([
            backbone,
            GlobalMaxPooling2D()
        ])
    return _EMBEDDING_MODELS[algo]


def extract_embedding(img_path, algo='ResNet-152'):
    """Compute an L2-normalised embedding for one image file.

    Args:
        img_path: path of the image to load; it is resized to 224x224.
        algo: which pre-trained backbone to use, ``'ResNet-152'`` (default)
            or ``'VGG19'``.

    Returns:
        1-D NumPy array: the globally max-pooled backbone output divided by
        its Euclidean norm.

    Raises:
        ValueError: if ``algo`` is not one of the supported names.
    """
    model = _get_embedding_model(algo)

    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    # The model expects a batch dimension, so wrap the single image.
    expanded_img_array = np.expand_dims(img_array, axis=0)
    # NOTE(review): `preprocess_input` is the ResNet one and is applied to
    # both backbones - confirm it matches the VGG19 preprocessing.
    preprocessed_img = preprocess_input(expanded_img_array)
    result = model.predict(preprocessed_img).flatten()
    normalized_result = result / norm(result)

    return normalized_result



In [None]:
def extract_similar(img_feat, metric='euclidean'):
    """Find, for every row, the closest *other* row in embedding space.

    Args:
        img_feat: DataFrame with an ``article_id`` column and an
            ``img_embedding`` column holding one 1-D embedding per row.
        metric: metric name forwarded to ``pairwise_distances``.

    Returns:
        ``(top_sim, top_dist)``: two lists aligned with the rows of
        ``img_feat``. ``top_sim[j]`` is the ``article_id`` of the nearest
        row other than ``j`` and ``top_dist[j]`` is the distance to it.
        An empty frame yields two empty lists; a single-row frame has no
        neighbour and yields ``[None]`` and ``[nan]``.
    """
    article_ids = img_feat['article_id'].to_numpy()
    n_items = len(article_ids)
    if n_items == 0:
        return [], []
    if n_items == 1:
        return [None], [np.nan]

    # Build the embedding matrix once instead of on every iteration.
    embeddings = np.asarray(list(img_feat['img_embedding']))

    top_sim = []
    top_dist = []
    # One query row at a time keeps memory at O(n) rather than materialising
    # the full n x n distance matrix.
    for j in tqdm.tqdm(range(n_items)):
        dists = pairwise_distances(embeddings, embeddings[j].reshape(1, -1), metric=metric).flatten()
        # Exclude the query itself explicitly: with duplicate embeddings the
        # query is not guaranteed to sort first among the zero distances.
        dists[j] = np.inf
        nearest = int(np.argmin(dists))
        top_sim.append(article_ids[nearest])
        top_dist.append(dists[nearest])

    return top_sim, top_dist

In [None]:
# Embed every local image with the ResNet-152 backbone.
# `extract_embedding` is the extractor for files on disk; `extract_features`
# (defined in a later cell) takes a URL plus a model object, so it is neither
# available here on a fresh kernel nor callable with an algorithm name.
embeddings = [extract_embedding(img_path, 'ResNet-152') for img_path in img_paths]

img_feat = pd.DataFrame(img_names, columns=['article_id'])
# Drop the first character of each file name to obtain the article_id.
# NOTE(review): presumably this strips a leading zero so the ids match another
# table - confirm against the article metadata.
img_feat['article_id'] = img_feat['article_id'].str[1:]
img_feat['img_embedding'] = embeddings

In [None]:
# For every row, look up the most similar image by Euclidean distance between
# embeddings and store the result next to the embedding.
top_sim, top_dist = extract_similar(img_feat, 'euclidean')
# article_id reported as most similar, one per row
img_feat['top_sim'] = top_sim
# distance to that article, one per row
img_feat['top_dist'] = top_dist
# NOTE(review): `img_embedding` holds NumPy arrays, which CSV can only store
# as text (long arrays may be abbreviated in that form) - confirm downstream
# readers only need top_sim / top_dist, or persist embeddings separately.
img_feat.to_csv('path/file_name.csv')

# Alibaba iFashion Dataset

In [None]:
def _build_feature_extractor():
    """Build the feature extractor: frozen ResNet-152 followed by global max pooling.

    The backbone uses ImageNet weights, has no classification head and takes
    224x224 RGB input.
    """
    backbone = ResNet152(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    backbone.trainable = False
    return tensorflow.keras.Sequential([backbone, GlobalMaxPooling2D()])


# Module-level model shared by the cells below.
model = _build_feature_extractor()

def extract_features(img_path, model, fname=None):
    """Download one image and return its L2-normalised embedding.

    Args:
        img_path: URL of the image to download.
        model: object whose ``predict`` method maps a preprocessed
            ``(1, 224, 224, 3)`` batch to the feature vector.
        fname: optional name for the temporary download; a unique name is
            generated when omitted, so no module-level state is needed.

    Returns:
        1-D NumPy array (feature vector divided by its Euclidean norm), or
        ``np.nan`` when the download, decoding or prediction fails.
    """
    if fname is None:
        fname = f'img_{uuid.uuid4().hex}'

    local_path = None
    try:
        # Images are not archives, so no extraction is requested.
        local_path = get_file(fname, origin=img_path)
        img = image.load_img(local_path, target_size=(224, 224))
        img_array = image.img_to_array(img)
        expanded_img_array = np.expand_dims(img_array, axis=0)
        preprocessed_img = preprocess_input(expanded_img_array)
        result = model.predict(preprocessed_img).flatten()
        normalized_result = result / norm(result)
    except Exception:
        # Best effort: a broken link or unreadable image must not abort a long
        # batch run. `Exception` (not a bare except) lets Ctrl-C through.
        normalized_result = np.nan
    finally:
        # Always drop the downloaded file, including when decoding failed.
        if local_path is not None and os.path.isfile(local_path):
            os.remove(local_path)

    return normalized_result

In [None]:
# Load the item table (no header row in the file; malformed lines are skipped).
ali_item = pd.read_csv('path/alibaba_data/item_data.txt',header=None,delimiter=',',on_bad_lines='skip',
                      names=['item_id', 'cateID', 'imgLink', 'title'])
# fix the wrong url links
# Protocol-relative links ("//host/...") get an explicit scheme. `na=False`
# keeps the mask strictly boolean when imgLink is missing; otherwise the NaN
# entries make the `.loc` mask invalid and the assignment raises.
wrongurl = ali_item.imgLink.str.startswith('//', na=False)
ali_item.loc[wrongurl, 'imgLink'] = 'http:' + ali_item.loc[wrongurl, 'imgLink']

In [None]:
# Embed the first 430000 items, in row order. Items whose image cannot be
# processed are recorded as NaN by extract_features, so feature_list stays
# aligned with the rows of `ali`.
ali = ali_item.iloc[:430000, :].reset_index(drop=True)
feature_list = []
# Column 2 holds the image link.
for i, img_url in enumerate(tqdm.tqdm(ali.iloc[:, 2])):
    # `fn` is deliberately a module-level variable: the download helper may
    # look it up as the temporary file name, so set it before each call.
    fn = f'img{i}'
    feature_list.append(extract_features(img_url, model))

In [None]:
# Persist the embeddings; the context manager guarantees the file is flushed
# and closed even if pickling fails part-way.
with open('path/ali_embeddings.pkl', 'wb') as pkl_file:
    pickle.dump(feature_list, pkl_file)