In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# 1. Data Load

## rating data

In [4]:
df_ratings = pd.read_csv('AMAZON_FASHION.csv', names=['asin','reviewerID','rating','unixReviewTime'])
df_ratings

Unnamed: 0,asin,reviewerID,rating,unixReviewTime
0,7106116521,A1D4G1SNUZWQOT,5.0,1413763200
1,7106116521,A3DDWDH9PX2YX2,2.0,1411862400
2,7106116521,A2MWC41EW7XL15,4.0,1408924800
3,7106116521,A2UH2QQ275NV45,2.0,1408838400
4,7106116521,A89F3LQADZBS5,3.0,1406419200
...,...,...,...,...
883631,B01HJHTH5U,A1ZSB2Q144UTEY,5.0,1487635200
883632,B01HJHTH5U,A2CCDV0J5VB6F2,5.0,1480032000
883633,B01HJHTH5U,A3O90PACS7B61K,3.0,1478736000
883634,B01HJHF97K,A2HO94I89U3LNH,3.0,1478736000


In [5]:
print("상품 unique : ", df_ratings['asin'].nunique())
print("리뷰어 unique : ", df_ratings['reviewerID'].nunique())

상품 unique :  186189
리뷰어 unique :  749233


## metadata

In [7]:
df_meta = pd.read_json('meta_AMAZON_FASHION.json', lines=True)
df_meta = df_meta[['asin','title','imageURLHighRes']]
# imageURLHighRes가 있는 row들만 필터링
df_meta = df_meta[~df_meta['imageURLHighRes'].isna()]
df_meta

Unnamed: 0,asin,title,imageURLHighRes
0,0764443682,Slime Time Fall Fest [With CDROM and Collector...,[https://images-na.ssl-images-amazon.com/image...
1,1291691480,XCC Qi promise new spider snake preparing men'...,[https://images-na.ssl-images-amazon.com/image...
2,1940280001,Magical Things I Really Do Do Too!,[https://images-na.ssl-images-amazon.com/image...
3,1940735033,"Ashes to Ashes, Oranges to Oranges",[https://images-na.ssl-images-amazon.com/image...
4,1940967805,Aether & Empire #1 - 2016 First Printing Comic...,[https://images-na.ssl-images-amazon.com/image...
...,...,...,...
186632,B01HJGXL4O,JT Women's Elegant Off Shoulder Chiffon Maxi L...,[https://images-na.ssl-images-amazon.com/image...
186633,B01HJHF97K,Microcosm Retro Vintage Black Crochet Lace One...,[https://images-na.ssl-images-amazon.com/image...
186634,B01HJGJ9LS,Lookatool Classic Plain Vintage Army Military ...,[https://images-na.ssl-images-amazon.com/image...
186635,B01HJHTH5U,Edith Windsor Women's Deep V-neck Beaded Sequi...,[https://images-na.ssl-images-amazon.com/image...


## rating & meta data merge

### rating table 평점 5개 이상인 유저만 필터링

In [8]:
# 'reviewerID' 기준으로 5 이상인 값을 가지는 유저를 필터링
reviewer_counts = df_ratings['reviewerID'].value_counts()
reviewer_filter = reviewer_counts[reviewer_counts >= 5].index # 5 이상으로 필터링하기로 협의

# 필터링 조건을 만족하는 데이터 추출
filtered_df = df_ratings[df_ratings['reviewerID'].isin(reviewer_filter)]

In [9]:
print("상품 unique : ", filtered_df['asin'].nunique())
print("리뷰어 unique : ", filtered_df['reviewerID'].nunique())

상품 unique :  13197
리뷰어 unique :  3718


In [10]:
# rating이 5개 이상인 유저로 필터링된 데이터에서 이미지 url이 존재하는 아이템이 있는지 메타데이터에서 동일한 값 찾기
common_values = np.intersect1d(df_meta['asin'].unique(), filtered_df['asin'].unique())

# 동일한 값의 개수 확인
num_common_values = len(common_values)

# 결과 출력
print("두 배열에서 동일한 값:", common_values)
print("동일한 값의 개수:", num_common_values)

두 배열에서 동일한 값: ['7106116521' 'B00008JPRZ' 'B00012O2RY' ... 'B01HJEOBUO' 'B01HJG5NLI'
 'B01HJGJ9LS']
동일한 값의 개수: 10681


In [11]:
# 메타데이터에서 rating데이터와 공통으로 존재하는 아이템을 필터링
df_meta_filtered = df_meta[df_meta['asin'].isin(common_values)]

In [12]:
#merge inner
df_meta_rating = pd.merge(df_meta_filtered, filtered_df)
df_meta_rating

Unnamed: 0,asin,title,imageURLHighRes,reviewerID,rating,unixReviewTime
0,7106116521,Milliongadgets(TM) Earring Safety Backs For Fi...,[https://images-na.ssl-images-amazon.com/image...,AD0OENWU7N4L6,5.0,1437436800
1,B00008JPRZ,Paul Fredrick Men's Pinpoint Snap Tab Collar B...,[https://images-na.ssl-images-amazon.com/image...,A281NL3AO4QIGE,5.0,1508630400
2,B00012O2RY,Nike Presto Collection Women's Watch White Dia...,[https://images-na.ssl-images-amazon.com/image...,A3PSH91YKGP4IV,3.0,1288828800
3,B0001F331C,Baggallini Currency Organizer - Nylon (Black ),[https://images-na.ssl-images-amazon.com/image...,A2M1IZTVL7TMC0,5.0,1352937600
4,B00008JPRZ,Paul Fredrick Men's Pinpoint Snap Tab Collar B...,[https://images-na.ssl-images-amazon.com/image...,A281NL3AO4QIGE,5.0,1508630400
...,...,...,...,...,...,...
20776,B01HJ0SIF2,Ankle Strap High Heel - Trendy Block Heel Pump...,[https://images-na.ssl-images-amazon.com/image...,A1PP2CWYBMLYV2,5.0,1518480000
20777,B01HJEO9SS,ROMWE Women's Bohemian Short Sleeve V neck Lon...,[https://images-na.ssl-images-amazon.com/image...,A2E01AZPAB016,1.0,1474934400
20778,B01HJEOBUO,ROMWE Women's Bohemian Short Sleeve V neck Lon...,[https://images-na.ssl-images-amazon.com/image...,A21DXQFAP5TTHY,5.0,1471219200
20779,B01HJG5NLI,Microcosm Retro Vintage Black Crochet Lace One...,[https://images-na.ssl-images-amazon.com/image...,A2HOQJXDFZP9Y7,4.0,1470182400


In [13]:
# Nan값 확인
df_meta_rating.isna().sum()

asin               0
title              0
imageURLHighRes    0
reviewerID         0
rating             0
unixReviewTime     0
dtype: int64

In [14]:
df_meta_rating['reviewerID'].value_counts()

reviewerID
A3G5KDMFNRUXHB    37
A3JBQHQZEZPQK4    34
A1RRX286ZRI830    27
A32M3PMH6DSLKD    25
A1J7RPYGVGH5O3    25
                  ..
A394FIYOOFOMVG     1
A2XS12X6E3J5XW     1
A3E2AGCF3MH00A     1
A3B2PAWZRFXAHX     1
A1XIW8WIJC6N16     1
Name: count, Length: 3712, dtype: int64

In [15]:
# df_meta_rating merge 한 것에서 다시 reviewerID가 5 이상인 것들로 필터링
df_meta_rating_counts = df_meta_rating['reviewerID'].value_counts()
df_meta_rating_filter = df_meta_rating_counts[df_meta_rating_counts >=5].index
filtered_df_meta_rating = df_meta_rating[df_meta_rating['reviewerID'].isin(df_meta_rating_filter)]

## 데이터셋

In [16]:
print("상품 unique : ", filtered_df_meta_rating['asin'].nunique())
print("리뷰어 unique : ", filtered_df_meta_rating['reviewerID'].nunique())
print(f"sparsity :  {1-len(filtered_df_meta_rating) / (filtered_df_meta_rating['asin'].nunique()*filtered_df_meta_rating['reviewerID'].nunique()):.15f}")

상품 unique :  8587
리뷰어 unique :  2578
sparsity :  0.999241189728497


In [17]:
filtered_df_meta_rating

Unnamed: 0,asin,title,imageURLHighRes,reviewerID,rating,unixReviewTime
0,7106116521,Milliongadgets(TM) Earring Safety Backs For Fi...,[https://images-na.ssl-images-amazon.com/image...,AD0OENWU7N4L6,5.0,1437436800
1,B00008JPRZ,Paul Fredrick Men's Pinpoint Snap Tab Collar B...,[https://images-na.ssl-images-amazon.com/image...,A281NL3AO4QIGE,5.0,1508630400
2,B00012O2RY,Nike Presto Collection Women's Watch White Dia...,[https://images-na.ssl-images-amazon.com/image...,A3PSH91YKGP4IV,3.0,1288828800
3,B0001F331C,Baggallini Currency Organizer - Nylon (Black ),[https://images-na.ssl-images-amazon.com/image...,A2M1IZTVL7TMC0,5.0,1352937600
4,B00008JPRZ,Paul Fredrick Men's Pinpoint Snap Tab Collar B...,[https://images-na.ssl-images-amazon.com/image...,A281NL3AO4QIGE,5.0,1508630400
...,...,...,...,...,...,...
20776,B01HJ0SIF2,Ankle Strap High Heel - Trendy Block Heel Pump...,[https://images-na.ssl-images-amazon.com/image...,A1PP2CWYBMLYV2,5.0,1518480000
20777,B01HJEO9SS,ROMWE Women's Bohemian Short Sleeve V neck Lon...,[https://images-na.ssl-images-amazon.com/image...,A2E01AZPAB016,1.0,1474934400
20778,B01HJEOBUO,ROMWE Women's Bohemian Short Sleeve V neck Lon...,[https://images-na.ssl-images-amazon.com/image...,A21DXQFAP5TTHY,5.0,1471219200
20779,B01HJG5NLI,Microcosm Retro Vintage Black Crochet Lace One...,[https://images-na.ssl-images-amazon.com/image...,A2HOQJXDFZP9Y7,4.0,1470182400


In [67]:
df_image= filtered_df_meta_rating[['asin','imageURLHighRes']].drop_duplicates(subset='asin')

### 아마존 이미지 저장

In [77]:
import requests
import os
import pandas as pd


# 이미지를 저장할 폴더 지정
folder_name = 'images'

# 폴더가 없다면 생성
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# 각 행에 대해 반복
for index, row in df_image.iterrows():
    # imageURLHighRes 열과 asin 열의 값을 가져옴
    url = row['imageURLHighRes'][0]
    asin = row['asin']

    # 파일 확장자를 URL에서 추출 (예: .jpg, .png)
    extension = os.path.splitext(url)[1]

    # 파일명을 'asin' 값과 확장자로 설정
    file_name = f"{asin}{extension}"

    # 이미지를 저장할 전체 경로 생성
    file_path = os.path.join(folder_name, file_name)

    # 이미지 다운로드 및 저장
    response = requests.get(url)
    if response.status_code == 200:
        with open(file_path, 'wb') as f:
            f.write(response.content)

### 네이버 이미지 저장

In [79]:
#네이버 데이터 불러오기
df_gift = pd.read_csv('gift_data.csv')
df_gift['category'] = df_gift.apply(lambda x: x['category_2'] if pd.isna(x['category_3']) else x['category_3'], axis=1)

  df_gift = pd.read_csv('gift_data.csv')


In [84]:
df_gift_fashion=df_gift[df_gift['category'].isin(['언더웨어/홈웨어', '신생아의류', '패션의류/잡화'])]

In [96]:
import requests
import os
import pandas as pd

# 이미지를 저장할 폴더 지정
folder_name = 'images_naver_fashion'

# 폴더가 없다면 생성
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# 각 행에 대해 반복
for index, row in df_gift_fashion.iterrows():
    # imageURLHighRes 열과 asin 열의 값을 가져옴
    url = row['image_url']
    asin = row['product_id']

    # 파일 확장자를 URL에서 추출 (예: .jpg, .png)
    #extension = os.path.splitext(url)[1]
    
    file_name = url.split('/')[-1].split('?')[0]

    # 파일명을 'asin' 값과 확장자로 설정
    file_name = f"{asin}{file_name}"

    # 이미지를 저장할 전체 경로 생성
    file_path = os.path.join(folder_name, file_name)

    # 이미지 다운로드 및 저장
    response = requests.get(url)
    if response.status_code == 200:
        with open(file_path, 'wb') as f:
            f.write(response.content)

### FAISS를 통한 가장 유사한 상품 찾기

In [107]:
import os
import faiss
import numpy as np
import torch
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image


np.random.seed(1)

# 이미지를 텐서로 변환하는 함수
def image_to_tensor(image_path):
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image = Image.open(image_path).convert('RGB')
    image_tensor = transform(image).unsqueeze(0)
    return image_tensor

# 특징 벡터 추출 함수
def extract_features(image_folder):
    model = resnet50(pretrained=True)
    model.eval()
    features = []
    image_paths = []

    for img_filename in os.listdir(image_folder):
        img_path = os.path.join(image_folder, img_filename)
        if not img_path.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue
        image_tensor = image_to_tensor(img_path)
        with torch.no_grad():
            feature = model(image_tensor)
        features.append(feature[0].numpy().flatten())
        image_paths.append(img_path)

    return np.array(features), image_paths

# A 폴더와 B 폴더에서 이미지 특징 추출
features_A, paths_A = extract_features('images_naver_fashion')
features_B, paths_B = extract_features('images')

# FAISS 인덱스 생성 및 훈련
index = faiss.IndexFlatL2(features_B.shape[1])
index.add(features_B)

'''
# A 폴더의 각 이미지에 대해 가장 유사한 B 폴더의 이미지 찾기
D, I = index.search(features_A, k=1)  # k=1은 가장 가까운 이웃 1개 찾기

for i, idx in enumerate(I):
    print(f"A 폴더의 이미지 {paths_A[i]}는 B 폴더의 이미지 {paths_B[idx[0]]}와 가장 유사합니다.")
'''    
    
# A 폴더의 각 이미지에 대해 상위 5개의 가장 유사한 B 폴더의 이미지 찾기
k = 5  # 상위 k개의 유사한 이미지를 찾습니다.
D, I = index.search(features_A, k=k)

# 결과를 저장할 빈 리스트 초기화
results = []

# 각 A 폴더 이미지에 대한 상위 k개 유사 이미지 및 거리(유사도) 저장
for i, indices in enumerate(I):
    distances = D[i]
    for rank, (index, distance) in enumerate(zip(indices, distances)):
        results.append({
            'Naver_image': paths_A[i],
            'Amazon_image': paths_B[index],
            'similarity': 1 / (1 + distance),  
            'rank': rank + 1  # 순위 (1~5)
        })

# 결과를 데이터 프레임으로 변환
results_df = pd.DataFrame(results)

# 결과 출력
print(results_df)



                                            Naver_image  \
0     images_naver_fashion\1001355052912710587649172...   
1     images_naver_fashion\1001355052912710587649172...   
2     images_naver_fashion\1001355052912710587649172...   
3     images_naver_fashion\1001355052912710587649172...   
4     images_naver_fashion\1001355052912710587649172...   
...                                                 ...   
5800  images_naver_fashion\9931421784253931421832813...   
5801  images_naver_fashion\9931421784253931421832813...   
5802  images_naver_fashion\9931421784253931421832813...   
5803  images_naver_fashion\9931421784253931421832813...   
5804  images_naver_fashion\9931421784253931421832813...   

               Amazon_image  similarity  rank  
0     images\B01DKAGUA0.jpg    0.000329     1  
1     images\B013ULNDG8.jpg    0.000325     2  
2     images\B017QSZHMY.jpg    0.000313     3  
3     images\B004F1GSWY.jpg    0.000310     4  
4     images\B00NNUH1TW.jpg    0.000299     5  
...

In [113]:
results_df['Naver_image'] = results_df['Naver_image'].str.replace('images_naver_fashion\\', '', regex=False)
results_df['Amazon_image'] = results_df['Amazon_image'].str.replace('images\\', '', regex=False)


In [120]:
results_df  ### 아직 원본 데이터 셋과 merge를 안한 상태입니다.

Unnamed: 0,Naver_image,Amazon_image,similarity,rank
0,10013550529127105876491720_1440235704.jpg,B01DKAGUA0.jpg,0.000329,1
1,10013550529127105876491720_1440235704.jpg,B013ULNDG8.jpg,0.000325,2
2,10013550529127105876491720_1440235704.jpg,B017QSZHMY.jpg,0.000313,3
3,10013550529127105876491720_1440235704.jpg,B004F1GSWY.jpg,0.000310,4
4,10013550529127105876491720_1440235704.jpg,B00NNUH1TW.jpg,0.000299,5
...,...,...,...,...
5800,993142178425393142183281329_1644392658.jpg,B00LEOT2C8.jpg,0.000292,1
5801,993142178425393142183281329_1644392658.jpg,B00ASLJXMC.jpg,0.000282,2
5802,993142178425393142183281329_1644392658.jpg,B01GUAGRFK.jpg,0.000272,3
5803,993142178425393142183281329_1644392658.jpg,B01GUBMNO8.jpg,0.000272,4
