In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
#전처리된 리뷰데이터Json파일 데이터프레임으로 불러오기
review_data = pd.read_json('preprocessed_review_data.json')

In [22]:
prod_data = pd.read_json('preprocessed_prod_data.json')

In [4]:
#상품 리스트
prod_id_ls = list(review_data['prod_idx'].unique())
len(prod_id_ls)

9834

In [5]:
#유저 리스트
member_id_ls = list(review_data['memberSrl'].unique())
len(member_id_ls)

58322

In [6]:
#상품X유저 데이터프레임 생성(index = 상품ID, columns = 유저ID)
df = pd.DataFrame(index=prod_id_ls, columns=member_id_ls)

In [7]:
#상품X유저 데이터프레임에 유저별 상품만족도 삽입
for mem_id in member_id_ls :
    is_memberSrl = review_data['memberSrl'] == mem_id
    mem_prod_ls = list(review_data[is_memberSrl]['prod_idx'])
    mem_satis_ls = list(review_data[is_memberSrl]['prod_satis'])
    for i,j in zip(mem_prod_ls,mem_satis_ls) :
        df[mem_id][i] = j

In [8]:
df = df.fillna(0)

In [9]:
#생성된 데이터프레임 df를 통해 모델링 진행 
df

Unnamed: 0,101994422,5016629,298123,80235,17873545,70477358,21627189,20400529,10205573,2707693,...,61043278,64185254,12447653,376355,35356381,73024842,5851257,41814697,40265865,49325577
2865723582,5,5,5,5,5,5,4,4,5,5,...,0,0,0,0,0,0,0,0,0,0
512851998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
760761146,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
212815213,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3030712126,0,0,0,0,0,0,4,0,0,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764649526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3518409054,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3173194022,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4292984610,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 특정 상품 기반 추천

In [11]:
#상품 기반 cosine 유사도 파악
prod_sim = cosine_similarity(df, df)
print(prod_sim.shape)

(9834, 9834)


In [12]:
prod_sim_df = pd.DataFrame(data = prod_sim, index = df.index, columns = df.index)

In [13]:
#특정 상품과 유사한 상품 추천
recommend_prod_list = prod_sim_df[2865723582].sort_values(ascending=False)[1:10]

3030712126    0.143433
3223462394    0.109907
4125298018    0.096968
2669973758    0.093817
2976219878    0.081475
638703986     0.080751
2796301094    0.080009
2961545794    0.077265
3014562378    0.077000
Name: 2865723582, dtype: float64

In [21]:
recommend_prod_list = list(prod_sim_df[2865723582].sort_values(ascending=False)[1:10].index)

In [23]:
recommend_prod_titles = prod_data[prod_data['prod_idx'].isin(recommend_prod_list)]['title']
recommend_prod_titles

4        [봉팔형님] 밤꿀고구마 중 1kg 외 꿀고구마 4종 크기별 모음 / 2개 구매시 3...
5             [컬러푸드] 봉팔형님 국내산 햇 밤꿀고구마 & 호박고구마 3kg 5kg 10kg
9          [퍼스트위크] 티몬블랙딜 첫수확 정품 해남 햇 꿀밤고구마 한입 2kg / 특상 1kg
1207     [퍼스트위크] 티몬블랙딜 오쿡 닭가슴살 1+1+1+1 외 33종 2900원 균일가 ...
1217             흘러넘치는 고구마 왕돈까스 200g빅사이즈 트리플치즈/프리미엄/통등심1등급
7355      [참미] 총각김치2kg + 열무1kg 외 베스트 모음 / 2건이상 구매시반찬(랜덤증정)
14659             [퍼스트위크] 히트상품 [신한수산] 20%선착순쿠폰+국내산 왕새우 1kg
17987      [퍼스트위크] 히트상품 국가대표 스낵 990원 균일가 골라담기/라면/음료/젤리/대용식
18002    [퍼스트위크] 티몬블랙딜 따끈한 꿀백설기 21개외 노마진세일+2세트/3세트이상구매시...
Name: title, dtype: object

### 개인 추천

In [26]:
df2 = df.transpose()

In [27]:
df2

Unnamed: 0,2865723582,512851998,760761146,212815213,3030712126,638703986,1428676362,1804864666,412462214,4125298018,...,2773057954,4244345722,3840159202,2323253942,3450032938,2764649526,3518409054,3173194022,4292984610,3344589718
101994422,5,0,0,0,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5016629,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
298123,5,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
80235,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17873545,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73024842,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5851257,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41814697,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40265865,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
#유저 기반 cosine 유사도 파악
user_sim = cosine_similarity(df2, df2)
print(user_sim.shape)

MemoryError: Unable to allocate 25.3 GiB for an array with shape (58322, 58322) and data type float64

In [None]:
#유저 수 줄여야함.

In [None]:
user_sim_df = pd.DataFrame(data = user_sim, index = df2.index, columns = df2.index)

In [None]:
recommend_user_list = list(user_sim_df[101994422].sort_values(ascending=False)[1:10].index)

In [None]:
#유저리스트 목록 뽑고 가장 상관계수가 높은 유저가 산 제품 중 타겟 고객이 사지 않은 제품들 추천.