# 연관 분석 기반 추천 목록 생성
- 지지도 Support
    - 위 데이터에서 맥주의 지지도 4/10 (맥주 4개 포함됨)
- 규칙 지지도 : 조건절 → 결과절이 같이 가는 규칙 지지도
    - 위 데이터에서 맥주를 사면 두루마리도 사는 규칙의 지지도 2/10
- 신뢰도 : 조건절 상황 하에 결과절 일어날 비율 (조건부확률 P(결과절|조건절))
    - 위 데이터에서 규칙 지지도 / 지지도 = 2/4
- 향상도 : A와의 관계가 고려되어 규칙이 성립되는 경우
    - - Lift A→B : 신뢰도 / 결과절의 지지도
    = P(결과절|조건절) / P(결과절) = 규칙 지지도 / (조건절 지지도 * 결과절 지지도)
    - 참고 : 1보다 커야 좋다

## 1. 데이터 셋 준비

### 데이터 불러오기

In [22]:
from google.colab import drive
drive.mount('/content/drive')

# csv 불러오기
import pandas as pd

movie_df = pd.read_csv('/content/drive/MyDrive/2307_추천시스템_101/movie_df.csv', index_col=0)
user_df = pd.read_csv('/content/drive/MyDrive/2307_추천시스템_101/user_df.csv', index_col=0)

rating_url = 'https://raw.githubusercontent.com/yoonkt200/python-data-analysis/master/data/ml-1m/ratings.dat'
rating_df = pd.read_csv(rating_url, names=['user_id', 'movie_id', 'rating', 'time'], delimiter='::', engine ='python')
rating_df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


### 장바구니 데이터 생성

* 유저가 영화를 평가한 시간을 순서대로 나열

In [2]:
sort_rating_df = rating_df.sort_values(['user_id', 'time'], ascending=True)
sort_rating_df.head()

Unnamed: 0,user_id,movie_id,rating,time
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103


* 2차원 배열의 데이터셋 (한 유저의 시청 리스트를 시간 순서대로 표현)

In [3]:
cart_dataset = list(sort_rating_df.groupby('user_id')['movie_id'].apply(list))

In [4]:
len(cart_dataset), len(cart_dataset[0])

(6040, 53)

## 2. 연관 분석 모델 사용

### 모델 학습

In [5]:
!pip install apriori apyori

Collecting apriori
  Downloading apriori-1.0.0.tar.gz (1.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting apyori
  Downloading apyori-1.1.2.tar.gz (8.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: apriori, apyori
  Building wheel for apriori (setup.py) ... [?25l[?25hdone
  Created wheel for apriori: filename=apriori-1.0.0-py3-none-any.whl size=2454 sha256=665ff1c7048d80ecbb49f2bdd7c2da8926f5ccc35d03fd285544aa9044423b7d
  Stored in directory: /root/.cache/pip/wheels/8c/fa/83/25b9cb17d884f97f2e62d97d0818bbed8117e89a6b09c37dc3
  Building wheel for apyori (setup.py) ... [?25l[?25hdone
  Created wheel for apyori: filename=apyori-1.1.2-py3-none-any.whl size=5954 sha256=08908507d5eb09039b2c754d1137c8525b9bba90f3ad2b2f5d1ca16ceec7f480
  Stored in directory: /root/.cache/pip/wheels/c4/1a/79/20f55c470a50bb3702a8cb7c94d8ada15573538c7f4baebe2d
Successfully built apriori apyori
Installing collected packages: apyori, aprior

In [6]:
from apyori import apriori #연관 분석을 확률 기반으로 가지치기 하는 라이브러리

In [7]:
sample_transactions = [
    ['손흥민', '시소코'],
    ['손흥민', '케인'],
    ['손흥민', '케인', '포체티노']
]

sample_result = apriori(sample_transactions)

In [8]:
sample_result

<generator object apriori at 0x7cf9e0297530>

In [9]:
list(sample_result)

[RelationRecord(items=frozenset({'손흥민'}), support=1.0, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'손흥민'}), confidence=1.0, lift=1.0)]),
 RelationRecord(items=frozenset({'시소코'}), support=0.3333333333333333, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'시소코'}), confidence=0.3333333333333333, lift=1.0)]),
 RelationRecord(items=frozenset({'케인'}), support=0.6666666666666666, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'케인'}), confidence=0.6666666666666666, lift=1.0)]),
 RelationRecord(items=frozenset({'포체티노'}), support=0.3333333333333333, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'포체티노'}), confidence=0.3333333333333333, lift=1.0)]),
 RelationRecord(items=frozenset({'손흥민', '시소코'}), support=0.3333333333333333, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'손흥민', '시소코'}), confidence=0.3333333333333333, lift

In [10]:
sample_result_2 = apriori(sample_transactions,
                          min_support=0.5, # 지지도 threshold
                          min_confidence=0.6, # 신뢰도 threshold
                          min_lift=1.0, # 향상도 threshold
                          max_length=2) # 규칙의 크기가 2 이하

In [11]:
sample_result_list = list(sample_result_2)
sample_result_list[0]

RelationRecord(items=frozenset({'손흥민'}), support=1.0, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'손흥민'}), confidence=1.0, lift=1.0)])

### 영화별 연관 목록 생성

In [12]:
cart_dataset

[[3186,
  1270,
  1721,
  1022,
  2340,
  1836,
  3408,
  2804,
  1207,
  1193,
  720,
  260,
  919,
  608,
  2692,
  1961,
  2028,
  3105,
  938,
  1035,
  1962,
  2018,
  150,
  1028,
  1097,
  914,
  1287,
  2797,
  2762,
  1246,
  661,
  2918,
  531,
  3114,
  2791,
  2321,
  1029,
  1197,
  594,
  2398,
  1545,
  527,
  595,
  2687,
  745,
  588,
  1,
  2355,
  2294,
  783,
  1566,
  1907,
  48],
 [1198,
  1210,
  1217,
  2717,
  1293,
  2943,
  1225,
  1193,
  318,
  3030,
  2858,
  1213,
  1945,
  1207,
  593,
  3095,
  3468,
  1873,
  515,
  1090,
  2501,
  3035,
  110,
  2067,
  3147,
  1247,
  3105,
  1357,
  1196,
  1957,
  1953,
  920,
  1834,
  1084,
  1962,
  3471,
  3654,
  3735,
  1259,
  1954,
  1784,
  2728,
  1968,
  1103,
  902,
  3451,
  3578,
  2852,
  3334,
  3068,
  265,
  2312,
  590,
  1253,
  3071,
  1244,
  3699,
  1955,
  1245,
  2236,
  3678,
  982,
  2194,
  2268,
  1442,
  3255,
  647,
  235,
  1096,
  1124,
  498,
  1246,
  3893,
  1537,
  1188,
  2396,

In [13]:
results = list(apriori(cart_dataset,
                      min_support=0.005, # 지지도 threshold
                      min_confidence=0.01, # 신뢰도 threshold
                      min_lift=1.5, # 향상도 threshold
                      max_length=2)) # 규칙의 크기가 2 이하
association_dict = {}

In [14]:
len(results)

1448238

In [15]:
for result in results:
  if len(result.items) == 2:
    items = [x for x in result.items]
    source = items[0]
    target = items[1]
    support = result.support

    if source not in association_dict:
      association_dict[source] = {}
      association_dict[source][target] = support
    else:
      if target not in association_dict[source]:
        association_dict[source][target] = support

In [16]:
association_dict

Output hidden; open in https://colab.research.google.com to view.

In [17]:
sort_association_dict = {}
for idx, ad in association_dict.items():
  sort_association_list = sorted(ad.items(), key=(lambda x: x[1]), reverse=True)
  sort_association_dict[idx] = [sa[0] for sa in list(sort_association_list[:5])]

In [20]:
sort_association_dict

{1: [1265, 2396, 356, 1197, 318],
 8: [1, 2628, 34, 480, 1097],
 16: [593, 608, 296, 2028, 1617],
 32: [2571, 1196, 589, 260, 1580],
 48: [588, 364, 595, 1, 1580],
 64: [356, 39, 1580, 539, 1265],
 65: [1580, 356, 1265, 2683, 260],
 72: [2858, 1265, 608, 296, 2997],
 73: [527, 318, 2028, 110, 2396],
 81: [50, 1196, 1213, 356, 2028],
 88: [356, 1580, 1265, 1923, 2683],
 89: [457, 589, 1580, 2571, 377],
 104: [1265, 1580, 356, 1196, 1],
 112: [2571, 1580, 480, 589, 1196],
 144: [2858, 1580, 480, 110, 589],
 160: [1580, 480, 2571, 589, 780],
 168: [1580, 480, 110, 1196, 2571],
 176: [2858, 2997, 1265, 1617, 608],
 208: [480, 1580, 1196, 648, 2571],
 216: [1580, 356, 1265, 2571, 2683],
 224: [1265, 356, 2396, 1580, 608],
 232: [2858, 608, 1265, 2396, 296],
 248: [1265, 356, 1580, 3255, 1923],
 256: [1580, 480, 780, 2571, 589],
 272: [608, 593, 296, 1196, 527],
 280: [593, 608, 457, 318, 589],
 288: [2571, 589, 1580, 480, 2916],
 296: [608, 593, 318, 2028, 1617],
 312: [1580, 1265, 1270, 11

## 3. 추천 목록에 추가하기

In [23]:
import pickle

In [24]:
with open('/content/drive/MyDrive/2307_추천시스템_101/user_rec_dict.pickle', 'rb') as f:
  user_rec_dict = pickle.load(f)

In [25]:
user_rec_dict

Output hidden; open in https://colab.research.google.com to view.

### 유저별 최근 시청목록 추출

In [26]:
split_bound = rating_df['time'].quantile(q=0.8)
train_df = rating_df[rating_df['time'] < split_bound]

In [27]:
user_watch_list_series = train_df.sort_values(['user_id', 'time'], ascending=False).groupby('user_id')['movie_id'].apply(list)

In [28]:
user_watch_list_series

user_id
635     [1884, 2303, 3198, 2966, 2240, 858, 1952, 1203...
636     [1690, 1391, 3156, 379, 1603, 1876, 3698, 1320...
637     [1379, 1278, 1378, 2919, 919, 2701, 1073, 2987...
638     [3863, 2261, 3684, 2193, 1587, 2005, 2111, 213...
639     [2987, 3114, 3157, 2762, 2093, 2670, 2394, 201...
                              ...                        
6036    [2807, 1862, 3574, 2643, 2091, 3401, 3573, 405...
6037    [968, 435, 2527, 2641, 1019, 2407, 2640, 2363,...
6038    [1183, 2700, 1419, 1354, 1296, 1223, 1136, 308...
6039    [912, 922, 1204, 1254, 1148, 858, 923, 1252, 2...
6040    [3168, 1077, 495, 3751, 1273, 535, 1674, 373, ...
Name: movie_id, Length: 5400, dtype: object

### 최근 시청 목록의 연관 추천

In [29]:
sort_association_dict

{1: [1265, 2396, 356, 1197, 318],
 8: [1, 2628, 34, 480, 1097],
 16: [593, 608, 296, 2028, 1617],
 32: [2571, 1196, 589, 260, 1580],
 48: [588, 364, 595, 1, 1580],
 64: [356, 39, 1580, 539, 1265],
 65: [1580, 356, 1265, 2683, 260],
 72: [2858, 1265, 608, 296, 2997],
 73: [527, 318, 2028, 110, 2396],
 81: [50, 1196, 1213, 356, 2028],
 88: [356, 1580, 1265, 1923, 2683],
 89: [457, 589, 1580, 2571, 377],
 104: [1265, 1580, 356, 1196, 1],
 112: [2571, 1580, 480, 589, 1196],
 144: [2858, 1580, 480, 110, 589],
 160: [1580, 480, 2571, 589, 780],
 168: [1580, 480, 110, 1196, 2571],
 176: [2858, 2997, 1265, 1617, 608],
 208: [480, 1580, 1196, 648, 2571],
 216: [1580, 356, 1265, 2571, 2683],
 224: [1265, 356, 2396, 1580, 608],
 232: [2858, 608, 1265, 2396, 296],
 248: [1265, 356, 1580, 3255, 1923],
 256: [1580, 480, 780, 2571, 589],
 272: [608, 593, 296, 1196, 527],
 280: [593, 608, 457, 318, 589],
 288: [2571, 589, 1580, 480, 2916],
 296: [608, 593, 318, 2028, 1617],
 312: [1580, 1265, 1270, 11

In [31]:
for user, values in user_watch_list_series.iteritems():
  if user in user_rec_dict:
    if len(values) > 4:
      watch_list = values[:5]
    else:
      watch_list = values

  recommendation_list = []
  for source in watch_list:
    if source in sort_association_dict.keys():
      recommendation_list.extend(sort_association_dict[source])

  user_rec_dict[user]['recent_watch_association_recommendations'] = recommendation_list
  user_rec_dict[user]['total_recommendations'] = set(list(user_rec_dict[user]['total_recommendations']) + recommendation_list)


  for user, values in user_watch_list_series.iteritems():


In [32]:
user_rec_dict

Output hidden; open in https://colab.research.google.com to view.

### 추천 목록 저장

In [33]:
recent_output_path = '/content/drive/MyDrive/2307_추천시스템_101/user_rec_dict_recent.pickle'

with open(recent_output_path, 'wb') as f:
  pickle.dump(user_rec_dict, f)