# Item-base 實作

In [52]:
import pandas as pd
import numpy as np
import numpy.matlib 
import random
import sys
from sklearn.metrics.pairwise import cosine_similarity

# 輸入參數內容
# 1~968
target_user = 169
# 1~5
like_number_threshold = 3 
# 15~25
compare_movie_threshold = 25
# 5~15
amount_of_recommended_movies = 10

# 資料載入
data_header = ["user_id", "item_id", "rating", "timestamp"]
data_pd = pd.read_csv("u.data.data", sep = '\t', header = None, names=data_header)

movie_header = ["item_id", "title", "release_date", "video_release_date", "IMDb_URL",
         "unknown", "Action", "Adventure", "Animation","Children's", "Comedy", "Crime",
         "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", 
         "Romance", "Sci-Fi", "Thriller", "War", "Western"]
movies = pd.read_csv('u.item.item', sep = '|', header = None, encoding = 'latin1', names = movie_header)

print('資料載入完成')

# 選出符合條件的電影
condition1 = data_pd['user_id'] == target_user
condition2 = data_pd['rating'] > like_number_threshold
user_like_pd = data_pd[condition1 & condition2]

# 假如使用者沒有任何一部電影資料就回覆 空陣列
if(len(user_like_pd) == 0):
    sys.exit('強制結束');

# print('選出符合條件的電影完成', user_like_pd)

def output_matrix(data_pd):
    # user-item martix 建構
    user_id_max = data_pd['user_id'].max()
    item_id_max = data_pd['item_id'].max()

    rank_matrix = np.matlib.zeros((item_id_max, user_id_max))

    # 對應 rating 填表
    for i in range(len(data_pd)):
        data = data_pd.iloc[i]
        # id 都是從 1 開始所以要減一，才會對應到    
        rank_matrix[data['item_id'] - 1, data['user_id'] - 1] = data['rating']

    print('rating matrix 填表完成')

    # movie_cos_sim
    movie_cos_sim = cosine_similarity(rank_matrix)

    print('cosine similarity 完成')
    
    return movie_cos_sim

movie_cos_sim = output_matrix(data_pd)

def sum_movie_cosine_similarity(user_like_pd, movie_cos_sim):
    pd_count_number = len(user_like_pd)
    print('使用者喜愛的電影數量: ', pd_count_number)

    total_movie_cosine_similarity = np.matlib.zeros((1, item_id_max))
    
    # 假如使用者少於 compare_movie_threshold 部，就利用已有電影資料去加總
    if(pd_count_number < compare_movie_threshold):
        print('使用全部加總取得所有電影相似值')
        for i in range(0, pd_count_number - 1):
            _item_id = user_like_pd.iloc[i]['item_id']
            total_movie_cosine_similarity += movie_cos_sim[_item_id - 1] # 減一才是該對影對應的值


    # 假如使用者有大於等於 compare_movie_threshold 部電影資料，隨機挑選 compare_movie_threshold 部加總
    if(pd_count_number >= compare_movie_threshold):
        print('使用隨機加總取得所有電影相似值')
        for i in range(0, compare_movie_threshold):
            radom_number = random.randint(0,pd_count_number - 1)
            _item_id = user_like_pd.iloc[radom_number]['item_id']
            print('item_id:', _item_id, movie_cos_sim[_item_id].sum())
            total_movie_cosine_similarity += movie_cos_sim[_item_id - 1] # 減一才是該對影對應的值

    total_movie_cosine_similarity_pd = pd.DataFrame(total_movie_cosine_similarity[0])
    total_movie_cosine_similarity_pd = total_movie_cosine_similarity_pd.T
    return total_movie_cosine_similarity_pd


total_movie_cosine_similarity_pd = sum_movie_cosine_similarity(user_like_pd, movie_cos_sim)

print('找到使用者喜愛電影相似度加總')

def movie_cos_without_user_like(user_like_pd, total_movie_cosine_similarity_pd):
    # 將看到的影片相似度轉為 0
    user_like_item_list = user_like_pd['item_id'].values
    # 轉為 index 位置
    user_like_item_list = user_like_item_list -1

    for item_index in user_like_item_list:
        total_movie_cosine_similarity_pd[0][item_index] = 0
    
    # 只取得 0 以上的內容
    movie_similarity_pd = total_movie_cosine_similarity_pd[total_movie_cosine_similarity_pd > 0]
    movie_similarity_pd = movie_similarity_pd.dropna()
    
    return movie_similarity_pd

clear_movie_cos = movie_cos_without_user_like(user_like_pd, total_movie_cosine_similarity_pd)

# 排名高到低
clear_movie_cos = clear_movie_cos.sort_values(by=0, ascending=False)

print('將使用者喜愛影片轉為0，並只保留0以上的值')

# print('clear_movie_cos: \n', clear_movie_cos)

top_movies = movies.filter(clear_movie_cos[0:amount_of_recommended_movies].index.tolist(), axis=0)

print('取得前 amount_of_recommended_movies 部推薦電影')

# print('top_movies: \n', top_movies)

recommands = pd.merge(top_movies, clear_movie_cos, left_index=True, right_index=True)
recommands = recommands.rename(columns={0: 'cosine_similarity'})
recommands.filter(['cosine_similarity', 'title'])


資料載入完成
rating matrix 填表完成
cosine similarity 完成
使用者喜愛的電影數量:  24
使用全部加總取得所有電影相似值
找到使用者喜愛電影相似度加總
將使用者喜愛影片轉為0，並只保留0以上的值
取得前 amount_of_recommended_movies 部推薦電影


Unnamed: 0,cosine_similarity,title
203,10.819339,Back to the Future (1985)
97,10.783065,"Silence of the Lambs, The (1991)"
78,10.607759,"Fugitive, The (1993)"
55,10.37448,Pulp Fiction (1994)
209,10.369905,Indiana Jones and the Last Crusade (1989)
422,10.328321,E.T. the Extra-Terrestrial (1982)
194,10.309506,"Terminator, The (1984)"
68,10.197102,Forrest Gump (1994)
182,10.184262,Alien (1979)
131,10.138235,"Wizard of Oz, The (1939)"


#  AI 修改版本

In [6]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics.pairwise import cosine_similarity

# 輸入參數內容
# 1~968
target_user = 169
# 1~5
like_number_threshold = 3 
# 15~25
compare_movie_threshold = 25
# 5~15
amount_of_recommended_movies = 10

# 資料載入
data_header = ["user_id", "item_id", "rating", "timestamp"]
data_pd = pd.read_csv("u.data.data", sep = '\t', header = None, names=data_header)

movie_header = ["item_id", "title", "release_date", "video_release_date", "IMDb_URL",
         "unknown", "Action", "Adventure", "Animation","Children's", "Comedy", "Crime",
         "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", 
         "Romance", "Sci-Fi", "Thriller", "War", "Western"]
movies = pd.read_csv('u.item.item', sep = '|', header = None, encoding = 'latin1', names = movie_header)

print('資料載入完成')

# 選出符合條件的電影
condition1 = data_pd['user_id'] == target_user
condition2 = data_pd['rating'] > like_number_threshold
user_like_pd = data_pd[condition1 & condition2]

# 假如使用者沒有任何一部電影資料就終止程式
if user_like_pd.empty:
    sys.exit('強制結束')

print('選出符合條件的電影完成', user_like_pd)

# user-item martix 建構
user_id_max = data_pd['user_id'].max()
item_id_max = data_pd['item_id'].max()

rank_matrix = np.zeros((item_id_max, user_id_max))

# 對應 rating 填表
for i in range(len(data_pd)):
    data = data_pd.iloc[i]
    # id 都是從 1 開始所以要減一，才會對應到    
    rank_matrix[data['item_id'] - 1, data['user_id'] - 1] = data['rating']

print('rating matrix 填表完成')

# movie_cos_sim
movie_cos_sim = cosine_similarity(rank_matrix)

print('cosine similarity 完成')

# 加總使用者喜愛的電影的相似度
pd_count_number = len(user_like_pd)
print('使用者喜愛的電影數量: ', pd_count_number)

total_movie_cosine_similarity = np.zeros((1, item_id_max))

# 假如使用者少於 compare_movie_threshold 部，就利用已有電影資料去加總
if pd_count_number < compare_movie_threshold:
    for i in range(pd_count_number):
        data = user_like_pd.iloc[i]
        total_movie_cosine_similarity += movie_cos_sim[data['item_id'] - 1]

# 假如使用者超過 compare_movie_threshold 部，就隨機選取電影去加總
else:
    # 隨機選取 compare_movie_threshold 部電影
    random_index_list = random.sample(range(pd_count_number), compare_movie_threshold)
    for i in random_index_list:
        data = user_like_pd.iloc[i]
        total_movie_cosine_similarity += movie_cos_sim[data['item_id'] - 1]

print('相似度加總完成')

# 相似度排序，並取出前 amount_of_recommended_movies 部電影
recommands = pd.DataFrame(total_movie_cosine_similarity[0], columns=['cosine_similarity'])
recommands = pd.merge(recommands, movies[['item_id']], left_index=True, right_index=True)
                                          
# 相似度排序，並取出前 amount_of_recommended_movies 部電影
recommands = recommands.sort_values(by=['cosine_similarity'], ascending=False)
recommands = recommands.iloc[:amount_of_recommended_movies]

# 輸出推薦電影的標題
print('推薦電影列表:')
for i in range(len(recommands)):
    data = recommands.iloc[i]
    print(movies[movies['item_id'] == data['item_id']]['title'].values[0])

資料載入完成
選出符合條件的電影完成        user_id  item_id  rating  timestamp
4234       169      301       4  891268622
4576       169      174       4  891359418
5359       169      879       5  891268653
5997       169      603       5  891359171
6271       169      172       5  891359317
6893       169      181       5  891359276
10065      169      133       4  891359171
10979      169      127       4  891359354
12353      169      604       4  891359317
16011      169      705       5  891359354
17196      169      234       4  891359418
20358      169      211       5  891359200
20919      169      213       5  891359354
25578      169      684       5  891359354
27684      169      331       5  891268491
36069      169      480       4  891359137
37319      169      300       5  891268491
37861      169      258       5  891268552
47092      169      134       5  891359250
47570      169       50       5  891359250
64829      169      443       4  891359418
66045      169      538       4  89

# Item-base 細節

In [1]:
#粗略查看資料型態
#因方便辨識有重新命名檔案(不然都是u) 所以才會有u.data.data這種鬼東西
#因方便辨識有重新命名檔案(不然都是u) 所以才會有u.data.data這種鬼東西
#因方便辨識有重新命名檔案(不然都是u) 所以才會有u.data.data這種鬼東西

import pandas as pd
import numpy as np
import numpy.matlib 
import random

#u.data 評分資料
rating_header = ["user_id", "item_id", "rating", "timestamp"]
rating = pd.read_csv("u.data.data", sep = '\t', header = None, names=rating_header)
print(rating.head())
print('\n')
rating.info()
print("__________________________________________________________________________\n")

#u.item 電影資料
movie_header = ["item_id", "title", "release_date", "video_release_date", "IMDb_URL",
         "unknown", "Action", "Adventure", "Animation","Children's", "Comedy", "Crime",
         "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", 
         "Romance", "Sci-Fi", "Thriller", "War", "Western"]
movies = pd.read_csv('u.item.item', sep = '|', header = None, encoding = 'latin1', names = movie_header)
print(movies.head())
print('\n')
movies.info()
print("__________________________________________________________________________\n")

#u.user 使用者資料
user_header = ["user_id", "age", "gender", "occupation", "zip_code"]
user = pd.read_csv('u.user.user', sep = '|', header = None, encoding = 'latin1', names = user_header)
print(user.head())
print('\n')
user.info()
print("__________________________________________________________________________\n")

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100000 non-null  int64
 1   item_id    100000 non-null  int64
 2   rating     100000 non-null  int64
 3   timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB
__________________________________________________________________________

   item_id              title release_date  video_release_date  \
0        1   Toy Story (1995)  01-Jan-1995                 NaN   
1        2   GoldenEye (1995)  01-Jan-1995                 NaN   
2        3  Four Rooms (1995)  01-Jan-1995                 NaN   
3        4  Get Shorty (1995)  01-Jan-1995 

## 載入使用者和電影評分資料

In [2]:
from sklearn.metrics.pairwise import cosine_similarity

limit_data_count = 10000 # 怕讀取太久資料上限

data_pd = pd.read_csv("u.data.data", sep = '\t', header = None, names=rating_header)
data_pd = data_pd[:limit_data_count]

user_id_max = data_pd['user_id'].max()
item_id_max = data_pd['item_id'].max()
print('max user_id', user_id_max)
print('max item_id', item_id_max)
print('\n')

rank_matrix = np.matlib.zeros((item_id_max, user_id_max))
print(rank_matrix.shape)

data_pd

max user_id 389
max item_id 1529


(1529, 389)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
9995,74,150,3,888333458
9996,178,195,4,882826944
9997,321,527,3,879439763
9998,337,742,5,875184353


### 將 rating number 填入對應欄位

In [3]:
for i in range(len(data_pd)):
    data = data_pd.iloc[i]
    # id 都是從 1 開始所以要減一，才會對應到    
    rank_matrix[data['item_id'] - 1, data['user_id'] - 1] = data['rating']

### 確定填值正確

In [4]:
# 欄位值 - 1
rank_matrix[241,195]

3.0

### 計算出每個電影互相的 consine 值

In [5]:
movie_cos_sim = cosine_similarity(rank_matrix)

print(movie_cos_sim.shape)

(1529, 1529)


### 找出使用者喜歡的電影

In [6]:
target_user = 100
condition1 = data_pd['user_id'] == target_user

like_number_threshold = 3
condition2 = data_pd['rating'] > like_number_threshold

user_like_pd = data_pd[condition1 & condition2]
user_like_pd

Unnamed: 0,user_id,item_id,rating,timestamp
270,100,344,4,891374868
3175,100,355,4,891375313
4262,100,750,4,891375016
5142,100,302,4,891374528
7942,100,691,4,891375260
8793,100,316,5,891375313


### 利用喜歡的電影隨機五部去找出 對應總和最高的電影五部

In [7]:
like_movies = []

pd_count_number = len(user_like_pd)
print('pd_count_number: ', pd_count_number)

# 假如使用者沒有任何一部電影資料就回覆 空陣列
# if(pd_count_number == 0) :
#     return like_movies []


total_movie_cosine_similarity = np.matlib.zeros((1, item_id_max))

# 假如使用者少於五部，就利用已有電影資料去加總找最高五部
if(pd_count_number < 5):
    for i in range(0, pd_count_number - 1):
        print(i)
#         total_movie_cosine_similarity += movie_cos_sim[user_like_pd.iloc[_]]


# 假如使用者有大於等於五部電影資料，正常加總找最高五部
if(pd_count_number >= 5):
    for i in range(0, 5):
#         print(i)
        radom_number = random.randint(0,pd_count_number - 1)
        _item_id = user_like_pd.iloc[radom_number]['item_id']
        print(_item_id)
        print(movie_cos_sim[_item_id].sum())
        total_movie_cosine_similarity += movie_cos_sim[_item_id - 1] # 減一才是該對影對應的值

total_movie_cosine_similarity_pd = pd.DataFrame(total_movie_cosine_similarity[0])
total_movie_cosine_similarity_pd = total_movie_cosine_similarity_pd.T
print(total_movie_cosine_similarity)

pd_count_number:  6
302
38.04117588337469
344
14.353274032159973
750
39.25701092584433
691
73.82131443117547
344
14.353274032159973
[[0.12820637 0.20617059 0.08517389 ... 0.         0.         0.        ]]


In [8]:
total_movie_cosine_similarity_pd[0][749]

1.6125284685315293

### 使用者看過的電影相似度轉為 0

In [9]:
user_like_item_list = user_like_pd['item_id'].values
user_like_item_list = user_like_item_list -1
print(user_like_item_list)

for i in user_like_item_list:
    total_movie_cosine_similarity_pd[0][i] = 0

total_movie_cosine_similarity_pd

[343 354 749 301 690 315]


Unnamed: 0,0
0,0.128206
1,0.206171
2,0.085174
3,0.189110
4,0.000000
...,...
1524,0.000000
1525,0.000000
1526,0.000000
1527,0.000000


### 找到有相似度的電影

In [10]:
movie_similarity_pd = total_movie_cosine_similarity_pd[total_movie_cosine_similarity_pd > 0]
movie_similarity_pd = movie_similarity_pd.dropna()
movie_similarity_pd

Unnamed: 0,0
0,0.128206
1,0.206171
2,0.085174
3,0.189110
5,0.409286
...,...
1393,0.249688
1400,0.105830
1410,0.084747
1418,1.259213


### 找到排名高的電影項目

In [17]:
movie_similarity_pd = movie_similarity_pd.sort_values(by=0, ascending=False)
movie_similarity_pd[0:5]

Unnamed: 0,0
1418,1.259213
904,1.237565
569,1.000376
807,0.916212
265,0.807415


In [12]:
movies

top_5_movies = movie_similarity_pd[0:5]

top_5_movies.index.tolist()

[1418, 904, 569, 807, 265]

In [13]:
movies.filter(top_5_movies.index.tolist(), axis=0)

Unnamed: 0,item_id,title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1418,1419,Highlander III: The Sorcerer (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Highlander%20...,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
904,905,Great Expectations (1998),01-Jan-1998,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
569,570,Wyatt Earp (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Wyatt%20Earp%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
807,808,"Program, The (1993)",01-Jan-1993,,"http://us.imdb.com/M/title-exact?Program,%20Th...",0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
265,266,Kull the Conqueror (1997),29-Aug-1997,,http://us.imdb.com/M/title-exact?Kull+the+Conq...,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
print('原本使用者喜愛的電影 index：\n', user_like_pd['item_id'].values - 1)

recommands = pd.merge(top_5_movies, movies, left_index=True, right_index=True)
recommands = recommands.rename(columns={0: 'cosine_similarity'})
recommands.filter(['cosine_similarity', 'title'])
# recommands['cosine', 'title']

原本使用者喜愛的電影 index：
 [343 354 749 301 690 315]


Unnamed: 0,cosine_similarity,title
1418,1.259213,Highlander III: The Sorcerer (1994)
904,1.237565,Great Expectations (1998)
569,1.000376,Wyatt Earp (1994)
807,0.916212,"Program, The (1993)"
265,0.807415,Kull the Conqueror (1997)


In [16]:
# 小範例
vec1 = np.array([[1, 0, 0, 4],[0, 0, 7, 8] ,[9, 10, 3, 0]])
print(vec1.shape)
cos_sim = cosine_similarity(vec1)
print(cos_sim)

(3, 4)
[[1.         0.73010664 0.15835845]
 [0.73010664 1.         0.14331884]
 [0.15835845 0.14331884 1.        ]]
