In [1]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm_notebook as tqdm 

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error # loss function 

In [13]:
ratings_df = pd.read_csv('data/ratings.csv', encoding='utf-8' , index_col=0)
movies_df = pd.read_csv('data/result_movie.csv',encoding='utf-8',index_col=0)

In [3]:
user_ids = sorted(list(set(ratings_df['userId_id'].values)))
movie_ids = sorted(list(set(ratings_df['movieId'].values)))

In [4]:
sparse_matrix = pd.DataFrame(index=movie_ids , columns=user_ids)

In [5]:
sparse_matrix = ratings_df.pivot(index='movieId',columns='userId_id',values='rating')

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

def cossim_matrix(a, b):
    cossim_values = cosine_similarity(a.values, b.values)
    cossim_df = pd.DataFrame(data=cossim_values, columns = a.index.values, index=a.index)

    return cossim_df

In [7]:
item_sparse_matrix = sparse_matrix.fillna(0)

In [115]:
item_cossim_df = cossim_matrix(item_sparse_matrix, item_sparse_matrix)
item_cossim_df

Unnamed: 0_level_0,1,2,3,5,6,7,9,10,11,14,...,134368,134853,138036,139385,142488,148626,152081,164179,166528,176371
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.410562,0.296917,0.308762,0.376316,0.277491,0.232586,0.395573,0.323976,0.178144,...,0.093093,0.306236,0.164749,0.190320,0.187171,0.195092,0.192923,0.188209,0.174998,0.140250
2,0.410562,1.000000,0.282438,0.287795,0.297009,0.228576,0.044835,0.417693,0.322252,0.099373,...,0.133123,0.214336,0.169038,0.186162,0.140917,0.166895,0.215054,0.194696,0.233863,0.203355
3,0.296917,0.282438,1.000000,0.417802,0.284257,0.402831,0.304840,0.242954,0.249568,0.176544,...,0.023965,0.047096,0.015985,0.065771,0.075475,0.048176,0.039672,0.053996,0.056977,0.078187
5,0.308762,0.287795,0.417802,1.000000,0.298969,0.474002,0.335058,0.218061,0.272182,0.255333,...,0.077755,0.091252,0.052843,0.073468,0.046137,0.102923,0.045206,0.055172,0.061720,0.045416
6,0.376316,0.297009,0.284257,0.298969,1.000000,0.244105,0.214088,0.386414,0.289365,0.287064,...,0.052090,0.153317,0.099389,0.167784,0.126655,0.150291,0.095166,0.213196,0.177203,0.174638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148626,0.195092,0.166895,0.048176,0.102923,0.150291,0.044639,0.016727,0.083328,0.074458,0.000000,...,0.348503,0.431625,0.336723,0.450637,0.543839,1.000000,0.260862,0.471627,0.312948,0.485044
152081,0.192923,0.215054,0.039672,0.045206,0.095166,0.032749,0.000000,0.075879,0.047165,0.000000,...,0.400781,0.601073,0.382704,0.343479,0.291736,0.260862,1.000000,0.365101,0.443488,0.360249
164179,0.188209,0.194696,0.053996,0.055172,0.213196,0.052497,0.016702,0.126859,0.066301,0.000000,...,0.216599,0.344605,0.299578,0.474764,0.427710,0.471627,0.365101,1.000000,0.504255,0.508258
166528,0.174998,0.233863,0.056977,0.061720,0.177203,0.055538,0.016429,0.127093,0.072004,0.000000,...,0.145697,0.374077,0.426355,0.393330,0.307928,0.312948,0.443488,0.504255,1.000000,0.453601


In [9]:
userId_grouped = ratings_df.groupby('userId_id')
item_prediction_result_df = pd.DataFrame(index=list(userId_grouped.indices.keys()),
                                         columns=item_sparse_matrix.index)
item_prediction_result_df

movieId,1,2,3,5,6,7,9,10,11,14,...,134368,134853,138036,139385,142488,148626,152081,164179,166528,176371
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [10]:
for userId, group in tqdm(userId_grouped):
    # user가 rating한 movieId * 전체 movieId
    user_sim = item_cossim_df.loc[group['movieId']]
    # user가 rating한 movieId * 1
    user_rating = group['rating']
    # 전체 movieId * 1
    sim_sum = user_sim.sum(axis=0).map(lambda x : 1 if x==0 else x)
    #나눗셈을해줄때 0으로 나눠지는 것을 방지하기 위해서 0인 것들은 1로 바꿔줍니다.(0으로 나눠지면 NaN 발생)
    
    # 행렬 곱 진행 : 유사도 행렬(8938 , n번 유저의 평 갯수) matmul 평점행렬(n번 유저의 평 갯수, 1)
    # => ( 8938 , n ) mX (n , 1) => (8938 , 1) => 모든 영화에대한 예측평점
    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum)
    item_prediction_result_df.loc[userId] = pred_ratings

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))




In [15]:
item_base = item_prediction_result_df.loc[1].sort_values(ascending=False)[:20]

In [17]:
item_base_movieId = item_base.index
item_base_score = item_base.values

In [20]:
item_base_df = movies_df.loc[item_base_movieId]

In [25]:
item_base_df = pd.concat([item_base_df,item_base] ,axis=1)

In [28]:
item_base_df.rename(columns={1 : 'pred_socre'}, inplace=True)

In [29]:
item_base_df

Unnamed: 0_level_0,title_en,title_ko,genres,year,vote_average,vote_count,director,actor,poster,link,...,title_ko,genres,year,vote_average,vote_count,director,actor,poster,link,pred_socre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
96821,The Perks of Being a Wallflower (2012),월플라워,Drama|Romance,2012,7.7,3056.0,스티븐 크보스키,"['엠마 왓슨', '로건 레먼', '에즈라 밀러']",https://ssl.pstatic.net/imgmovie/mdi/mit110/08...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,...,월플라워,Drama|Romance,2012,7.7,3056.0,스티븐 크보스키,"['엠마 왓슨', '로건 레먼', '에즈라 밀러']",https://ssl.pstatic.net/imgmovie/mdi/mit110/08...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,4.442533
3435,Double Indemnity (1944),이중 배상,Crime|Drama|Film-Noir,1944,8.0,425.0,빌리 와일더,"['프레드 맥머레이', '바바라 스탠윅']",https://ssl.pstatic.net/imgmovie/mdi/mit110/02...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,...,이중 배상,Crime|Drama|Film-Noir,1944,8.0,425.0,빌리 와일더,"['프레드 맥머레이', '바바라 스탠윅']",https://ssl.pstatic.net/imgmovie/mdi/mit110/02...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,4.433549
101,Bottle Rocket (1996),바틀 로켓,Adventure|Comedy|Crime|Romance,1996,6.8,285.0,웨스 앤더슨,"['오웬 윌슨', '루크 윌슨', '로버트 머스그레이브', '앤드류 윌슨', '루미...",https://ssl.pstatic.net/imgmovie/mdi/mit110/01...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,...,바틀 로켓,Adventure|Comedy|Crime|Romance,1996,6.8,285.0,웨스 앤더슨,"['오웬 윌슨', '루크 윌슨', '로버트 머스그레이브', '앤드류 윌슨', '루미...",https://ssl.pstatic.net/imgmovie/mdi/mit110/01...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,4.426744
112552,Whiplash (2014),위플래쉬,Drama,2014,8.3,4376.0,데이미언 셔젤,"['마일즈 텔러', 'J.K. 시몬스']",https://ssl.pstatic.net/imgmovie/mdi/mit110/11...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,...,위플래쉬,Drama,2014,8.3,4376.0,데이미언 셔젤,"['마일즈 텔러', 'J.K. 시몬스']",https://ssl.pstatic.net/imgmovie/mdi/mit110/11...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,4.424879
3504,Network (1976),네트워크,Comedy|Drama,1976,7.8,391.0,시드니 루멧,"['페이 더너웨이', '윌리엄 홀든', '피터 핀치', '로버트 듀발']",https://ssl.pstatic.net/imgmovie/mdi/mit110/01...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,...,네트워크,Comedy|Drama,1976,7.8,391.0,시드니 루멧,"['페이 더너웨이', '윌리엄 홀든', '피터 핀치', '로버트 듀발']",https://ssl.pstatic.net/imgmovie/mdi/mit110/01...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,4.423323
2146,St. Elmo's Fire (1985),세인트 엘모의 열정,Drama|Romance,1985,6.0,133.0,조엘 슈마허,"['에밀리오 에스테베즈', '로브 로우', '앤드류 맥카시', '데미 무어', '쥬...",https://ssl.pstatic.net/imgmovie/mdi/mit110/01...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,...,세인트 엘모의 열정,Drama|Romance,1985,6.0,133.0,조엘 슈마허,"['에밀리오 에스테베즈', '로브 로우', '앤드류 맥카시', '데미 무어', '쥬...",https://ssl.pstatic.net/imgmovie/mdi/mit110/01...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,4.42293
111659,Maleficent (2014),말레피센트,Action|Adventure|Children|IMAX,2014,7.0,4607.0,로버트 스트롬버그,"['안젤리나 졸리', '엘르 패닝', '샬토 코플리', '주노 템플']",https://ssl.pstatic.net/imgmovie/mdi/mit110/09...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,...,말레피센트,Action|Adventure|Children|IMAX,2014,7.0,4607.0,로버트 스트롬버그,"['안젤리나 졸리', '엘르 패닝', '샬토 코플리', '주노 템플']",https://ssl.pstatic.net/imgmovie/mdi/mit110/09...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,4.420235
84944,Rango (2011),랭고,Action|Adventure|Animation|Children|Comedy|Wes...,2011,6.6,2094.0,고어 버빈스키,['조니 뎁'],https://ssl.pstatic.net/imgmovie/mdi/mit110/07...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,...,랭고,Action|Adventure|Animation|Children|Comedy|Wes...,2011,6.6,2094.0,고어 버빈스키,['조니 뎁'],https://ssl.pstatic.net/imgmovie/mdi/mit110/07...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,4.417303
4039,Annie (1982),애니,Children|Musical,1982,6.2,203.0,존 휴스턴,"['알버트 피니', '캐롤 버넷', '버나뎃 피터스', '앤 레인킹', '팀 커리'...",https://ssl.pstatic.net/imgmovie/mdi/mit110/01...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,...,애니,Children|Musical,1982,6.2,203.0,존 휴스턴,"['알버트 피니', '캐롤 버넷', '버나뎃 피터스', '앤 레인킹', '팀 커리'...",https://ssl.pstatic.net/imgmovie/mdi/mit110/01...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,4.417043
91529,The Dark Knight Rises (2012),다크 나이트 라이즈,Action|Adventure|Crime|IMAX,2012,7.6,9263.0,크리스토퍼 놀란,"['크리스찬 베일', '조셉 고든 레빗', '게리 올드만', '앤 해서웨이', '톰...",https://ssl.pstatic.net/imgmovie/mdi/mit110/07...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,...,다크 나이트 라이즈,Action|Adventure|Crime|IMAX,2012,7.6,9263.0,크리스토퍼 놀란,"['크리스찬 베일', '조셉 고든 레빗', '게리 올드만', '앤 해서웨이', '톰...",https://ssl.pstatic.net/imgmovie/mdi/mit110/07...,https://movie.naver.com/movie/bi/mi/basic.nhn?...,4.416915


In [30]:
item_base_list = [i[1] for i in item_base_df.iterrows()]

In [31]:
item_base_list

[title_en                   The Perks of Being a Wallflower (2012)
 title_ko                                                     월플라워
 genres                                              Drama|Romance
 year                                                         2012
 vote_average                                             7.700000
 vote_count                                            3056.000000
 director                                                 스티븐 크보스키
 actor                                ['엠마 왓슨', '로건 레먼', '에즈라 밀러']
 poster          https://ssl.pstatic.net/imgmovie/mdi/mit110/08...
 link            https://movie.naver.com/movie/bi/mi/basic.nhn?...
 title_en                   The Perks of Being a Wallflower (2012)
 title_ko                                                     월플라워
 genres                                              Drama|Romance
 year                                                         2012
 vote_average                                             7.70

In [111]:
user_sparse_matrix = sparse_matrix.fillna(0).transpose()

In [113]:
user_cossim_df = cossim_matrix(user_sparse_matrix, user_sparse_matrix)

In [114]:
user_cossim_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.037869,0.100261,0.223379,0.163606,0.175822,0.184722,0.144432,0.099703,0.022937,...,0.089545,0.188110,0.353374,0.088316,0.194791,0.244831,0.302467,0.343953,0.123091,0.204099
2,0.037869,1.000000,0.000000,0.005731,0.020680,0.039334,0.036683,0.033332,0.000000,0.107118,...,0.252059,0.022381,0.022257,0.000000,0.000000,0.053098,0.018158,0.065925,0.040379,0.166747
3,0.100261,0.000000,1.000000,0.006076,0.010961,0.010722,0.000000,0.010601,0.000000,0.000000,...,0.011860,0.011388,0.079182,0.000000,0.029323,0.037624,0.047353,0.052881,0.000000,0.071728
4,0.223379,0.005731,0.006076,1.000000,0.160527,0.109622,0.117678,0.067370,0.019533,0.049605,...,0.100738,0.122595,0.415711,0.053860,0.105461,0.297748,0.157972,0.181060,0.035484,0.158953
5,0.163606,0.020680,0.010961,0.160527,1.000000,0.400008,0.131564,0.479141,0.000000,0.044396,...,0.083210,0.461289,0.145935,0.311521,0.212313,0.162884,0.180409,0.172473,0.349434,0.098930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.244831,0.053098,0.037624,0.297748,0.162884,0.160673,0.287317,0.151689,0.119193,0.149795,...,0.262156,0.174857,0.471086,0.101970,0.231771,1.000000,0.224499,0.407753,0.107218,0.366555
607,0.302467,0.018158,0.047353,0.157972,0.180409,0.207452,0.210317,0.214378,0.018566,0.012994,...,0.102961,0.233052,0.290518,0.142287,0.172670,0.224499,1.000000,0.312274,0.224847,0.198808
608,0.343953,0.065925,0.052881,0.181060,0.172473,0.246527,0.368084,0.220108,0.128477,0.105790,...,0.164856,0.233383,0.355384,0.183694,0.235092,0.407753,0.312274,1.000000,0.171446,0.440799
609,0.123091,0.040379,0.000000,0.035484,0.349434,0.345728,0.129822,0.526249,0.000000,0.037151,...,0.051307,0.467669,0.096345,0.328547,0.163953,0.107218,0.224847,0.171446,1.000000,0.093558


In [120]:
movieId_grouped = ratings_df.groupby('movieId')
user_prediction_result_df = pd.DataFrame(index=list(movieId_grouped.indices.keys()),
                                         columns=user_sparse_matrix.index)
user_prediction_result_df

userId_id,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148626,,,,,,,,,,,...,,,,,,,,,,
152081,,,,,,,,,,,...,,,,,,,,,,
164179,,,,,,,,,,,...,,,,,,,,,,
166528,,,,,,,,,,,...,,,,,,,,,,


In [122]:
for movieId, group in movieId_grouped:
    user_sim = user_cossim_df.loc[group['userId_id']]
    user_rating = group['rating']
    sim_sum = user_sim.sum(axis=0).map(lambda x : 1 if x==0 else x)

    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / sim_sum
    user_prediction_result_df.loc[movieId] = pred_ratings

In [130]:
user_prediction_result_df.loc[610].sort_values(ascending=False)

KeyError: 610

In [131]:
user_prediction_result_df.T.loc[610].sort_values(ascending=False)

318     4.409045
3468    4.400733
1104    4.354565
3435    4.353866
1209    4.343475
          ...   
2450    1.922251
1381    1.905786
1499    1.898549
2643    1.719972
1556    1.602090
Name: 610, Length: 1323, dtype: object