In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from datetime import datetime, date
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
FILES_DIR = './files/'
events = pd.read_csv(FILES_DIR + 'sampled_events.csv')
product = pd.read_csv(FILES_DIR + 'sampled_products.csv')
users = pd.read_csv(FILES_DIR + 'sampled_users.csv')

In [4]:
events_df = events.copy()
events_sample = events_df.sample(frac=0.005, random_state=42)
drop_events = events_sample.copy()
drop_events['mobile_brand_name'].fillna('un_brand_name', inplace=True)
drop_events.drop('mobile_marketing_name', axis=1, inplace=True)
drop_events.drop('mobile_model_name', axis=1, inplace=True)
drop_events.drop('operating_system_version', axis=1, inplace=True)
drop_events['rating'] = drop_events['event_name']
re_name = {'rating' : {'click_item':1, 'like_item':2, 'add_to_cart':3, 'purchase_success':4}}
drop_events = drop_events.replace(re_name)
drop_events.dropna(axis=0, inplace=True)
df_events = drop_events.copy()
df_events

Unnamed: 0,session_id,event_timestamp,event_name,user_no,item_no,device_type,mobile_brand_name,country,region,platform,rating
1916969,434cf67167c5becc6321d03bdecd6c60,2021-06-24 09:25:17.097027,click_item,qwTrxFylir2ynPu9N7/g/w==,/ZasVJSuvWNKGQXzKT0uCg==,mobile,Apple,South Korea,Seoul,IOS,1
4178142,23288f3e8d799799645beb4bff6c35ad,2021-07-16 22:38:19.169111,click_item,OXbjPQaxKL7O8HoSMzJtiQ==,/YlZZk+EM3eAaLCGdgyiag==,mobile,Apple,South Korea,Seoul,IOS,1
644115,dc7c3da68a67b1f43a2226ea2733bebf,2021-06-09 23:25:54.245213,click_item,1jhscZ7MrX7w6sojiWm8cw==,TByIyj8aQeeoE2OnGWr4NQ==,mobile,Samsung,South Korea,Seoul,ANDROID,1
2047331,19422b436106a1cbc45bbe12bc59d11a,2021-06-25 22:37:42.032078,click_item,6+Uf31nW96b3VC61EfO4SQ==,F+25W5Hy9tVYnBlffA2Cmw==,mobile,Apple,South Korea,Chungcheongnam-do,IOS,1
2679735,1a963aba0fb154be005ab97ddbf780aa,2021-07-02 08:51:53.27311,click_item,Zfy9Vola1OIOtuunlXev+A==,VoEeDect6p+wMXMoE5zAkw==,mobile,Samsung,South Korea,Seoul,ANDROID,1
...,...,...,...,...,...,...,...,...,...,...,...
594921,4b2b5f6db5589fa7f736b0bd31f87685,2021-06-09 14:33:30.903094,click_item,3df2n7oVw8oZ3MuLxK5KYg==,GD6DtsasqVJamffP55apQA==,mobile,Apple,South Korea,Gyeonggi-do,IOS,1
1256645,288d50850a8b0e43f2f2e6226a33e53a,2021-06-17 08:53:13.703111,click_item,2d9zQSt+DExdfvP/mj3Juw==,OCVwIGJhWNYNIwAxdI/Lag==,mobile,Samsung,South Korea,Seoul,ANDROID,1
4173205,95cdce3d28de536e37dc837cedb58f21,2021-07-16 21:55:49.05804,add_to_cart,fxAptePuBEdVPekHLnmFnw==,mAznRa3p9hv4rhWkunWMFQ==,mobile,Samsung,South Korea,Gyeonggi-do,ANDROID,3
3952661,5efe98cef53495a0ccec290e57e367fa,2021-07-14 17:17:28.635075,click_item,0h0lKPYrJcPeQlwIjKRohg==,tKZFh9pXCjI7VKLT3Pt1tw==,mobile,Apple,South Korea,Jeollabuk-do,IOS,1


In [5]:
today = pd.Timestamp.now().floor('D')
users['birth_date'] = pd.to_datetime(users['birth_date'], errors='coerce')
users = users[~pd.isnull(users['birth_date'])] # remove rows with missing birth date
users['age'] = ((today - users['birth_date']).dt.days / 365.25).astype(int)

def age_category(age):
    age = (age // 10) * 10
    return age

users['age'] = users.age.apply(age_category)

In [81]:
total_df = pd.merge(df_events, users)
total_df = pd.merge(total_df, product)
total_df['birth_date'].fillna('un_birth_date', inplace=True)
total_df['gender'].fillna('un_gender', inplace=True)
total_df.isnull().sum()

session_id           0
event_timestamp      0
event_name           0
user_no              0
item_no              0
device_type          0
mobile_brand_name    0
country              0
region               0
platform             0
rating               0
birth_date           0
gender               0
age                  0
item_name            0
image_name           0
price                0
category1_code       0
category1_name       0
category2_code       0
category2_name       0
category3_code       0
category3_name       0
brand_no             0
brand_name           0
dtype: int64

In [None]:
user_group = pd.DataFrame({'count': total_df.groupby(['user_no', 'item_no', 'brand_name']).size()})
user_group = user_group.reset_index()

user_group2 = total_df.groupby('user_no')['rating'].mean().reset_index()

user_group3 = pd.merge(user_group, user_group2, on='user_no', how='inner')

user_group3 = user_group3

user_group_df = user_group3.sort_values(by='rating', ascending=False)

user_group_df = user_group_df.drop_duplicates(subset='user_no', keep='first')

user_group_df

In [46]:
session_group = pd.DataFrame({'count': total_df.groupby(['session_id', 'item_no']).size()})
session_group = session_group.reset_index()

session_group2 = total_df.groupby('session_id')['rating'].mean().reset_index()

session_group3 = pd.merge(session_group, session_group2, on='session_id', how='inner')

session_group3 = session_group3

session_group_df = session_group3.sort_values(by='rating', ascending=False)

# session_group_df = session_group_df.drop_duplicates(subset='session_id', keep='first')

session_group_df = session_group_df[session_group_df['count'] == 1]

session_group_df

Unnamed: 0,session_id,item_no,count,rating
1714385,5cf592a299f6ff84d724e0594a0d7ef4,WDsocKZikCnNJFjhJBozaA==,1,4.0
1702362,5c4d1191554537837b70f89dd63887d4,rPIBSlM0NFKc7uh+jx2Wqg==,1,4.0
1702360,5c4d050a796298657c0948930c541bde,eaeqS5+rrkxpqCq2FsnQUQ==,1,4.0
1702359,5c4d050a796298657c0948930c541bde,W8LPoAwGzuJMcbGs2W1n8Q==,1,4.0
1702351,5c4cefaa9f01e6322f76fa87268916cc,+8yssy/qbgiZZR+kBcCIoA==,1,4.0
...,...,...,...,...
1943904,6984c2eb12e0e67ec556546711849873,uzg+DozwumHulRYb8dpv9g==,1,1.0
1943905,6984ce93cf6b8d9e2e263aec4fff7a27,96tDZ6snmsCepS9wT/j2LA==,1,1.0
1943906,6984d8f94c790e425c01c32c982c0a72,q+KkbP9a7V0tnfukM2XmtA==,1,1.0
1943907,6984db5297dddccf67c27b75aa0325cc,DmBAWBNcFSL/FPsERDph1w==,1,1.0


In [47]:
session_group_df = session_group_df.sample(frac=0.005, random_state=42)
session_group_df.shape

(20408, 4)

In [49]:
# train, test split
x = session_group_df.copy()
y = session_group_df['session_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# x_train.drop_duplicates(subset=['session_id', 'item_no'], inplace=True)


def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model, neighbor_size=0): # neighbor_size 파라미터 추가
  id_pairs = zip(x_test['session_id'], x_test['item_no'])
  y_pred = np.array([model(user, item, neighbor_size) for (user, item) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)



rating_matrix = x_train.pivot(index='session_id', columns='item_no', values='rating')

MemoryError: Unable to allocate 1.35 GiB for an array with shape (15109, 11961) and data type float64

In [48]:
# rating_matrix = total_df.pivot_table(index='user_no', columns='item_no', values='rating') 
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = pd.DataFrame(cosine_similarity(matrix_dummy), index=rating_matrix.index, columns=rating_matrix.index)

MemoryError: Unable to allocate 4.59 GiB for an array with shape (20688, 29788) and data type float64

In [None]:
def CF_knn(user_no, item_no, neighbor_size=0):
  if item_no in rating_matrix:
    sim_scores = user_similarity[user_no].copy()
    itme_ratings = rating_matrix[item_no].copy()
    none_rating_idx = itme_ratings[itme_ratings.isnull()].index 
    itme_ratings = itme_ratings.dropna()
    sim_scores = sim_scores.drop(none_rating_idx)
    
    if neighbor_size == 0:
      mean_rating = np.dot(sim_scores, itme_ratings) / sim_scores.sum()
    else:
      if len(sim_scores) > 1:
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        itme_ratings = np.array(itme_ratings)
        user_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[user_idx][-neighbor_size:]
        itme_ratings = itme_ratings[user_idx][-neighbor_size:]
        mean_rating = np.dot(sim_scores, itme_ratings) / sim_scores.sum()
      else:
        mean_rating = 2.5

  else:
    mean_rating = 2.5
  
  return mean_rating

print('KNN 방식의 정확도:', score(CF_knn, neighbor_size=5))

KeyError: 'dwQQJVq8Mk+Z443d4U8UUg=='

In [57]:
for neighbor_size in [10, 15, 20, 25, 35, 40]:
  print("이웃의 크기 = %d : RMSE = %.4f" % (neighbor_size, score(CF_knn, neighbor_size)))

이웃의 크기 = 10 : RMSE = 1.1361
이웃의 크기 = 15 : RMSE = 1.1366
이웃의 크기 = 20 : RMSE = 1.1368
이웃의 크기 = 25 : RMSE = 1.1369
이웃의 크기 = 35 : RMSE = 1.1366
이웃의 크기 = 40 : RMSE = 1.1365


In [None]:
drop_events = events.copy()
drop_events['mobile_brand_name'].fillna('un_brand_name', inplace=True)
drop_events.drop('mobile_marketing_name', axis=1, inplace=True)
drop_events.drop('mobile_model_name', axis=1, inplace=True)
drop_events.drop('operating_system_version', axis=1, inplace=True)
drop_events['rating'] = drop_events['event_name']
re_name = {'rating' : {'click_item':1, 'like_item':2, 'add_to_cart':3, 'purchase_success':4}}
drop_events = drop_events.replace(re_name)
drop_events.dropna(axis=0, inplace=True)
df_events = drop_events.copy()
df_events

In [5]:
today = pd.Timestamp.now().floor('D')
users['birth_date'] = pd.to_datetime(users['birth_date'], errors='coerce')
users = users[~pd.isnull(users['birth_date'])] # remove rows with missing birth date
users['age'] = ((today - users['birth_date']).dt.days / 365.25).astype(int)

def age_category(age):
    age = (age // 10) * 10
    return age

users['age'] = users.age.apply(age_category)

In [None]:
total_df = pd.merge(df_events, users)
total_df = pd.merge(total_df, product)
total_df['birth_date'].fillna('un_birth_date', inplace=True)
total_df['gender'].fillna('un_gender', inplace=True)
total_df.isnull().sum()

In [27]:
session_group = pd.DataFrame({'count': total_df.groupby(['session_id', 'item_no']).size()})
session_group = session_group.reset_index()

session_group2 = total_df.groupby('session_id')['rating'].mean().reset_index()

session_group3 = pd.merge(session_group, session_group2, on='session_id', how='inner')

session_group3 = session_group3

session_group_df = session_group3.sort_values(by='rating', ascending=False)

# session_group_df = session_group_df.drop_duplicates(subset='session_id', keep='first')

session_group_df = session_group_df[session_group_df['count'] == 1]

session_group_df

Unnamed: 0,session_id,item_no,count,rating
1714385,5cf592a299f6ff84d724e0594a0d7ef4,WDsocKZikCnNJFjhJBozaA==,1,4.0
1702362,5c4d1191554537837b70f89dd63887d4,rPIBSlM0NFKc7uh+jx2Wqg==,1,4.0
1702360,5c4d050a796298657c0948930c541bde,eaeqS5+rrkxpqCq2FsnQUQ==,1,4.0
1702359,5c4d050a796298657c0948930c541bde,W8LPoAwGzuJMcbGs2W1n8Q==,1,4.0
1702351,5c4cefaa9f01e6322f76fa87268916cc,+8yssy/qbgiZZR+kBcCIoA==,1,4.0
...,...,...,...,...
1943904,6984c2eb12e0e67ec556546711849873,uzg+DozwumHulRYb8dpv9g==,1,1.0
1943905,6984ce93cf6b8d9e2e263aec4fff7a27,96tDZ6snmsCepS9wT/j2LA==,1,1.0
1943906,6984d8f94c790e425c01c32c982c0a72,q+KkbP9a7V0tnfukM2XmtA==,1,1.0
1943907,6984db5297dddccf67c27b75aa0325cc,DmBAWBNcFSL/FPsERDph1w==,1,1.0


In [28]:
session_group_df = session_group_df.sample(frac=0.001, random_state=42)
session_group_df.shape

(4082, 4)

In [29]:
train_df, test_df = train_test_split(session_group_df, test_size=0.25, random_state=42)



sparse_matrix = session_group_df.groupby('item_no').apply(lambda x: pd.Series(x['rating'].values, index=x['session_id'])).unstack()
sparse_matrix.index.name = 'item_no'


sparse_matrix

session_id,0003fac10771961b4122f63b235deca1,000880dc37473c67f0f7991e9e169c67,00144477e798c0dfaa99a3abd969b8a1,0015c11109350e05cf9ac973c553643a,002939f1a041b00788eb0025e8f0bb0c,003ec3e44d546b67f6b5c512238d4304,003fefa34ba4d26b15bb45d552579976,0054cfd230a0da103de44bdaa57cc598,0059c2ce05319e1832c4fb9adf08d255,005db8308069cc2c22b7f6b0692fbcbd,...,ff0867625d4e17c87b90ecae39b7ba90,ff106e6c37bdf72c1903080b91d5b2c8,ff33e64b46c061c3501190b4f3e0cb87,ff43c056d628b4dfd6b54a113c98d32e,ff451b4a456150649bc16b979c93a383,ff687d1f9128fab378796dcd1baf863c,ff9f03face3211192de5d9f9490f3191,ffa2b78f0ed5549a95ed857ab5016965,ffbfa87e54d81f2cb5401f129c69f577,ffd5af5dda445b914bc9c2daa70d2f41
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
+/q5sKxV232w5KMN4Ijt9g==,,,,,,,,,,,...,,,,,,,,,,
+3FJt9tJqizkJlKj2CRsKw==,,,,,,,,,,,...,,,,,,,,,,
+3XEtFLRFtOIVkQWUwO4kg==,,,,,,,,,,,...,,,,,,,,,,
+3lq6mO8vsqH8aJyVyOeXA==,,,,,,,,,,,...,,,,,,,,,,
+4CDn2otRz9zaU8LarSAdQ==,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zqHr0O3ZhZXHB6PdQiJZbw==,,,,,,,,,,,...,,,,,,,,,,
zssjxb2jveySzAZLd5IuvQ==,,,,,,,,,,,...,,,,,,,,,,
ztM6baNM0JewDL2ounZh8A==,,,,,,,,,,,...,,,,,,,,,,
ztZE91n1Ku+4YozoWdNBlQ==,,,,,,,,,,,...,,,,,,,,,,


In [30]:
sparse_matrix.nunique(axis=0, dropna=True)

session_id
0003fac10771961b4122f63b235deca1    1
000880dc37473c67f0f7991e9e169c67    1
00144477e798c0dfaa99a3abd969b8a1    1
0015c11109350e05cf9ac973c553643a    1
002939f1a041b00788eb0025e8f0bb0c    1
                                   ..
ff687d1f9128fab378796dcd1baf863c    1
ff9f03face3211192de5d9f9490f3191    1
ffa2b78f0ed5549a95ed857ab5016965    1
ffbfa87e54d81f2cb5401f129c69f577    1
ffd5af5dda445b914bc9c2daa70d2f41    1
Length: 4063, dtype: int64

In [31]:
def cossim_matrix(a, b):
    cossim_values = cosine_similarity(a.values, b.values)
    cossim_df = pd.DataFrame(data=cossim_values, columns = a.index.values, index=a.index)

    return cossim_df

In [32]:
item_sparse_matrix = sparse_matrix.fillna(0)
item_cossim_df = cossim_matrix(item_sparse_matrix, item_sparse_matrix)

In [33]:
item_cossim_df = cossim_matrix(item_sparse_matrix, item_sparse_matrix)
item_cossim_df

Unnamed: 0_level_0,+/q5sKxV232w5KMN4Ijt9g==,+3FJt9tJqizkJlKj2CRsKw==,+3XEtFLRFtOIVkQWUwO4kg==,+3lq6mO8vsqH8aJyVyOeXA==,+4CDn2otRz9zaU8LarSAdQ==,+4cs+Uppa4EbNMY+e5WjHw==,+4mdbnnJRgvHUM9czNqSTQ==,+53zKkxZMq/PPeySZMRugQ==,+5W1mAOeJMvhPXISwioqrQ==,+5X4mYx4jw0xP63kCnS39g==,...,zo7Ol7HNkwTgVS6toeBFCg==,zofLsa+q1xGDf3mpG3rN8Q==,zostcyePAU51M10/YKRRGQ==,zpS2LVOo6hw74pIg8OE1yw==,zq4jsHQo1lnhAtF+WvC6rA==,zqHr0O3ZhZXHB6PdQiJZbw==,zssjxb2jveySzAZLd5IuvQ==,ztM6baNM0JewDL2ounZh8A==,ztZE91n1Ku+4YozoWdNBlQ==,zyNpGfb153RW0Imlf8pTSw==
item_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
+/q5sKxV232w5KMN4Ijt9g==,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+3FJt9tJqizkJlKj2CRsKw==,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+3XEtFLRFtOIVkQWUwO4kg==,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+3lq6mO8vsqH8aJyVyOeXA==,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+4CDn2otRz9zaU8LarSAdQ==,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zqHr0O3ZhZXHB6PdQiJZbw==,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
zssjxb2jveySzAZLd5IuvQ==,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
ztM6baNM0JewDL2ounZh8A==,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
ztZE91n1Ku+4YozoWdNBlQ==,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [34]:
session_id_grouped = train_df.groupby('session_id')

item_prediction_result_df = pd.DataFrame(index=list(session_id_grouped.indices.keys()), columns=item_sparse_matrix.index)
item_prediction_result_df

item_no,+/q5sKxV232w5KMN4Ijt9g==,+3FJt9tJqizkJlKj2CRsKw==,+3XEtFLRFtOIVkQWUwO4kg==,+3lq6mO8vsqH8aJyVyOeXA==,+4CDn2otRz9zaU8LarSAdQ==,+4cs+Uppa4EbNMY+e5WjHw==,+4mdbnnJRgvHUM9czNqSTQ==,+53zKkxZMq/PPeySZMRugQ==,+5W1mAOeJMvhPXISwioqrQ==,+5X4mYx4jw0xP63kCnS39g==,...,zo7Ol7HNkwTgVS6toeBFCg==,zofLsa+q1xGDf3mpG3rN8Q==,zostcyePAU51M10/YKRRGQ==,zpS2LVOo6hw74pIg8OE1yw==,zq4jsHQo1lnhAtF+WvC6rA==,zqHr0O3ZhZXHB6PdQiJZbw==,zssjxb2jveySzAZLd5IuvQ==,ztM6baNM0JewDL2ounZh8A==,ztZE91n1Ku+4YozoWdNBlQ==,zyNpGfb153RW0Imlf8pTSw==
0003fac10771961b4122f63b235deca1,,,,,,,,,,,...,,,,,,,,,,
00144477e798c0dfaa99a3abd969b8a1,,,,,,,,,,,...,,,,,,,,,,
002939f1a041b00788eb0025e8f0bb0c,,,,,,,,,,,...,,,,,,,,,,
003ec3e44d546b67f6b5c512238d4304,,,,,,,,,,,...,,,,,,,,,,
003fefa34ba4d26b15bb45d552579976,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff451b4a456150649bc16b979c93a383,,,,,,,,,,,...,,,,,,,,,,
ff687d1f9128fab378796dcd1baf863c,,,,,,,,,,,...,,,,,,,,,,
ffa2b78f0ed5549a95ed857ab5016965,,,,,,,,,,,...,,,,,,,,,,
ffbfa87e54d81f2cb5401f129c69f577,,,,,,,,,,,...,,,,,,,,,,


In [35]:
for session_id, group in tqdm(session_id_grouped):
    # user가 rating한 movieId * 전체 movieId
    session_id_sim = item_cossim_df.loc[group['item_no']]
    # user가 rating한 movieId * 1
    user_rating = group['rating']
    # 전체 movieId * 1
    sim_sum = session_id_sim.sum(axis=0)

    # userId의 전체 rating predictions (8938 * 1)
    pred_ratings = np.matmul(session_id_sim.T.to_numpy(), user_rating) / (sim_sum+1)
    item_prediction_result_df.loc[session_id] = pred_ratings

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for session_id, group in tqdm(session_id_grouped):


  0%|          | 0/3048 [00:00<?, ?it/s]

In [36]:
item_prediction_result_df.head(10)

item_no,+/q5sKxV232w5KMN4Ijt9g==,+3FJt9tJqizkJlKj2CRsKw==,+3XEtFLRFtOIVkQWUwO4kg==,+3lq6mO8vsqH8aJyVyOeXA==,+4CDn2otRz9zaU8LarSAdQ==,+4cs+Uppa4EbNMY+e5WjHw==,+4mdbnnJRgvHUM9czNqSTQ==,+53zKkxZMq/PPeySZMRugQ==,+5W1mAOeJMvhPXISwioqrQ==,+5X4mYx4jw0xP63kCnS39g==,...,zo7Ol7HNkwTgVS6toeBFCg==,zofLsa+q1xGDf3mpG3rN8Q==,zostcyePAU51M10/YKRRGQ==,zpS2LVOo6hw74pIg8OE1yw==,zq4jsHQo1lnhAtF+WvC6rA==,zqHr0O3ZhZXHB6PdQiJZbw==,zssjxb2jveySzAZLd5IuvQ==,ztM6baNM0JewDL2ounZh8A==,ztZE91n1Ku+4YozoWdNBlQ==,zyNpGfb153RW0Imlf8pTSw==
0003fac10771961b4122f63b235deca1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00144477e798c0dfaa99a3abd969b8a1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002939f1a041b00788eb0025e8f0bb0c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
003ec3e44d546b67f6b5c512238d4304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
003fefa34ba4d26b15bb45d552579976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0059c2ce05319e1832c4fb9adf08d255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
005db8308069cc2c22b7f6b0692fbcbd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
005f03c053a6c204a6288b79d9562538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0084c6416265194f2c3e90f5c0b0ce89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
008ba6581bee62934e6532274fa3de20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
def evaluate(test_df, prediction_result_df):
  groups_with_movie_ids = test_df.groupby(by='item_no')
  groups_with_user_ids = test_df.groupby(by='session_id')
  intersection_movie_ids = sorted(list(set(list(prediction_result_df.columns)).intersection(set(list(groups_with_movie_ids.indices.keys())))))
  intersection_user_ids = sorted(list(set(list(prediction_result_df.index)).intersection(set(groups_with_user_ids.indices.keys()))))

  print(len(intersection_movie_ids))
  print(len(intersection_user_ids))

  compressed_prediction_df = prediction_result_df.loc[intersection_user_ids][intersection_movie_ids]
  # compressed_prediction_df

  # test_df에 대해서 RMSE 계산
  grouped = test_df.groupby(by='session_id')
  result_df = pd.DataFrame(columns=['rmse'])
  for session_id, group in tqdm(grouped):
      if session_id in intersection_user_ids:
          pred_ratings = compressed_prediction_df.loc[session_id][compressed_prediction_df.loc[session_id].index.intersection(list(group['item_no'].values))]
          pred_ratings = pred_ratings.to_frame(name='rating').reset_index().rename(columns={'index':'item_no','rating':'pred_rating'})
          actual_ratings = group[['rating', 'item_no']].rename(columns={'rating':'actual_rating'})

          final_df = pd.merge(actual_ratings, pred_ratings, how='inner', on=['item_no'])
          final_df = final_df.round(4) # 반올림

          # if not final_df.empty:
          #     rmse = sqrt(mean_squared_error(final_df['rating_actual'], final_df['rating_pred']))
          #     result_df.loc[session_id] = rmse
          #     # print(session_id, rmse)
    
  return final_df

In [39]:
evaluate(test_df, item_prediction_result_df)

1000
6


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for session_id, group in tqdm(grouped):


  0%|          | 0/1021 [00:00<?, ?it/s]

Unnamed: 0,actual_rating,item_no,pred_rating
0,1.0,6XaV8phNvEmjJ4X0Yc85Sw==,0.5


In [40]:
result_df = evaluate(test_df, item_prediction_result_df)
print(result_df)
print(f"RMSE: {sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))}")

1000
6


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for session_id, group in tqdm(grouped):


  0%|          | 0/1021 [00:00<?, ?it/s]

   actual_rating                   item_no pred_rating
0            1.0  6XaV8phNvEmjJ4X0Yc85Sw==         0.5
RMSE: 0.5
