In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from datetime import datetime, date
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
FILES_DIR = './files/'
events = pd.read_csv(FILES_DIR + 'sampled_events.csv')
product = pd.read_csv(FILES_DIR + 'sampled_products.csv')
users = pd.read_csv(FILES_DIR + 'sampled_users.csv')

In [5]:
drop_events = events.copy()
drop_events['mobile_brand_name'].fillna('un_brand_name', inplace=True)
drop_events.drop('mobile_marketing_name', axis=1, inplace=True)
drop_events.drop('mobile_model_name', axis=1, inplace=True)
drop_events.drop('operating_system_version', axis=1, inplace=True)
drop_events['rating'] = drop_events['event_name']
re_name = {'rating' : {'click_item':1, 'like_item':2, 'add_to_cart':3, 'purchase_success':4}}
drop_events = drop_events.replace(re_name)
drop_events.dropna(axis=0, inplace=True)
df_events = drop_events.copy()

total_df = pd.merge(df_events, users)
total_df = pd.merge(total_df, product)
total_df['birth_date'].fillna('un_birth_date', inplace=True)
total_df['gender'].fillna('un_gender', inplace=True)
total_df = total_df.dropna()
total_df.isnull().sum()

session_id           0
event_timestamp      0
event_name           0
user_no              0
item_no              0
device_type          0
mobile_brand_name    0
country              0
region               0
platform             0
rating               0
birth_date           0
gender               0
item_name            0
image_name           0
price                0
category1_code       0
category1_name       0
category2_code       0
category2_name       0
category3_code       0
category3_name       0
brand_no             0
brand_name           0
dtype: int64

In [6]:
user_no_group = pd.DataFrame({'count': total_df.groupby(['user_no', 'item_no']).size()})
user_no_group = user_no_group.reset_index()

user_no_group2 = total_df.groupby('user_no')['rating'].mean().reset_index()

user_no_group3 = pd.merge(user_no_group, user_no_group2, on='user_no', how='inner')

user_no_group3 = user_no_group3

user_no_group_df = user_no_group3.sort_values(by='rating', ascending=False)

# user_no_group_df = user_no_group_df.drop_duplicates(subset='user_no', keep='first')

user_no_group_df = user_no_group_df[user_no_group_df['count'] == 1]

user_no_group_df

Unnamed: 0,user_no,item_no,count,rating
118897,/e8MGKbl2wrC6qVJH+xM0Q==,Q0FgPI5gYPwi7sDRXYc2MA==,1,4.0
2878860,cJAyFas00J9NS0TrZfxu6Q==,1XZmTSXmTqyJd/e6YBpmJA==,1,4.0
2879605,cJrsLRCgtjdx7xT/V7vXuA==,/Ey/MRX6ThAjT9sGJfzGcw==,1,4.0
2879604,cJrCXqJAGsZtlVUpoeXtIg==,lEvGMbq4GpxxBdkFj317yQ==,1,4.0
2879571,cJmct2gEhoQu7nznOUXQwg==,/xeYMMr7eiWkLBXdjUetxg==,1,4.0
...,...,...,...,...
399349,3bh7V6rsJGrL4UwBOncvXA==,40hT9tQe+16oGaGgcNxnVg==,1,1.0
399351,3bh7V6rsJGrL4UwBOncvXA==,5JxTT0eYwoTJZUNinqmjzg==,1,1.0
399352,3bh7V6rsJGrL4UwBOncvXA==,5hZ+SkJkBUlB6WCwADoagg==,1,1.0
399353,3bh7V6rsJGrL4UwBOncvXA==,61p1cujNu2KF5DJqsvlCWA==,1,1.0


In [10]:
user_no_group_df = user_no_group_df.sample(frac=0.003, random_state=42)
user_no_group_df.shape

(11168, 4)

In [11]:
# train, test split
x = user_no_group_df.copy()
y = user_no_group_df['user_no']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# x_train.drop_duplicates(subset=['user_no', 'item_no'], inplace=True)


def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model):
  id_pairs = zip(x_test['user_no'], x_test['item_no'])
  y_pred = np.array([model(user, item) for (user, item) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)


In [13]:


rating_matrix = user_no_group_df.pivot(index='user_no', columns='item_no', values='rating')
# 아이템 기반 유사도 측정
rating_matrix_t = np.transpose(rating_matrix)
matrix_dummy = rating_matrix_t.copy().fillna(0)
item_similarity = pd.DataFrame(cosine_similarity(matrix_dummy), index=rating_matrix_t.index, columns=rating_matrix_t.index)

def CF_IBCF(user_no, item_no):
  if item_no in item_similarity:
    sim_scores = item_similarity[item_no]
    user_rating = rating_matrix_t[user_no]
    non_rating_idx = user_rating[user_rating.isnull()].index
    user_rating = user_rating.dropna()
    sim_scores = sim_scores.drop(non_rating_idx)
    mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
  else:
    mean_rating = 2.5
  
  return mean_rating
  
print('아이템 기반 협업 필터링 방식의 정확도:', score(CF_IBCF))

아이템 기반 협업 필터링 방식의 정확도: 3.4906560976408407e-17
