In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from datetime import datetime, date
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
FILES_DIR = './files/'
events = pd.read_csv(FILES_DIR + 'sampled_events.csv')
product = pd.read_csv(FILES_DIR + 'sampled_products.csv')
users = pd.read_csv(FILES_DIR + 'sampled_users.csv')

In [None]:
drop_events = events.copy()
drop_events['mobile_brand_name'].fillna('un_brand_name', inplace=True)
drop_events.drop('mobile_marketing_name', axis=1, inplace=True)
drop_events.drop('mobile_model_name', axis=1, inplace=True)
drop_events.drop('operating_system_version', axis=1, inplace=True)
drop_events['rating'] = drop_events['event_name']
re_name = {'rating' : {'click_item':1, 'like_item':2, 'add_to_cart':3, 'purchase_success':4}}
drop_events = drop_events.replace(re_name)
drop_events.dropna(axis=0, inplace=True)
df_events = drop_events.copy()

total_df = pd.merge(df_events, users)
total_df = pd.merge(total_df, product)
total_df['birth_date'].fillna('un_birth_date', inplace=True)
total_df['gender'].fillna('un_gender', inplace=True)
total_df = total_df.dropna()
total_df.isnull().sum()

# session_id

In [None]:
session_group = pd.DataFrame({'count': total_df.groupby(['session_id', 'item_no']).size()})
session_group = session_group.reset_index()

session_group2 = total_df.groupby('session_id')['rating'].mean().reset_index()

session_group3 = pd.merge(session_group, session_group2, on='session_id', how='inner')

session_group3 = session_group3

session_group_df = session_group3.sort_values(by='rating', ascending=False)

# session_group_df = session_group_df.drop_duplicates(subset='session_id', keep='first')

session_group_df = session_group_df[session_group_df['count'] == 1]

session_group_df

In [None]:
session_group_df = session_group_df.sample(frac=0.003, random_state=42)
session_group_df.shape

In [None]:
# train, test split
x = session_group_df.copy()
y = session_group_df['session_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# x_train.drop_duplicates(subset=['session_id', 'item_no'], inplace=True)


def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model, neighbor_size=0): # neighbor_size 파라미터 추가
  id_pairs = zip(x_test['session_id'], x_test['item_no'])
  y_pred = np.array([model(user, item, neighbor_size) for (user, item) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)


In [None]:
rating_matrix = session_group_df.pivot_table(index='session_id', columns='item_no', values='rating') 
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = pd.DataFrame(cosine_similarity(matrix_dummy), index=rating_matrix.index, columns=rating_matrix.index)

In [None]:
def CF_knn(session_id, item_no, neighbor_size=0):
  if item_no in rating_matrix:
    sim_scores = user_similarity[session_id].copy()
    itme_ratings = rating_matrix[item_no].copy()
    none_rating_idx = itme_ratings[itme_ratings.isnull()].index 
    itme_ratings = itme_ratings.dropna()
    sim_scores = sim_scores.drop(none_rating_idx)
    
    if neighbor_size == 0:
      mean_rating = np.dot(sim_scores, itme_ratings) / sim_scores.sum()
    else:
      if len(sim_scores) > 1:
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        itme_ratings = np.array(itme_ratings)
        user_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[user_idx][-neighbor_size:]
        itme_ratings = itme_ratings[user_idx][-neighbor_size:]
        mean_rating = np.dot(sim_scores, itme_ratings) / sim_scores.sum()
      else:
        mean_rating = 2.5

  else:
    mean_rating = 2.5
  
  return mean_rating

print('KNN 방식의 정확도:', score(CF_knn, neighbor_size=5))

In [None]:
for neighbor_size in [10, 20, 30, 40, 50]:
  print("이웃의 크기 = %d : RMSE = %.4f" % (neighbor_size, score(CF_knn, neighbor_size)))

# user_no

In [None]:
session_group = pd.DataFrame({'count': total_df.groupby(['user_no', 'item_no']).size()})
session_group = session_group.reset_index()

session_group2 = total_df.groupby('user_no')['rating'].mean().reset_index()

session_group3 = pd.merge(session_group, session_group2, on='user_no', how='inner')

session_group3 = session_group3

session_group_df = session_group3.sort_values(by='rating', ascending=False)

# session_group_df = session_group_df.drop_duplicates(subset='user_no', keep='first')

session_group_df = session_group_df[session_group_df['count'] == 1]

session_group_df

In [None]:
session_group_df = session_group_df.sample(frac=0.003, random_state=42)
session_group_df.shape

In [None]:
# train, test split
x = session_group_df.copy()
y = session_group_df['user_no']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# x_train.drop_duplicates(subset=['user_no', 'item_no'], inplace=True)


def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model, neighbor_size=0): # neighbor_size 파라미터 추가
  id_pairs = zip(x_test['user_no'], x_test['item_no'])
  y_pred = np.array([model(user, item, neighbor_size) for (user, item) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)


In [None]:
rating_matrix = session_group_df.pivot_table(index='user_no', columns='item_no', values='rating') 
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = pd.DataFrame(cosine_similarity(matrix_dummy), index=rating_matrix.index, columns=rating_matrix.index)

In [32]:
rating_matrix = session_group_df.pivot_table(index='user_no', columns='item_no', values='rating') 
rating_matrix = rating_matrix.copy().fillna(0)
user_similarity = pd.DataFrame(cosine_similarity(rating_matrix), index=rating_matrix.index, columns=rating_matrix.index)

In [33]:
def CF_knn(user_no, item_no, neighbor_size=0):
  if item_no in rating_matrix:
    sim_scores = user_similarity[user_no].copy()
    itme_ratings = rating_matrix[item_no].copy()
    none_rating_idx = itme_ratings[itme_ratings.isnull()].index 
    itme_ratings = itme_ratings.dropna()
    sim_scores = sim_scores.drop(none_rating_idx)
    
    if neighbor_size == 0:
      mean_rating = np.dot(sim_scores, itme_ratings) / sim_scores.sum()
    else:
      if len(sim_scores) > 1:
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        itme_ratings = np.array(itme_ratings)
        user_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[user_idx][-neighbor_size:]
        itme_ratings = itme_ratings[user_idx][-neighbor_size:]
        mean_rating = np.dot(sim_scores, itme_ratings) / sim_scores.sum()
      else:
        mean_rating = 2.5

  else:
    mean_rating = 2.5
  
  return mean_rating

print('KNN 방식의 정확도:', score(CF_knn, neighbor_size=5))

KNN 방식의 정확도: 0.24818138689439453


In [None]:
for neighbor_size in [10, 20, 30, 40, 50]:
  print("이웃의 크기 = %d : RMSE = %.4f" % (neighbor_size, score(CF_knn, neighbor_size)))

In [35]:
# 각 사용자의 평점평균 구하기
rating_mean = rating_matrix.mean(axis=1)
# 각 아이템의 평점을 각 사용자의 평균에서의 차이로 변환하기
rating_bias = (rating_matrix.T - rating_mean).T


rating_binary1 = np.array((rating_matrix>0).astype(float))
rating_binary2 = rating_binary1.T
counts = pd.DataFrame(np.dot(rating_binary1, rating_binary2), 
                      index=rating_matrix.index, columns=rating_matrix.index).fillna(0)

In [37]:
def CF_knn_bias_sig_adj(user_no, item_no, neighbor_size=0):
  if item_no in rating_bias:
    sim_scores = user_similarity[user_no].copy()
    item_ratings = rating_bias[item_no].copy()
    
    no_rating = item_ratings.isnull() # 해당 영화에 대해 평가하지 않은 사용자를 True로 표시
    common_counts = counts[user_no]
    low_significance = common_counts < SIG_LEVEL # 추천 대상과의 공통 평가 영화수가 SIG_LEVEL보다 낮은 사용자를 True 표시 
    none_rating_idx = item_ratings[no_rating | low_significance].index # no_rating이 True이거나 low_significance가 True인 사용자 표시
    
    item_ratings = item_ratings.dropna()
    sim_scores = sim_scores.drop(none_rating_idx)
    
    if neighbor_size == 0:
      prediction = np.dot(sim_scores, item_ratings) / sim_scores.sum()
      prediction = prediction + rating_mean[user_no]

    else:
      if len(sim_scores) > MIN_RATINGS: # 해당 영화를 평가한 사용자 수가 MIN_RATINGS보다 큰 경우 예측값 계산
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        item_ratings = np.array(item_ratings)
        user_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[user_idx][-neighbor_size:]
        item_ratings = item_ratings[user_idx][-neighbor_size:]
        prediction = np.dot(sim_scores, item_ratings) / sim_scores.sum()
        prediction = prediction + rating_mean[user_no]
      else:
        prediction = rating_mean[user_no]

  else:
    prediction = rating_mean[user_no]
  
  # 이 부분 추가
  if prediction < 1.0:	
    prediction = 1.0
  elif prediction > 4.0:
    prediction = 4.0
    
  return prediction
  
SIG_LEVEL = 3
MIN_RATINGS = 2
print('최종 예측 값 수정한 협업 필터링 방식의 정확도:', score(CF_knn_bias_sig_adj, 10))

최종 예측 값 수정한 협업 필터링 방식의 정확도: 0.623972759798011
