In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from datetime import datetime, date
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [2]:
FILES_DIR = './files/'
Eco_df = pd.read_csv(FILES_DIR + 'Eco_df.csv')

In [3]:
label = LabelEncoder()
label.fit(Eco_df['session_id'])
Eco_df['session_id'] = label.transform(Eco_df['session_id'])

In [4]:
label = LabelEncoder()
label.fit(Eco_df['user_no'])
Eco_df['user_no'] = label.transform(Eco_df['user_no'])

In [5]:
label = LabelEncoder()
label.fit(Eco_df['item_no'])
Eco_df['item_no'] = label.transform(Eco_df['item_no'])

In [6]:
group_data = pd.DataFrame({'count': Eco_df.groupby(['session_id', 'user_no', 'item_no']).size()})
group_data = group_data.reset_index()

group_data2 = Eco_df.groupby('session_id')['rating'].mean().reset_index()

group_data3 = pd.merge(group_data, group_data2, on='session_id', how='inner')

group_data3 = group_data3

group_data = group_data3.sort_values(by='rating', ascending=False)

# group_data = group_data.drop_duplicates(subset='session_id', keep='first')

# group_data = group_data[group_data['count'] == 1]

group_data

Unnamed: 0,session_id,user_no,item_no,count,rating
11965,11796,4400,10233,1,1.0
906,897,5776,6079,1,1.0
2688,2655,5095,1361,1,1.0
2689,2656,1115,11570,1,1.0
2691,2658,4814,193,1,1.0
...,...,...,...,...,...
6042,5964,1646,9512,1,0.1
6043,5965,11168,5446,1,0.1
6044,5966,8479,6257,1,0.1
6045,5967,534,10646,1,0.1


In [7]:
# train, test split
x = group_data.copy()
y = group_data['session_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# x_train.drop_duplicates(subset=['session_id', 'item_no'], inplace=True)


def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model, neighbor_size=0): # neighbor_size 파라미터 추가
  id_pairs = zip(x_test['session_id'], x_test['item_no'])
  y_pred = np.array([model(user, item, neighbor_size) for (user, item) in id_pairs])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)

In [8]:
rating_matrix = group_data.pivot(index='session_id', columns='item_no', values='rating')
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = pd.DataFrame(cosine_similarity(matrix_dummy), index=rating_matrix.index, columns=rating_matrix.index)

In [9]:
# 각 사용자의 평점평균 구하기
rating_mean = rating_matrix.mean(axis=1)
# 각 아이템의 평점을 각 사용자의 평균에서의 차이로 변환하기
rating_bias = (rating_matrix.T - rating_mean).T


rating_binary1 = np.array((rating_matrix>0).astype(float))
rating_binary2 = rating_binary1.T
counts = pd.DataFrame(np.dot(rating_binary1, rating_binary2), 
                      index=rating_matrix.index, columns=rating_matrix.index).fillna(0)

In [10]:
def CF_knn_bias_sig_adj(session_id, item_no, neighbor_size=0):
  if item_no in rating_bias:
    sim_scores = user_similarity[session_id].copy()
    item_ratings = rating_bias[item_no].copy()
    
    no_rating = item_ratings.isnull() # 해당 아이템에 대해 rating 하지 않은 사용자를 True로 표시
    common_counts = counts[session_id]
    low_significance = common_counts < SIG_LEVEL # 유저간 공통 평가 아이템수가가 SIG_LEVEL보다 낮은 사용자를 True 표시 
    none_rating_idx = item_ratings[no_rating | low_significance].index # no_rating이 True이거나 low_significance가 True인 사용자 표시
    
    item_ratings = item_ratings.dropna()
    sim_scores = sim_scores.drop(none_rating_idx)
    
    if neighbor_size == 0:
      prediction = np.dot(sim_scores, item_ratings) / sim_scores.sum()
      prediction = prediction + rating_mean[session_id]

    else:
      if len(sim_scores) > MIN_RATINGS: # 해당 아이템 rating 유저수가 MIN_RATINGS보다 큰 경우 예측값 계산
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        item_ratings = np.array(item_ratings)
        user_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[user_idx][-neighbor_size:]
        item_ratings = item_ratings[user_idx][-neighbor_size:]
        prediction = np.dot(sim_scores, item_ratings) / sim_scores.sum()
        prediction = prediction + rating_mean[session_id]
      else:
        prediction = rating_mean[session_id]

  else:
    prediction = rating_mean[session_id]
  
  # 이 부분 추가
  if prediction < 1.0:	
    prediction = 1.0
  elif prediction > 4.0:
    prediction = 4.0
    
  return prediction
  
SIG_LEVEL = 3
MIN_RATINGS = 2
print('최종 예측 값 수정한 협업 필터링 방식의 정확도:', score(CF_knn_bias_sig_adj, 30))

최종 예측 값 수정한 협업 필터링 방식의 정확도: 0.848198347322833
