In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from datetime import datetime, date
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Directory that holds the input CSV files (relative to the notebook).
FILES_DIR = './files/'

# Load the full interaction table used by every later cell.
total = pd.read_csv(f'{FILES_DIR}total.csv')



In [4]:
# Customer segments by gender.
male = total[total['gender'] == 'M']    # male customers
female = total[total['gender'] == 'F']  # female customers

# Customer segments by age bracket (rows whose `age` equals 20 / 30).
age_20s = total[total['age'] == 20]  # customers in their 20s
age_30s = total[total['age'] == 30]  # customers in their 30s

# Backward-compatible aliases. These names were originally bound to the age
# filters above even though they read like platform segments.
# NOTE(review): if platform segments were intended, filter on
# total['platform'] ('ANDROID' / 'IOS') instead — confirm with the author.
android = age_20s
ios = age_30s

In [5]:
# Number of rows in `total` for every (session, user, item, gender, platform) combination.
group_keys = ['session_id', 'user_no', 'item_no', 'gender', 'platform']
group_data = total.groupby(group_keys).size().reset_index(name='count')

# Mean rating of each session.
group_data2 = total.groupby('session_id')['rating'].mean().reset_index()

# Attach the session-level mean rating to each combination row.
group_data3 = group_data.merge(group_data2, on='session_id', how='inner')

# Order by rating (highest first), then keep only combinations seen exactly once.
group_data = group_data3.sort_values(by='rating', ascending=False)
group_data = group_data[group_data['count'] == 1]

group_data

Unnamed: 0,session_id,user_no,item_no,gender,platform,count,rating
0,00047f91f7624d57ae28662b751624db,hoEZFZeX2JUz9+RmAgvubQ==,iP6hnNonEYCWvtV/wsjilA==,F,ANDROID,1,4.0
12100,bba908b4a15aec7237d3f81e93af4eec,X2EtZNswNUi0NQ811Lk0ZA==,Gx7FQgogekHlIMM3ELkHtQ==,F,IOS,1,4.0
15410,ed936abf233af281ff9a6a93152fd9c6,1PZIaHU/KKbfd+DlVyj0RA==,0whP9oGO0f1EHkVqP4gxfg==,F,IOS,1,4.0
1498,179804d87499ad125855bda4a5e9309e,Q7Mc9PJSqgCI5ozZOsc3HA==,6nccKk24PQkrfPhw7cTsOQ==,M,IOS,1,4.0
12139,bc354cf7b5f838d2b1e8a51a0282acdd,/A0eX40c7U/3uiLG/akkTA==,18N28EpFbjcDiUeef3TPug==,F,ANDROID,1,4.0
...,...,...,...,...,...,...,...
6126,5fbfa5744a5b91041a6eeb4ecc15111b,laoJc6XwqswFhQ+6BB0p2A==,MWgv/9pRTxstSyMM2dF9dA==,F,IOS,1,1.0
6127,5fc33d9111e1105614bd8edbdf3f18b2,ccQ8/m6yuvjZKiWD+/KHww==,KGADz14c9pHc+BuwReOQBw==,F,IOS,1,1.0
6128,5fc39b28f229c9b49342e5befce306c9,j1y/B/jOUQd0mb2vyQIvBQ==,y8glepMabn1ukjk43CJWCg==,F,IOS,1,1.0
6129,5fc4f901e508009d863fc365fac5f061,n6dD9L7YfVfC+vxvIk1eMQ==,up5cIpRinQ618fkewQzbWg==,F,IOS,1,1.0


In [6]:
# Train / test split: 25% of the rows are held out, with a fixed seed so the
# split is reproducible.
x = group_data.copy()
# `y` only satisfies the train_test_split signature; score() reads the true
# ratings from x_test directly.
y = group_data['session_id']

x_train, x_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size=0.25,
  random_state=42,
)


def RMSE(y_true, y_pred):
  """Return the root-mean-squared error between `y_true` and `y_pred`.

  Both arguments are converted with `np.array`, so lists, Series and arrays
  of matching length are accepted.
  """
  residuals = np.array(y_true) - np.array(y_pred)
  return np.sqrt((residuals ** 2).mean())

def score(model, neighbor_size=0):  # neighbor_size parameter added
  """Evaluate `model` on the module-level `x_test` and return its RMSE.

  Args:
    model: Callable invoked as model(session_id, item_no, neighbor_size)
      that returns a predicted rating.
    neighbor_size: Passed through unchanged to every `model` call.

  Returns:
    RMSE between x_test['rating'] and the model's predictions.
  """
  y_pred = np.array([
    model(session, item, neighbor_size)
    for session, item in zip(x_test['session_id'], x_test['item_no'])
  ])
  y_true = np.array(x_test['rating'])
  return RMSE(y_true, y_pred)


In [7]:
# Session x item matrix of ratings; NaN marks (session, item) pairs with no rating.
# NOTE(review): this is built from all of group_data, which still contains the
# rows held out as x_test, so RMSE measured on x_test is not out-of-sample.
# Building it from x_train would fix that, but CF_knn would then have to cope
# with sessions that are absent from the matrix.
rating_matrix = group_data.pivot(index='session_id', columns='item_no', values='rating')

# Missing ratings are filled with 0 before computing similarities.
matrix_dummy = rating_matrix.fillna(0)

# Session-to-session cosine similarity, labelled by session_id on both axes.
user_similarity = pd.DataFrame(
  cosine_similarity(matrix_dummy),
  index=rating_matrix.index,
  columns=rating_matrix.index,
)

In [8]:
def CF_knn(session_id, item_no, neighbor_size=0, default_rating=2.5):
  """Predict the rating of `item_no` for `session_id` as a similarity-weighted mean.

  Reads the module-level `rating_matrix` (sessions x items, NaN = no rating)
  and `user_similarity` (session x session similarity).

  Args:
    session_id: Session to predict for.
    item_no: Item whose rating is predicted.
    neighbor_size: 0 uses every session that rated the item; any other value
      restricts the mean to the `neighbor_size` most similar raters.
    default_rating: Value returned whenever no weighted mean can be computed.

  Returns:
    The similarity-weighted mean of the known ratings for `item_no`, or
    `default_rating` when the item or the session is unknown, when a
    neighbourhood is requested but at most one session rated the item, or
    when the similarity weights sum to zero.
  """
  # Unknown item or unknown session: nothing to base a prediction on.
  # (Previously an unknown session raised KeyError.)
  if item_no not in rating_matrix.columns or session_id not in user_similarity.columns:
    return default_rating

  # Sessions that actually rated the item, and their similarity to the target.
  # Selecting by label keeps the two vectors aligned.
  item_ratings = rating_matrix[item_no].dropna()
  sim_scores = user_similarity.loc[item_ratings.index, session_id]

  sims = sim_scores.to_numpy()
  ratings = item_ratings.to_numpy()

  if neighbor_size != 0:
    if len(sims) <= 1:
      return default_rating
    # Keep only the k most similar raters (argsort is ascending).
    k = min(neighbor_size, len(sims))
    nearest = np.argsort(sims)[-k:]
    sims, ratings = sims[nearest], ratings[nearest]

  # A zero weight sum would make the weighted mean NaN/inf and poison the RMSE.
  weight_sum = sims.sum()
  if weight_sum == 0 or not np.isfinite(weight_sum):
    return default_rating

  return np.dot(sims, ratings) / weight_sum

# score() returns an RMSE, so a lower printed value means a better fit.
knn_rmse = score(CF_knn, neighbor_size=5)
print('KNN 방식의 정확도:', knn_rmse)

KNN 방식의 정확도: 1.202229675207966


In [12]:
# Compare the RMSE of CF_knn across several neighbourhood sizes.
neighbor_sizes = [10, 30, 60, 90, 120]
for neighbor_size in neighbor_sizes:
  rmse = score(CF_knn, neighbor_size)
  print(f"이웃의 크기 = {neighbor_size} : RMSE = {rmse}")

이웃의 크기 = 10 : RMSE = 1.2010954500687558
이웃의 크기 = 30 : RMSE = 1.2007245457327638
이웃의 크기 = 60 : RMSE = 1.2007419338606842
이웃의 크기 = 90 : RMSE = 1.2007419338606842
이웃의 크기 = 120 : RMSE = 1.2007419338606842
