# Data import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from datetime import datetime, date
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [60]:
FILES_DIR = './files/'
Eco_df = pd.read_csv(FILES_DIR + 'Eco_df.csv')

In [61]:
session_id = [(list(set(x['item_no'].values))[0], '/'.join(x['session_id'].values)) for index, x in Eco_df.groupby('item_no')]
session_id_df = pd.DataFrame(data=session_id, columns=['item_no', 'session_id'])
session_id_df = session_id_df.set_index('item_no')

user_no = [(list(set(x['item_no'].values))[0], '/'.join(x['user_no'].values)) for index, x in Eco_df.groupby('item_no')]
user_no_df = pd.DataFrame(data=user_no, columns=['item_no', 'user_no'])
user_no_df = user_no_df.set_index('item_no')

total_df = pd.merge(session_id_df, user_no_df, on='item_no', how='inner')

In [62]:
dummy1 = total_df['session_id'].str.get_dummies(sep=',')
dummy2 = total_df['user_no'].str.get_dummies(sep=',')
# dummy2 = label.fit(total_df['user_no']), label.transform(total_df['user_no']), total_df['user_no'].str.get_dummies(sep=',') 
        #  pd.get_dummies(total_df['user_no'])   

In [63]:
train_df, test_df = train_test_split(Eco_df, test_size=0.25, random_state=42)

print(train_df.shape)
print(test_df.shape)

y_train = train_df['rating']
print(y_train.shape)

(12276, 33)
(4093, 33)
(12276,)


# session_id

In [64]:
X_train = pd.concat([pd.get_dummies(train_df['session_id'], prefix='session_id'),
           train_df['item_no'].apply(lambda x: dummy1.loc[x])], axis=1)
        #    train_df['item_no'].apply(lambda x: dummy2.loc[x])], axis=1)

In [65]:
import scipy
X_train_sparse = scipy.sparse.csr_matrix(X_train.values)

In [66]:
rating_matrix = X_train_sparse
# 아이템 기반 유사도 측정
rating_matrix_t = rating_matrix #np.transpose(rating_matrix)
matrix_dummy = rating_matrix_t.copy()#.fillna(0)
item_similarity = pd.DataFrame(cosine_similarity(matrix_dummy))

def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model):
  id_pairs = zip(test_df['session_id'], test_df['item_no'])
  y_pred = np.array([model(user, item) for (user, item) in id_pairs])
  y_true = np.array(test_df['rating'])
  return RMSE(y_true, y_pred)

def CF_IBCF(session_id, item_no):
  if item_no in item_similarity:
    sim_scores = item_similarity[item_no]
    user_rating = rating_matrix_t[session_id]
    non_rating_idx = user_rating[user_rating.isnull()].index
    user_rating = user_rating.dropna()
    sim_scores = sim_scores.drop(non_rating_idx)
    mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
  else:
    mean_rating = 0.8
  
  return mean_rating
  
print('아이템 기반 협업 필터링 방식의 정확도:', score(CF_IBCF))

아이템 기반 협업 필터링 방식의 정확도: 0.6587919557401715


# user_no

In [29]:
X_train = pd.concat([pd.get_dummies(train_df['user_no'], prefix='user_no'),
        #    train_df['item_no'].apply(lambda x: dummy1.loc[x]),
           train_df['item_no'].apply(lambda x: dummy2.loc[x])], axis=1)

In [30]:
import scipy
X_train_sparse = scipy.sparse.csr_matrix(X_train.values)

In [53]:
X_train

Unnamed: 0,session_id_00036f34a8a43d56a24aeafce93ce4ee,session_id_0009887462f827937ffcc5c644c4ce4a,session_id_000bafe660caa7884f8fba35b567fe06,session_id_000c1875b89b0801657a4bf930c614a3,session_id_000d4f3b76b2481a36a69eacd2bb468c,session_id_00155342bc3091d014dcc15199e6ae14,session_id_0016dcc5c5445d3093f992fb9bbee99b,session_id_0019118eb9a9373a7478f1034b235b9c,session_id_001b165542f9aba4f28ce338e77a43c8,session_id_0027a2b2bdb13d6f0e716e74b92640b6,...,12277,12278,12279,12280,12281,12282,12283,12284,12285,12286
10436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16185,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11859,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5664,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15795,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
item_similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12266,12267,12268,12269,12270,12271,12272,12273,12274,12275
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [31]:
rating_matrix = X_train_sparse
# 아이템 기반 유사도 측정
rating_matrix_t = rating_matrix #np.transpose(rating_matrix)
matrix_dummy = rating_matrix_t.copy()#.fillna(0)
item_similarity = pd.DataFrame(cosine_similarity(matrix_dummy))

def RMSE(y_true, y_pred):
  return np.sqrt(np.mean(np.power(np.array(y_true)-np.array(y_pred), 2)))

def score(model):
  id_pairs = zip(test_df['user_no'], test_df['item_no'])
  y_pred = np.array([model(user, item) for (user, item) in id_pairs])
  y_true = np.array(test_df['rating'])
  return RMSE(y_true, y_pred)

def CF_IBCF(user_no, item_no):
  if item_no in item_similarity:
    sim_scores = item_similarity[item_no]
    user_rating = rating_matrix_t[user_no]
    non_rating_idx = user_rating[user_rating.isnull()].index
    user_rating = user_rating.dropna()
    sim_scores = sim_scores.drop(non_rating_idx)
    mean_rating = np.dot(sim_scores, user_rating) / sim_scores.sum()
  else:
    mean_rating = 0.8
  
  return mean_rating
  
print('아이템 기반 협업 필터링 방식의 정확도:', score(CF_IBCF))

아이템 기반 협업 필터링 방식의 정확도: 0.6587919557401715
