In [7]:
#import data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os

import warnings
warnings.filterwarnings("ignore")

# Cap how many rows pandas prints. The option key is spelled out in full
# ('display.max_rows') instead of relying on pandas' partial-name matching,
# which can stop resolving if another option with a similar name is added.
pd.set_option('display.max_rows', 50)

# Load the train/test rating CSVs; paths are relative to the notebook's
# working directory.
train = pd.read_csv('../data/train_ratings.csv')
test = pd.read_csv('../data/test_ratings.csv')
print(train.shape)
print(test.shape)

def rmse(real: list, predict: list) -> float:
    """Return the root-mean-squared error between `real` and `predict`.

    Both inputs are converted to float64 arrays before subtracting, so narrow
    or unsigned integer inputs (e.g. uint8 ratings) cannot wrap around in the
    difference or in the square.

    Args:
        real: Ground-truth values (any array-like: list, ndarray, Series).
        predict: Predicted values, broadcastable against `real`.

    Returns:
        sqrt(mean((real - predict) ** 2)) as a Python float. The result is NaN
        when the input is empty or contains NaN.
    """
    actual = np.asarray(real, dtype=np.float64)
    pred = np.asarray(predict, dtype=np.float64)
    return float(np.sqrt(np.mean((actual - pred) ** 2)))

SEED = 42


def seed_everything(seed: int = 42):
    """Seed Python's `random` and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer used for both generators and written (as a string) to
            the PYTHONHASHSEED environment variable.
    """
    # NOTE(review): the running interpreter has presumably already fixed its
    # hash seed, so this likely only influences child processes — confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(SEED)

(306795, 3)
(76699, 3)


In [71]:
n = 5  # threshold: ids with fewer than n rows in train are replaced by an "other" value


def _bucket_rare(values, frequent, other):
    """Return `values` as strings, with entries not in `frequent` replaced by `other`.

    Uses `Series.where` to build a new Series instead of chained assignment
    (`df[col][mask] = ...`), which raises SettingWithCopyWarning and does not
    modify the frame under pandas copy-on-write.
    """
    return values.where(values.isin(frequent), other).astype('str')


# Users: keep ids that occur at least n times in train; every other id
# (including test ids never seen in train) becomes '-1'.
user_counts = train['user_id'].value_counts()
frequent_users = user_counts[user_counts >= n].index
train['id'] = _bucket_rare(train['user_id'], frequent_users, -1)
test['id'] = _bucket_rare(test['user_id'], frequent_users, -1)

# Books: same rule on isbn, with '1000000000' as the "other" value.
book_counts = train['isbn'].value_counts()
frequent_books = book_counts[book_counts >= n].index
train['bn'] = _bucket_rare(train['isbn'], frequent_books, 1000000000)
test['bn'] = _bucket_rare(test['isbn'], frequent_books, 1000000000)

In [72]:
# Ratings from rows whose user is in the "other" bucket (id == '-1'); the book is not filtered.
# Author's note: median and mean do not differ much, and there are relatively few 1 ratings.
other_user_ratings = train.loc[train['id'] == '-1', 'rating']
_median = other_user_ratings.median()
_mean = other_user_ratings.mean()

# Prints "median: ..., mean: ..." (Korean labels), both rounded to 5 decimals.
print(f'중앙값은 : {np.round(_median,5)}, 평균은 : {np.round(_mean,5)}')

중앙값은 : 7.0, 평균은 : 6.86547


In [73]:
# Ratings from rows whose book is in the "other" bucket (bn == '1000000000'); the user is not filtered.
# Author's note: the share of 1-point ratings is larger than expected.
other_book_ratings = train.loc[train['bn'] == '1000000000', 'rating']
_median = other_book_ratings.median()
_mean = other_book_ratings.mean()

# Prints "median: ..., mean: ..." (Korean labels), both rounded to 5 decimals.
print(f'중앙값은 : {np.round(_median,5)}, 평균은 : {np.round(_mean,5)}')

중앙값은 : 8.0, 평균은 : 6.94185


In [74]:
# Ratings from rows where the user is "other" AND the book is "other".
both_other = (train['id'] == '-1') & (train['bn'] == '1000000000')
both_other_ratings = train.loc[both_other, 'rating']
_median = both_other_ratings.median()
_mean = both_other_ratings.mean()

# Prints "median: ..., mean: ..." (Korean labels), both rounded to 5 decimals.
print(f'중앙값은 : {np.round(_median,5)}, 평균은 : {np.round(_mean,5)}')

중앙값은 : 7.0, 평균은 : 6.66446


In [76]:
# Ratings from rows where the user is NOT "other" but the book is "other".
known_user_other_book = (train['id'] != '-1') & (train['bn'] == '1000000000')
known_user_other_book_ratings = train.loc[known_user_other_book, 'rating']
_median = known_user_other_book_ratings.median()
_mean = known_user_other_book_ratings.mean()

# Prints "median: ..., mean: ..." (Korean labels), both rounded to 5 decimals.
print(f'중앙값은 : {np.round(_median,5)}, 평균은 : {np.round(_mean,5)}')

중앙값은 : 8.0, 평균은 : 7.01491


In [77]:
# Ratings from rows where neither the user nor the book is in the "other" bucket.
either_other = (train['id'] == '-1') | (train['bn'] == '1000000000')
known_both_ratings = train.loc[~either_other, 'rating']
_median = known_both_ratings.median()
_mean = known_both_ratings.mean()

# Prints "median: ..., mean: ..." (Korean labels), both rounded to 5 decimals.
print(f'중앙값은 : {np.round(_median,5)}, 평균은 : {np.round(_mean,5)}')

중앙값은 : 8.0, 평균은 : 7.28676


In [86]:
# RMSE of a constant prediction on the rows where both user and book are "other".
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of `train` (which would raise SettingWithCopyWarning).
tem = train[(train['id'] == '-1') & (train['bn'] == '1000000000')].copy()
# NOTE(review): 5.5 is presumably the midpoint of the 1-10 rating scale — confirm.
tem['pred_rating'] = 5.5
rmse(tem['rating'], tem['pred_rating'])

2.7873994060412275

In [80]:
# Mean rating of the rows currently held in `tem` (built in the baseline-RMSE cell).
tem.loc[:, 'rating'].mean()

6.664463854729341

In [95]:
# Rating distribution for rows where both the user and the book are in the "other" buckets.
train.loc[
    (train['id'] == '-1') & (train['bn'] == '1000000000'),
    'rating',
].value_counts()

8     7918
7     6532
9     4212
10    3774
6     3447
1     2222
5     2092
4     1664
2     1615
3     1328
Name: rating, dtype: int64