# Cold Start

In [1]:
import json
import numpy as np
import pandas as pd

In [2]:
# Directory holding the rating CSVs. The trailing slash matters because the
# file names are appended by plain string concatenation.
path = '/opt/ml/data/'

# Read the train and test rating tables, in that order.
train, test = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train_ratings.csv', 'test_ratings.csv')
)

## 분석

In [3]:
# Preview the first five rows of the training ratings.
train.head(n=5)

Unnamed: 0,user_id,isbn,rating
0,8,2005018,4
1,67544,2005018,7
2,123629,2005018,8
3,200273,2005018,8
4,210926,2005018,9


In [4]:
# Preview the first five rows of the test ratings. In the rows displayed by
# this notebook the rating is 0 -- presumably a placeholder to be filled in.
test.head(n=5)

Unnamed: 0,user_id,isbn,rating
0,11676,2005018,0
1,116866,2005018,0
2,152827,60973129,0
3,157969,374157065,0
4,67958,399135782,0


## Cold Start를 위한 명단 생성

### User

In [5]:
# Cold-start users: user ids that occur in test but never in train.
# NOTE: despite the "_list" suffix this is a set.
test_user_ids = set(test['user_id'].unique())
train_user_ids = set(train['user_id'].unique())
cold_start_user_list = test_user_ids.difference(train_user_ids)

In [6]:
# Number of distinct test users that have no rating in train.
len(cold_start_user_list)

8266

### Book(ISBN)

In [7]:
# Cold-start books: ISBNs that occur in test but never in train.
# NOTE: despite the "_list" suffix this is a set.
test_isbns = set(test['isbn'].unique())
train_isbns = set(train['isbn'].unique())
cold_start_book_list = test_isbns.difference(train_isbns)

In [8]:
# Number of distinct test ISBNs that have no rating in train.
len(cold_start_book_list)

19793

## Cold Start 경우를 위한 유저 별 평균 및 책 별 평균

### Book

In [9]:
# Per-book fallback for cold-start users: mean training rating of every ISBN
# that appears in train, as a dict {isbn: mean rating}.
#
# One groupby pass replaces the previous loop, which applied a boolean filter
# over the whole of `train` once per distinct ISBN.
# sort=False: key order is irrelevant for a lookup table (the JSON dump sorts
# keys itself), so the keys are not ordered here.
# Rows whose ISBN is missing are left out by groupby's default dropna=True;
# the old loop produced a NaN-keyed entry with a NaN mean for them instead.
mean_isbn = train.groupby('isbn', sort=False)['rating'].mean().to_dict()

In [10]:
# Save the per-book mean ratings as pretty-printed JSON with sorted keys.
# `default` makes json fall back to an object's __dict__ for any value it
# cannot encode natively.
with open('./isbn_mean.json', 'w') as out_file:
    out_file.write(
        json.dumps(
            mean_isbn,
            indent=2,
            sort_keys=True,
            default=lambda obj: obj.__dict__,
        )
    )

In [11]:
# # 작성된 책별 평균 데이터 Load
# with open('/opt/ml/code/isbn_mean.json', "r") as st_json:
#     mean_isbn = json.load(st_json)

### User

In [12]:
# Per-user fallback for cold-start books: mean training rating of every user
# that appears in train, as a dict {int user_id: mean rating}.
#
# One groupby pass replaces the previous loop, which applied a boolean filter
# over the whole of `train` once per distinct user.
# Keys are cast to plain int, as before, so json.dumps accepts them as keys.
user_mean_series = train.groupby('user_id', sort=False)['rating'].mean()
mean_uid = {int(uid): mean for uid, mean in user_mean_series.items()}

In [13]:
# Save the per-user mean ratings as pretty-printed JSON with sorted keys.
# `default` makes json fall back to an object's __dict__ for any value it
# cannot encode natively.
with open('./uid_mean.json', 'w') as out_file:
    out_file.write(
        json.dumps(
            mean_uid,
            indent=2,
            sort_keys=True,
            default=lambda obj: obj.__dict__,
        )
    )

In [14]:
# Reload the per-user means from disk. JSON object keys are always strings, so
# after this cell `mean_uid` is keyed by str user ids rather than int.
# NOTE(review): this reads '/opt/ml/code/uid_mean.json' while the dump cell
# writes './uid_mean.json' -- these are the same file only if the notebook's
# working directory is /opt/ml/code; confirm.
with open('/opt/ml/code/uid_mean.json', "r") as uid_json_file:
    mean_uid = json.loads(uid_json_file.read())

## User가 매긴 평점에 따라 Cold Start 적용하기

In [15]:
# Mean of every rating in the training set; used as the last-resort value when
# neither a per-user nor a per-book mean is available.
total_mean = train.loc[:, 'rating'].mean()
total_mean

7.069714304340032

In [None]:
# Fill test ratings for cold-start BOOKS (ISBNs never seen in train): use the
# rating user's mean training rating, or the global mean when that user has no
# entry in `mean_uid`.
#
# Vectorised replacement for the previous nested iterrows() loop, which
#   * rescanned the whole of `test` for every affected row,
#   * used a bare `except:` that hid every error, not just a missing user, and
#   * read user ids from iterrows() rows: when all columns are numeric, those
#     rows are upcast to float once `rating` holds floats, so str(user_id)
#     became e.g. '123.0' and the lookup silently fell back to total_mean.

# Normalise keys to str so the lookup works whether `mean_uid` still has the
# int keys it was built with or the str keys it gets after a JSON round-trip.
uid_lookup = {str(uid): mean for uid, mean in mean_uid.items()}

# Means are floats; cast up front instead of relying on an implicit upcast
# when floats are written into an integer column.
test['rating'] = test['rating'].astype(float)

cold_book_rows = test['isbn'].isin(cold_start_book_list)
user_means = test.loc[cold_book_rows, 'user_id'].astype(str).map(uid_lookup)
# Users absent from the lookup map to NaN -> fall back to the global mean.
# to_numpy() assigns positionally, so the result does not depend on the index.
test.loc[cold_book_rows, 'rating'] = user_means.fillna(total_mean).to_numpy()
test['rating'].nunique()

In [None]:
# Fill test ratings for cold-start USERS (user ids never seen in train): use
# the book's mean training rating, or the global mean when the book has no
# entry in `mean_isbn`.
#
# Vectorised replacement for the previous nested iterrows() loop, which
# rescanned the whole of `test` for every affected row and looked books up via
# str(row['isbn']): that misses when `mean_isbn` has non-str keys, and when
# iterrows() has upcast a numeric ISBN to float (e.g. '2005018.0').

# Normalise keys to str so the lookup works for both native and JSON-loaded
# versions of `mean_isbn`.
isbn_lookup = {str(isbn): mean for isbn, mean in mean_isbn.items()}

# Means are floats; cast up front instead of relying on an implicit upcast
# when floats are written into an integer column.
test['rating'] = test['rating'].astype(float)

cold_user_rows = test['user_id'].isin(cold_start_user_list)
book_means = test.loc[cold_user_rows, 'isbn'].astype(str).map(isbn_lookup)
# Books absent from the lookup map to NaN -> fall back to the global mean.
# to_numpy() assigns positionally, so the result does not depend on the index.
test.loc[cold_user_rows, 'rating'] = book_means.fillna(total_mean).to_numpy()
test['rating'].nunique()

## Cold Start 인원들의 rating 결과 CSV 파일 생성

In [None]:
# Turn ratings that are still 0 into NaN (presumably rows no cold-start rule
# filled, so 0 is only the placeholder -- confirm).
# Assign the result back instead of calling replace(..., inplace=True) on
# test['rating']: that is chained assignment on an intermediate Series, and
# under pandas Copy-on-Write it leaves `test` unchanged.
test['rating'] = test['rating'].replace(0, np.nan)

In [None]:
# Write the rule-based ratings to the working directory. `index` is left at
# its default, so the DataFrame index is written as the first column.
test.to_csv(path_or_buf='./test_rating_rule_based.csv')