## 데이터 불러오기

In [1]:
import numpy as np
import pandas as pd

import os

import warnings
warnings.filterwarnings('ignore')

# Directory that holds the interaction log and the side-information files.
path = '/opt/ml/input/data/train/'

# Interaction log; later cells read its 'user' and 'item' columns.
# Built with os.path.join (like the reads below) so the result does not
# depend on `path` ending with a trailing slash.
train = pd.read_csv(os.path.join(path, 'train_ratings.csv'))

# Tab-separated side-information tables, one file per attribute.
year_data = pd.read_csv(os.path.join(path, 'years.tsv'), sep='\t')
writer_data = pd.read_csv(os.path.join(path, 'writers.tsv'), sep='\t')
title_data = pd.read_csv(os.path.join(path, 'titles.tsv'), sep='\t')
genre_data = pd.read_csv(os.path.join(path, 'genres.tsv'), sep='\t')
director_data = pd.read_csv(os.path.join(path, 'directors.tsv'), sep='\t')

## 베이스라인 시작 전 해야하는 train_new 파일 및 item2attributes 생성 코드

In [2]:
# Encode raw user / item / genre ids as contiguous integers before training,
# then write train_new.csv, genre_data_new.csv and item2attributes.json.
import json

# Compute each set of distinct ids once and derive both directions from it.
user_ids = train['user'].unique()
item_ids = train['item'].unique()
genre_names = genre_data['genre'].unique()

user2idx = {user: idx for idx, user in enumerate(user_ids)}
idx2user = {idx: user for user, idx in user2idx.items()}

# Item indices start at 1: index 0 is left free for padding.
item2idx = {item: idx for idx, item in enumerate(item_ids, start=1)}
idx2item = {idx: item for item, idx in item2idx.items()}

genre2idx = {genre: idx for idx, genre in enumerate(genre_names)}
idx2genre = {idx: genre for genre, idx in genre2idx.items()}

train_new = train.copy()
genre_data_new = genre_data.copy()

train_new['user'] = train_new['user'].map(user2idx)
train_new['item'] = train_new['item'].map(item2idx)

genre_data_new['item'] = genre_data_new['item'].map(item2idx)
genre_data_new['genre'] = genre_data_new['genre'].map(genre2idx)
# An item that never occurs in `train` has no index and maps to NaN, which
# would turn the whole column into floats (and the JSON keys into "12.0").
# Drop such rows and keep the column integral.
genre_data_new = genre_data_new.dropna(subset=['item']).astype({'item': int})

train_new.to_csv(os.path.join(path, 'train_new.csv'), index=False)
genre_data_new.to_csv(os.path.join(path, 'genre_data_new.csv'), index=False)

# Encoded item index -> list of encoded genre indices.
genre_dic = genre_data_new.groupby('item')['genre'].apply(list).to_dict()

with open(os.path.join(path, 'item2attributes.json'), 'w') as f:
    json.dump(genre_dic, f)

## 모델 학습 후 submission 값 디코딩하는 코드

In [6]:
# Decode the trained model's submission: translate the encoded user / item
# indices back to the original ids and save the result under a new name.
out_path = './output'
output = pd.read_csv(os.path.join(out_path, 'submission.csv')).assign(
    user=lambda df: df['user'].map(idx2user),
    item=lambda df: df['item'].map(idx2item),
)
output.to_csv(os.path.join(out_path, '0103_submission_2.csv'), index=False)

## 앙상블 하는 코드

In [6]:
# Load two saved baseline submissions.
# (Other pairs used with this cell: 0102_submission_1 / 0102_submission_2.)
baseline1, baseline2 = (
    pd.read_csv(os.path.join(out_path, file_name))
    for file_name in ('1231_submission_5.csv', '1231_submission_4.csv')
)

# Load the saved submission of the EASE model.
ease = pd.read_csv(os.path.join(out_path, 'ease_500_20.csv'))

In [9]:
# Merge two baseline submissions into one per-user ranking.
out_path = './output'

baseline1 = pd.read_csv(os.path.join(out_path, '0102_submission_1.csv'))  # alt: 1231_submission_5
baseline2 = pd.read_csv(os.path.join(out_path, '0102_submission_2.csv'))  # alt: 1231_submission_4

# 'tem' is a constant helper column; its running sum per user gives each row
# a rank value 'seq' of 2, 4, 6, ... in file order.
baseline1['tem'] = 2
baseline2['tem'] = 2
# GroupBy.cumsum() returns a result aligned with the original row index.
# The previous `.apply(lambda x: x.cumsum())` form gets the group key added
# to its index on recent pandas, which breaks the column assignment.
baseline1['seq'] = baseline1.groupby('user')['tem'].cumsum()
baseline2['seq'] = baseline2.groupby('user')['tem'].cumsum()
# Shift baseline1 to 1, 3, 5, ... so that at equal position it ranks one step
# ahead of baseline2.
baseline1['seq'] = baseline1['seq'] - 1

baseline = pd.concat([baseline1, baseline2])
# Per (user, item): 'size' = number of rows for that pair across both files,
# 'sum' = the sum of their seq values.
baseline = baseline.groupby(['user', 'item'])['seq'].agg(['size', 'sum']).reset_index()
# Within each user: larger 'size' first, then smaller 'sum'.
baseline = baseline.sort_values(['user', 'size', 'sum'], ascending=[True, False, True]).reset_index(drop=True)

In [10]:
# Snapshot the current merged baseline under its own name. The trailing note
# in the original cell suggests it was also run with the target name
# baseline_concat1 — TODO confirm how baseline_concat1/2 were produced.
baseline_concat3 = baseline.copy(deep=True)

In [11]:
# Stack the saved baseline snapshots and add up their votes per (user, item).
# NOTE(review): baseline_concat1 and baseline_concat2 are not defined in the
# visible cells; presumably they come from earlier runs of the snapshot cell.
# (A fourth snapshot, baseline_concat4, was optionally included here.)
baseline_concat = pd.concat([baseline_concat1, baseline_concat2, baseline_concat3])
# Select the columns with a list: the bare `['size', 'sum']` tuple form is
# rejected by recent pandas. `.agg(['sum'])` is kept so the result still has
# the two-level column header shown in the cell output.
baseline = baseline_concat.groupby(['user', 'item'])[['size', 'sum']].agg(['sum']).reset_index()
baseline

Unnamed: 0_level_0,user,item,size,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,sum
0,11,293,1,19
1,11,3949,3,50
2,11,3996,1,20
3,11,4886,6,77
4,11,8950,2,7
...,...,...,...,...
719456,138493,68358,1,12
719457,138493,69140,1,12
719458,138493,69746,2,40
719459,138493,69844,2,26


In [12]:
# Replace the two-level column header with plain names, then order each
# user's rows by descending 'size' and, within that, ascending 'sum'.
baseline.columns = ['user', 'item', 'size', 'sum']
sort_order = {'user': True, 'size': False, 'sum': True}
baseline = baseline.sort_values(
    by=list(sort_order),
    ascending=list(sort_order.values()),
).reset_index(drop=True)
baseline

Unnamed: 0,user,item,size,sum
0,11,72998,6,29
1,11,68237,6,33
2,11,48780,6,45
3,11,70286,6,55
4,11,4886,6,77
...,...,...,...,...
719456,138493,68358,1,12
719457,138493,69140,1,12
719458,138493,57640,1,15
719459,138493,32,1,17


In [13]:
# tem : constant helper column used only to build 'seq'.
# seq : ranking value; the lower it is, the more promising the item.
baseline['tem'] = 2
ease['tem'] = 2
# GroupBy.cumsum() keeps the original row index, so the assignment aligns on
# every pandas version (the `.apply(lambda x: x.cumsum())` form does not).
baseline['seq'] = baseline.groupby('user')['tem'].cumsum()
# Subtract 1 so EASE gets 1, 3, 5, ... and ranks one step ahead of the
# baseline (2, 4, 6, ...) at the same position.
ease['seq'] = ease.groupby('user')['tem'].cumsum() - 1
# To let fewer baseline items through (e.g. only 4 of the final 10), add an
# offset such as +2 or +4 to baseline['seq'] here.

In [14]:
# Keep the first 20 rows of every user. `baseline` is already ordered by user
# at this point, so GroupBy.head() gives the same rows in the same order as
# slicing each group inside apply(), without calling Python once per user
# and without depending on how apply() treats the grouping column.
baseline = baseline.groupby('user').head(20).reset_index(drop=True)
baseline

Unnamed: 0,user,item,size,sum,tem,seq
0,11,72998,6,29,2,2
1,11,68237,6,33,2,4
2,11,48780,6,45,2,6
3,11,70286,6,55,2,8
4,11,4886,6,77,2,10
...,...,...,...,...,...,...
606415,138493,3213,1,7,2,32
606416,138493,68358,1,12,2,34
606417,138493,69140,1,12,2,36
606418,138493,57640,1,15,2,38


In [15]:
# Combine the baseline and EASE candidates.
final = pd.concat([baseline, ease])
# Items recommended by both models get top priority (seq = 0).
# Written with .loc on the frame itself: the chained form
# `final['seq'][mask] = 0` may assign into a temporary copy and silently do
# nothing, and this notebook suppresses the warning that would reveal it.
in_both = final.duplicated(['user', 'item'], keep=False)
final.loc[in_both.to_numpy(), 'seq'] = 0
# Then remove the duplicates (the first occurrence of each pair is kept).
final = final.drop_duplicates(['user', 'item'])
# Per user, a lower seq means a higher-ranked recommendation.
final = final.sort_values(['user', 'seq']).reset_index(drop=True)
# Keep only the top 10 per user; the frame is already ordered by user, so
# head() matches slicing each group inside apply().
final = final.groupby('user').head(10).reset_index(drop=True)
final[['user', 'item']].to_csv(os.path.join(out_path, '0104ensemble.csv'), index=False)

In [24]:
# Display the two columns that were written to the ensemble submission file.
final[['user','item']]

Unnamed: 0,user,item
0,11,4886
1,11,8961
2,11,3996
3,11,4370
4,11,72998
...,...,...
313595,138493,110
313596,138493,33794
313597,138493,27155
313598,138493,8961


In [65]:
# Baseline-only submission: keep each user's first 10 rows and save them.
# GroupBy.head() replaces the per-group slice inside apply(); it preserves
# the existing row order and keeps the 'user' column on every pandas version.
# NOTE(review): assumes `baseline` is already ordered by user — confirm.
baseline = baseline.groupby('user').head(10).reset_index(drop=True)
baseline[['user', 'item']].to_csv(os.path.join(out_path, '0101base.csv'), index=False)

In [42]:
# Exploration: mark the (user, item) pairs that appear in both baseline files.
tem = pd.concat([baseline1, baseline2])
# Assign through .loc on the frame itself; the chained form
# `tem['seq'][mask] = 0` may write into a temporary copy and change nothing.
shared_pairs = tem.duplicated(['user', 'item'], keep=False)
tem.loc[shared_pairs.to_numpy(), 'seq'] = 0
tem

Unnamed: 0,user,item,tem,seq
0,11,70286,2,0
1,11,48780,2,0
2,11,72998,2,0
3,11,68237,2,0
4,11,68954,2,0
...,...,...,...,...
313595,138493,33794,2,0
313596,138493,58559,2,0
313597,138493,1562,2,16
313598,138493,153,2,18


In [48]:
# Per user, count the rows that were flagged with seq == 0.
tem.loc[tem['seq'] == 0, 'user'].value_counts()

75677    20
96172    20
38413    20
98604    20
48346    20
         ..
9795      2
96341     2
85494     2
43343     2
69787     2
Name: user, Length: 31334, dtype: int64

In [37]:
# Display the current `baseline` frame.
# NOTE(review): the cell numbers are out of order, so the output below may
# reflect an earlier state of `baseline` than the cells above — confirm.
baseline

Unnamed: 0,user,item,tem,seq
0,11,70286,2,2
1,11,48780,2,4
2,11,72998,2,6
3,11,68237,2,8
4,11,68954,2,10
...,...,...,...,...
450118,138493,57640,2,20
450119,138493,1562,2,22
450120,138493,60074,2,24
450121,138493,153,2,26
