In [7]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

# Data loading

In [8]:
# Root directory of the input data. Defaults to the original location, but can be
# overridden with the RECSYS_DATA_PATH environment variable so the notebook is not
# tied to one machine. os.path.join(..., '') guarantees a trailing separator,
# which the cells that build file names with `data_path + name` depend on.
data_path = os.path.join(os.environ.get('RECSYS_DATA_PATH', '/opt/ml/input/data/train/'), '')

In [9]:
# Load the train and validation context tables; both files share one naming scheme.
context_train, context_valid = (
    pd.read_csv(data_path + 'context_{}_data.csv'.format(split))
    for split in ('train', 'valid')
)

In [10]:
# Preview the training table as loaded from context_train_data.csv.
context_train

Unnamed: 0,user,item,rating,director,writer,genre,year
0,10638,561,1,[1226],"[807, 1605, 1642]","[12, 14]",1965.0
1,23745,4646,1,[944],"[300, 1002, 1926]","[1, 6]",2005.0
2,24397,2661,1,[162],"[148, 394, 1123]","[13, 14, 15, 16]",2001.0
3,22251,1041,1,[121],[38],[8],1976.0
4,29046,3567,1,[204],"[184, 580, 2485, 2571]","[1, 2, 8, 9]",2003.0
...,...,...,...,...,...,...,...
10824201,31359,2099,0,[],[],[5],1985.0
10824202,31359,180,0,[142],[],[8],1994.0
10824203,31359,4144,0,[153],"[1486, 2149]","[5, 14]",1936.0
10824204,31359,4858,0,[],"[776, 1379]","[8, 17]",1995.0


In [11]:
# Preview the validation table as loaded from context_valid_data.csv.
context_valid

Unnamed: 0,user,item,rating,director,writer,genre,year
0,13755,4141,1,[1181],[2539],"[5, 8, 14]",1934.0
1,14279,5078,1,[457],"[161, 1229, 2073, 2553]","[1, 2, 16]",2006.0
2,13689,2955,1,[1178],"[1555, 2418, 2430]","[11, 13, 16]",2002.0
3,20616,5425,1,[1318],[2903],"[5, 6, 8, 16]",2008.0
4,22020,2672,1,[28],[1156],"[8, 14]",2001.0
...,...,...,...,...,...,...,...
1546337,11092,133,1,[1024],"[2081, 2514]","[1, 2, 15]",1995.0
1546338,8097,353,1,[17],"[18, 2666]","[1, 15]",1991.0
1546339,9981,4712,1,[50],"[51, 2437]","[1, 5, 6, 16]",1988.0
1546340,6279,1617,1,[1241],"[1660, 1915]","[1, 5]",1978.0


In [12]:
# Remove the pre-encoded director / writer / genre list columns from both splits.
# (genre is re-attached from genres.tsv in a later cell.)
encoded_list_columns = ['director', 'writer', 'genre']
context_train = context_train.drop(columns=encoded_list_columns)
context_valid = context_valid.drop(columns=encoded_list_columns)

In [13]:
# Build one list of genre names per item and attach it to both splits.
# NOTE(review): the LabelEncoder is fitted on genres.tsv alone, so the resulting
# item labels only line up with context_train / context_valid if genres.tsv
# contains exactly the item ids those tables were encoded from -- confirm.
genres_raw = pd.read_csv(os.path.join(data_path, 'genres.tsv'), sep='\t')
genres_raw['item'] = LabelEncoder().fit_transform(genres_raw['item'])
genre_data = genres_raw.groupby('item')['genre'].apply(list)

context_train = context_train.merge(genre_data, how='left', on=['item'])
context_valid = context_valid.merge(genre_data, how='left', on=['item'])
context_train

Unnamed: 0,user,item,rating,year,genre
0,10638,561,1,1965.0,"[Musical, Romance]"
1,23745,4646,1,2005.0,"[Action, Crime]"
2,24397,2661,1,2001.0,"[Mystery, Romance, Sci-Fi, Thriller]"
3,22251,1041,1,1976.0,[Drama]
4,29046,3567,1,2003.0,"[Action, Adventure, Drama, Fantasy]"
...,...,...,...,...,...
10824201,31359,2099,0,1985.0,[Comedy]
10824202,31359,180,0,1994.0,[Drama]
10824203,31359,4144,0,1936.0,"[Comedy, Romance]"
10824204,31359,4858,0,1995.0,"[Drama, War]"


In [14]:
# Count missing values per column after the left merge; a non-zero 'genre' count
# would mean some items found no match in genre_data.
context_train.isnull().sum()

user      0
item      0
rating    0
year      0
genre     0
dtype: int64

In [15]:
# Same missing-value check for the validation split.
context_valid.isnull().sum()

user      0
item      0
rating    0
year      0
genre     0
dtype: int64

In [16]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

def multiLabel(df, column, classes=None):
    """Append one 0/1 indicator column per label of a list-valued column.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.
        column: Name of the column whose cells are iterables of labels
            (for example ``['Drama', 'War']``).
        classes: Optional ordered list of labels to encode. Passing the same
            list for every split guarantees identical indicator columns even
            if a label is absent from one split. When None (the default) the
            labels are learned from ``df[column]``, as before.

    Returns:
        A new DataFrame made of ``df`` followed by the indicator columns,
        named after the labels.
    """
    # Binarizer that maps each cell's labels to a row of 0/1 flags.
    mlb = MultiLabelBinarizer(classes=classes)

    # One-hot encode the label lists.
    encoded_array = mlb.fit_transform(df[column])

    # Wrap the result in a DataFrame that reuses df's index. pd.concat(axis=1)
    # aligns rows by index label, so a fresh 0..n-1 index would misalign the
    # rows (and introduce NaNs) whenever df has been filtered or re-ordered.
    df_encoded = pd.DataFrame(encoded_array, columns=mlb.classes_, index=df.index)

    # Put the original columns and the indicator columns side by side.
    df_final = pd.concat([df, df_encoded], axis=1)

    return df_final

In [17]:
# One-hot encode the genre lists of both splits (each split is encoded on its own).
genre_column = 'genre'
context_train = multiLabel(df=context_train, column=genre_column)
context_valid = multiLabel(df=context_valid, column=genre_column)

In [18]:
# List the columns to see which genre indicator columns were added.
context_train.columns

Index(['user', 'item', 'rating', 'year', 'genre', 'Action', 'Adventure',
       'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [19]:
# Same listing for the validation split. The two splits are encoded separately,
# so their indicator columns should be compared here.
context_valid.columns

Index(['user', 'item', 'rating', 'year', 'genre', 'Action', 'Adventure',
       'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [20]:
# Features are every column except the target ('rating') and the raw genre list,
# which is already represented by the indicator columns.
non_feature_columns = ['rating', 'genre']

X_train = context_train.drop(columns=non_feature_columns)
y_train = context_train['rating']

X_test = context_valid.drop(columns=non_feature_columns)
y_test = context_valid['rating']

## CatBoost training

In [21]:
from catboost import CatBoostRegressor

# The id columns and every genre indicator are declared categorical;
# 'year' is the only feature left out of this list.
id_features = ['user', 'item']
genre_features = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
categorical_features = id_features + genre_features

# Create the regressor and fit it on the training split, evaluating on the
# validation split after every iteration.
model = CatBoostRegressor(task_type="GPU", loss_function='RMSE')
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features,
    eval_set=(X_test, y_test),
)

Learning rate set to 0.136401
0:	learn: 0.4460614	test: 0.6302507	best: 0.6302507 (0)	total: 108ms	remaining: 1m 48s
1:	learn: 0.4260214	test: 0.6008435	best: 0.6008435 (1)	total: 211ms	remaining: 1m 45s
2:	learn: 0.4104002	test: 0.5772446	best: 0.5772446 (2)	total: 313ms	remaining: 1m 44s
3:	learn: 0.3981137	test: 0.5579570	best: 0.5579570 (3)	total: 423ms	remaining: 1m 45s
4:	learn: 0.3887034	test: 0.5425757	best: 0.5425757 (4)	total: 533ms	remaining: 1m 46s
5:	learn: 0.3813959	test: 0.5284969	best: 0.5284969 (5)	total: 645ms	remaining: 1m 46s
6:	learn: 0.3758096	test: 0.5185831	best: 0.5185831 (6)	total: 754ms	remaining: 1m 46s
7:	learn: 0.3715268	test: 0.5090596	best: 0.5090596 (7)	total: 864ms	remaining: 1m 47s
8:	learn: 0.3682570	test: 0.5016053	best: 0.5016053 (8)	total: 976ms	remaining: 1m 47s
9:	learn: 0.3657694	test: 0.4964536	best: 0.4964536 (9)	total: 1.08s	remaining: 1m 47s
10:	learn: 0.3638666	test: 0.4914108	best: 0.4914108 (10)	total: 1.2s	remaining: 1m 47s
11:	learn: 0

<catboost.core.CatBoostRegressor at 0x7f93341a3400>

In [22]:
from sklearn.metrics import mean_squared_error

# Predict on the validation features, then report the mean squared error
# against the validation ratings.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.170566491779853


## 후보군 item 불러오기

In [23]:
# Settings that identify which candidate file to load.
# NOTE(review): presumably the EASE regularisation strength and the number of
# candidates kept per user -- confirm against the script that wrote the file.
lambda_ = 500
top1 = 20
ease_data = pd.read_csv(f'{data_path}my_ease_{lambda_}_{top1}.csv')

In [24]:
# Attach the year column from years.tsv to every candidate row. With a left
# merge, candidates whose item is missing from years.tsv are kept with year = NaN.
years_path = os.path.join(data_path, 'years.tsv')
year_data = pd.read_csv(years_path, sep='\t')
test_df = ease_data.merge(year_data, how='left', on=['item'])

In [25]:
# Preview the candidate (user, item) rows with the year column attached.
test_df

Unnamed: 0,user,item,year
0,11,4370,2001.0
1,11,4886,2001.0
2,11,40815,2005.0
3,11,47,1995.0
4,11,32587,2005.0
...,...,...,...
627195,138493,2324,1997.0
627196,138493,4963,2001.0
627197,138493,2174,1988.0
627198,138493,4720,2001.0


In [26]:
# Attach each item's list of genre names to the candidates. No LabelEncoder is
# applied here: test_df still holds the original item ids at this point (they
# are mapped to labels in later cells).
genres_path = os.path.join(data_path, 'genres.tsv')
genre_data = pd.read_csv(genres_path, sep='\t').groupby('item')['genre'].apply(list)
test_df = test_df.merge(genre_data, how='left', on=['item'])

In [27]:
# Count missing values per column. Both year and genre were attached with left
# merges, so a non-zero count means some candidate items had no match in the
# corresponding lookup table.
test_df.isnull().sum()

user      0
item      0
year     36
genre     0
dtype: int64

In [28]:
# One-hot encode the candidates' genre lists. The indicator columns are learned
# from test_df itself, independently of the training split.
test_df = multiLabel(df=test_df, column='genre')
test_df

Unnamed: 0,user,item,year,genre,Action,Adventure,Animation,Children,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,11,4370,2001.0,"[Adventure, Drama, Sci-Fi]",0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,11,4886,2001.0,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,1,1,1,0,...,1,0,0,0,0,0,0,0,0,0
2,11,40815,2005.0,"[Adventure, Fantasy, Thriller]",0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,11,47,1995.0,"[Mystery, Thriller]",0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,11,32587,2005.0,"[Action, Crime, Film-Noir, Mystery, Thriller]",1,0,0,0,0,1,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627195,138493,2324,1997.0,"[Comedy, Drama, Romance, War]",0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
627196,138493,4963,2001.0,"[Crime, Thriller]",0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
627197,138493,2174,1988.0,"[Comedy, Fantasy]",0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
627198,138493,4720,2001.0,"[Drama, Horror, Mystery, Thriller]",0,0,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0


In [29]:
# Build lookup tables from original ids to LabelEncoder labels, using every
# row of train_ratings.csv. Each table carries the id twice ('user' / 'item'
# as merge key and '*_real' as a copy to restore later) plus the encoded label.
ratings_df = pd.read_csv(os.path.join(data_path, 'train_ratings.csv'))

usertolabel = ratings_df[['user']].assign(
    user_real=ratings_df['user'],
    user_label=LabelEncoder().fit_transform(ratings_df['user']),
)
itemtolabel = ratings_df[['item']].assign(
    item_real=ratings_df['item'],
    item_label=LabelEncoder().fit_transform(ratings_df['item']),
)

In [30]:
# Collapse both lookup tables to their distinct rows (one row per id).
usertolabel, itemtolabel = (
    mapping.drop_duplicates() for mapping in (usertolabel, itemtolabel)
)

In [31]:
# Preview the de-duplicated user id -> label lookup table.
usertolabel

Unnamed: 0,user,user_real,user_label
0,11,11,0
376,14,14,1
556,18,18,2
633,25,25,3
724,31,31,4
...,...,...,...
5153765,138473,138473,31355
5153828,138475,138475,31356
5153952,138486,138486,31357
5154089,138492,138492,31358


In [32]:
# Preview the de-duplicated item id -> label lookup table.
itemtolabel

Unnamed: 0,item,item_real,item_label
0,4643,4643,2505
1,170,170,109
2,531,531,319
3,616,616,368
4,2140,2140,1183
...,...,...,...
511456,7753,7753,3753
589875,93422,93422,6446
595119,6519,6519,3247
619589,8830,8830,4049


In [33]:
# Attach user_real / user_label to every candidate row via the original user id.
test_df = test_df.merge(usertolabel, how='left', on=['user'])
test_df

Unnamed: 0,user,item,year,genre,Action,Adventure,Animation,Children,Comedy,Crime,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,user_real,user_label
0,11,4370,2001.0,"[Adventure, Drama, Sci-Fi]",0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,11,0
1,11,4886,2001.0,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,11,0
2,11,40815,2005.0,"[Adventure, Fantasy, Thriller]",0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,11,0
3,11,47,1995.0,"[Mystery, Thriller]",0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,11,0
4,11,32587,2005.0,"[Action, Crime, Film-Noir, Mystery, Thriller]",1,0,0,0,0,1,...,0,0,1,0,0,1,0,0,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627195,138493,2324,1997.0,"[Comedy, Drama, Romance, War]",0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,138493,31359
627196,138493,4963,2001.0,"[Crime, Thriller]",0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,138493,31359
627197,138493,2174,1988.0,"[Comedy, Fantasy]",0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,138493,31359
627198,138493,4720,2001.0,"[Drama, Horror, Mystery, Thriller]",0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,138493,31359


In [34]:
# Attach item_real / item_label to every candidate row via the original item id.
test_df = test_df.merge(itemtolabel, how='left', on=['item'])

In [35]:
# Preview the candidates with both the user and the item lookup columns attached.
test_df

Unnamed: 0,user,item,year,genre,Action,Adventure,Animation,Children,Comedy,Crime,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,user_real,user_label,item_real,item_label
0,11,4370,2001.0,"[Adventure, Drama, Sci-Fi]",0,1,0,0,0,0,...,0,0,1,0,0,0,11,0,4370,2381
1,11,4886,2001.0,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,1,1,1,0,...,0,0,0,0,0,0,11,0,4886,2619
2,11,40815,2005.0,"[Adventure, Fantasy, Thriller]",0,1,0,0,0,0,...,0,0,0,1,0,0,11,0,40815,4790
3,11,47,1995.0,"[Mystery, Thriller]",0,0,0,0,0,0,...,1,0,0,1,0,0,11,0,47,41
4,11,32587,2005.0,"[Action, Crime, Film-Noir, Mystery, Thriller]",1,0,0,0,0,1,...,1,0,0,1,0,0,11,0,32587,4581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627195,138493,2324,1997.0,"[Comedy, Drama, Romance, War]",0,0,0,0,1,0,...,0,1,0,0,1,0,138493,31359,2324,1281
627196,138493,4963,2001.0,"[Crime, Thriller]",0,0,0,0,0,1,...,0,0,0,1,0,0,138493,31359,4963,2653
627197,138493,2174,1988.0,"[Comedy, Fantasy]",0,0,0,0,1,0,...,0,0,0,0,0,0,138493,31359,2174,1206
627198,138493,4720,2001.0,"[Drama, Horror, Mystery, Thriller]",0,0,0,0,0,0,...,1,0,0,1,0,0,138493,31359,4720,2541


In [36]:
# Overwrite 'user' / 'item' with their encoded labels; the original ids stay
# available in the '*_real' columns.
for id_column in ('user', 'item'):
    test_df[id_column] = test_df[id_column + '_label']

In [37]:
# Check that 'user' and 'item' now hold the encoded labels (compare with the
# user_label / item_label columns on the right).
test_df.head()

Unnamed: 0,user,item,year,genre,Action,Adventure,Animation,Children,Comedy,Crime,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,user_real,user_label,item_real,item_label
0,0,2381,2001.0,"[Adventure, Drama, Sci-Fi]",0,1,0,0,0,0,...,0,0,1,0,0,0,11,0,4370,2381
1,0,2619,2001.0,"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,1,1,1,0,...,0,0,0,0,0,0,11,0,4886,2619
2,0,4790,2005.0,"[Adventure, Fantasy, Thriller]",0,1,0,0,0,0,...,0,0,0,1,0,0,11,0,40815,4790
3,0,41,1995.0,"[Mystery, Thriller]",0,0,0,0,0,0,...,1,0,0,1,0,0,11,0,47,41
4,0,4581,2005.0,"[Action, Crime, Film-Noir, Mystery, Thriller]",1,0,0,0,0,1,...,1,0,0,1,0,0,11,0,32587,4581


In [38]:
# The raw genre list is no longer needed once the indicator columns exist.
test_df = test_df.drop(columns=['genre'])

In [39]:
# Select the model inputs: ids, year, then the genre indicators, in this order.
feature_columns = [
    'user', 'item', 'year',
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
test_final = test_df[feature_columns]

## 테스트 데이터 예측

In [40]:
# Predict a score for every candidate row in test_final.
# Note: this rebinds y_pred, which an earlier cell used for the validation predictions.
y_pred = model.predict(test_final)

In [41]:
# Store the predicted score next to each candidate row.
test_df = test_df.assign(score=y_pred)
test_df

Unnamed: 0,user,item,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Romance,Sci-Fi,Thriller,War,Western,user_real,user_label,item_real,item_label,score
0,0,2381,2001.0,0,1,0,0,0,0,0,...,0,1,0,0,0,11,0,4370,2381,0.892405
1,0,2619,2001.0,0,1,1,1,1,0,0,...,0,0,0,0,0,11,0,4886,2619,0.997442
2,0,4790,2005.0,0,1,0,0,0,0,0,...,0,0,1,0,0,11,0,40815,4790,0.944514
3,0,41,1995.0,0,0,0,0,0,0,0,...,0,0,1,0,0,11,0,47,41,0.939014
4,0,4581,2005.0,1,0,0,0,0,1,0,...,0,0,1,0,0,11,0,32587,4581,0.921234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627195,31359,1281,1997.0,0,0,0,0,1,0,0,...,1,0,0,1,0,138493,31359,2324,1281,0.881798
627196,31359,2653,2001.0,0,0,0,0,0,1,0,...,0,0,1,0,0,138493,31359,4963,2653,0.927229
627197,31359,1206,1988.0,0,0,0,0,1,0,0,...,0,0,0,0,0,138493,31359,2174,1206,0.915036
627198,31359,2541,2001.0,0,0,0,0,0,0,0,...,0,0,1,0,0,138493,31359,4720,2541,0.767432


In [42]:
# Put the original ids back into 'user' / 'item' (they held encoded labels
# while the model was scoring).
for id_column in ('user', 'item'):
    test_df[id_column] = test_df[id_column + '_real']

In [43]:
# Order the candidates by user (ascending) and, within a user, by score (descending).
sort_keys = ['user', 'score']
test_df = test_df.sort_values(by=sort_keys, ascending=[True, False])
test_df

Unnamed: 0,user,item,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Romance,Sci-Fi,Thriller,War,Western,user_real,user_label,item_real,item_label,score
5,11,8961,2004.0,1,1,1,1,1,0,0,...,0,0,0,0,0,11,0,8961,4101,1.068095
6,11,7373,2004.0,1,1,0,0,0,0,0,...,0,0,0,0,0,11,0,7373,3663,1.055863
18,11,2617,1999.0,1,1,0,0,1,0,0,...,0,0,1,0,0,11,0,2617,1458,1.023648
1,11,4886,2001.0,0,1,1,1,1,0,0,...,0,0,0,0,0,11,0,4886,2619,0.997442
2,11,40815,2005.0,0,1,0,0,0,0,0,...,0,0,1,0,0,11,0,40815,4790,0.944514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
627181,138493,2628,1999.0,1,1,0,0,0,0,0,...,0,1,0,0,0,138493,31359,2628,1462,0.821077
627189,138493,53125,2007.0,1,1,0,0,1,0,0,...,0,0,0,0,0,138493,31359,53125,5233,0.785378
627186,138493,8970,2004.0,0,0,0,0,0,0,0,...,0,0,0,0,0,138493,31359,8970,4107,0.778359
627198,138493,4720,2001.0,0,0,0,0,0,0,0,...,0,0,1,0,0,138493,31359,4720,2541,0.767432


In [44]:
# Keep each user's 10 highest-scoring candidates.
# Sorting by (user asc, score desc) and taking the first 10 rows of every user
# group selects the same rows in the same order as the previous
# groupby('user').apply(lambda x: x.nlargest(10, 'score')) (up to the ordering
# of exactly tied scores). It avoids one Python-level call per user, and it does
# not rely on groupby.apply handing the grouping column to the function, which
# is deprecated since pandas 2.2. Once 'user' is excluded there,
# reset_index(drop=True) would discard it from the result.
top_10_scores = (
    test_df.sort_values(['user', 'score'], ascending=[True, False])
    .groupby('user', sort=False)
    .head(10)
    .reset_index(drop=True)
)
top_10_scores

Unnamed: 0,user,item,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Romance,Sci-Fi,Thriller,War,Western,user_real,user_label,item_real,item_label,score
0,11,8961,2004.0,1,1,1,1,1,0,0,...,0,0,0,0,0,11,0,8961,4101,1.068095
1,11,7373,2004.0,1,1,0,0,0,0,0,...,0,0,0,0,0,11,0,7373,3663,1.055863
2,11,2617,1999.0,1,1,0,0,1,0,0,...,0,0,1,0,0,11,0,2617,1458,1.023648
3,11,4886,2001.0,0,1,1,1,1,0,0,...,0,0,0,0,0,11,0,4886,2619,0.997442
4,11,40815,2005.0,0,1,0,0,0,0,0,...,0,0,1,0,0,11,0,40815,4790,0.944514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313595,138493,593,1991.0,0,0,0,0,0,1,0,...,0,0,1,0,0,138493,31359,593,356,0.940310
313596,138493,48394,2006.0,0,0,0,0,0,0,0,...,0,0,1,0,0,138493,31359,48394,5038,0.928507
313597,138493,4963,2001.0,0,0,0,0,0,1,0,...,0,0,1,0,0,138493,31359,4963,2653,0.927229
313598,138493,2174,1988.0,0,0,0,0,1,0,0,...,0,0,0,0,0,138493,31359,2174,1206,0.915036


In [45]:
# Number of recommendations per user; only used to name the output file here.
top2 = 10

# The submission contains just the (user, item) pairs.
predict = top_10_scores[['user', 'item']]

# DataFrame.to_csv does not create missing directories, so make sure the
# target directory exists before writing.
output_path = 'output/2step_ease({}, {})_catboost_{}.csv'.format(lambda_, top1, top2)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
predict.to_csv(output_path, index=False)