In [1]:
import os
import argparse
import pandas as pd
import math
from tqdm import tqdm

In [2]:
parser = argparse.ArgumentParser(description="parser")
arg = parser.add_argument

arg("--file_path", "-fp", type=str, default="./ensembles/", help='앙상블 파일 열기 경로')
arg("--result_path", "-rp", type=str, default="./submit/", help='앙상블 결과 저장 경로')
arg("--files", "-f", nargs="+", default="ease.csv,admmslim.csv,cdae.csv,deepfm.csv,lightgcn.csv,multidae.csv,multivae.csv,sasrec.csv", type=lambda s: [item for item in s.split(",")], help="앙상블 파일명(쉼표 구분)")
arg("--weight", "-w", nargs="+", default="5,3,1,1,1,1,1,1", type=lambda s: [float(item) for item in s.split(",")], help="앙상블 모델 가중치 설정")

args = parser.parse_args([])


In [3]:
os.makedirs(args.file_path, exist_ok=True)  # 읽어들일 파일 경로
os.makedirs(args.result_path, exist_ok=True)  # 결과 파일 저장 경로

In [4]:
args.weight

[5.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

In [5]:
output_path = [args.file_path + file_name for file_name in args.files]  # 앙상블 할 파일 경로 리스트
output_list = [pd.read_csv(path) for path in output_path]  # 앙상블 할 파일 리스트
output_frame = pd.read_csv(output_path[0]).iloc[:, 0]

In [6]:
user_list = output_frame.unique().tolist()

In [7]:
weight_list = []
for weight in args.weight:
    weight = weight
    weight_list.append(weight)

In [8]:
concat_df = pd.DataFrame(columns=['user', 'item'])
# 0번파일, 1번파일을 rank로 정렬하여 weight_list 비율로 뽑는다.
for output in output_list:
    concat_df = pd.concat([concat_df, output], axis=0)

In [9]:
# duplication_list = []
concat_df['count'] = 1

In [10]:
concat_df = concat_df.groupby(['user', 'item']).count().reset_index()
concat_df = concat_df[concat_df['count'] > len(output_list) - 1]

In [11]:
concat_df

Unnamed: 0,user,item,count
232,43,4973,8
253,50,778,8
380,61,2959,8
738,98,4993,8
744,98,7153,8
...,...,...,...
1201172,138473,318,8
1201176,138473,593,8
1201188,138473,7361,8
1201270,138492,296,8


In [12]:
concat_df = concat_df.groupby('user')['item'].apply(list).reset_index()
temp_list = []
for i in range(len(output_list)):
    temp_list.append(output_list[i].groupby('user')['item'].apply(list).reset_index())

In [13]:
concat_df
temp_list

[         user                                               item
 0          11  [4886, 4370, 47, 40815, 2, 33004, 7438, 7373, ...
 1          14  [1907, 4963, 1198, 2011, 919, 1223, 4857, 588,...
 2          18  [46578, 2692, 1193, 296, 2571, 50, 8873, 5995,...
 3          25  [7153, 1270, 2762, 47, 1073, 1259, 2997, 1923,...
 4          31  [6377, 8360, 34405, 79132, 2628, 68954, 60040,...
 ...       ...                                                ...
 31355  138473  [48394, 5952, 593, 7438, 318, 50, 7361, 356, 7...
 31356  138475  [2203, 2726, 930, 3307, 1223, 8491, 5291, 942,...
 31357  138486  [5679, 8957, 1265, 8368, 588, 1097, 8636, 377,...
 31358  138492  [2502, 5618, 2918, 3421, 296, 608, 260, 1265, ...
 31359  138493  [2628, 32587, 8970, 8961, 110, 5349, 551, 4022...
 
 [31360 rows x 2 columns],
          user                                               item
 0          11  [40815, 4886, 7373, 8961, 2, 32587, 7438, 4370...
 1          14  [5816, 4016, 2398, 52435, 1907,

In [14]:
final_list = []
for user in tqdm(user_list):
    confirm_list = concat_df[concat_df['user'] == user]['item'].values
    if len(confirm_list) > 0:
        confirm_list = concat_df[concat_df['user'] == user]['item'].values[0]
        decision_list = []
        for temp in temp_list:
            item = temp[temp['user'] == user]['item'].values[0]
            for confirm in confirm_list:
                if confirm in item:
                    item.remove(confirm)
            for weight in weight_list:
                for i in range(len(item)):
                    decision_list.append([item[i], weight])
        temp_dic = {}
        for k, v in decision_list:
            if k in temp_dic:
                temp_dic[k] = temp_dic[k] + v
            else:
                temp_dic[k] = v
        item_list = [[k, v] for k, v in temp_dic.items()]
        item_list.sort(key=lambda x:-x[1])
        item_list = [k for k, v in item_list]
        final_list.append([user, (confirm_list + item_list)[:10]])
    else:
        decision_list = []
        for temp in temp_list:
            item = temp[temp['user'] == user]['item'].values[0]
            for weight in weight_list:
                for i in range(len(item)):
                    decision_list.append([item[i], weight])
        temp_dic = {}
        for k, v in decision_list:
            if k in temp_dic:
                temp_dic[k] = temp_dic[k] + v
            else:
                temp_dic[k] = v
        item_list = [[k, v] for k, v in temp_dic.items()]
        item_list.sort(key=lambda x:-x[1])
        item_list = [k for k, v in item_list]
        final_list.append([user, item_list[:10]])

100%|██████████| 31360/31360 [01:49<00:00, 286.68it/s]


In [17]:
df = pd.DataFrame(final_list, columns=['user', 'item'])

In [28]:
export = df.explode('item').reset_index(drop=True)

In [29]:
export

Unnamed: 0,user,item
0,11,4886
1,11,4370
2,11,2
3,11,8961
4,11,40815
...,...,...
313595,138493,47
313596,138493,2011
313597,138493,2012
313598,138493,7147


In [30]:
export.to_csv(os.path.join(args.result_path, 'submit_data_weight.csv'), index=False)