In [21]:
import os
import argparse
import pandas as pd
import math
from tqdm import tqdm

In [22]:
parser = argparse.ArgumentParser(description="parser")
arg = parser.add_argument

arg("--file_path", "-fp", type=str, default="./ensembles/", help='앙상블 파일 열기 경로')
arg("--result_path", "-rp", type=str, default="./submit/", help='앙상블 결과 저장 경로')
arg("--files", "-f", nargs="+", default="submit_EASE_data_weight.csv,submit_MultiDAE_data_weight.csv", type=lambda s: [item for item in s.split(",")], help="앙상블 파일명(쉼표 구분)")
arg("--weight", "-w", nargs="+", default="0.5,0.5", type=lambda s: [float(item) for item in s.split(",")], help="앙상블 모델 가중치 설정")
arg("--strategy", "-s", type=str, default="soft", choices=["hard", "soft"], help='앙상블 전략 선택')

args = parser.parse_args([])

In [23]:
os.makedirs(args.file_path, exist_ok=True)  # 읽어들일 파일 경로
os.makedirs(args.result_path, exist_ok=True)  # 결과 파일 저장 경로

In [24]:
args.weight

[0.5, 0.5]

In [25]:
file_list = []
for filename in os.listdir(args.file_path):
    file_path = os.path.join(args.file_path, filename)
    if os.path.isfile(file_path):
        file_list.append(filename[:-4])

In [26]:
output_path = [args.file_path + file_name + ".csv" for file_name in file_list]  # 앙상블 할 파일 경로 리스트
output_list = [pd.read_csv(path) for path in output_path]  # 앙상블 할 파일 리스트
output_frame = pd.read_csv(output_path[0]).iloc[:, 0]

In [27]:
# item 데이터프레임과 item_score 데이터프레임을 각각 생성
df_item = pd.DataFrame()
df_item_score = pd.DataFrame()

weight_list = []
for weight in args.weight:
    weight = weight
    weight_list.append(weight)

In [28]:
# 0번파일, 1번파일을 rank로 정렬하여 weight_list 비율로 뽑는다.
for i in range(len(output_list)):
    output_list[i].columns = ['user', 'item', 'score']
    output_list[i]['rank'] = output_list[i].groupby('user')['score'].rank(ascending=False).astype(int)
    # output_list[i] = output_list[i].sort_values(by=['user', 'rank']).groupby('user').head(weight_list[i])
    output_list[i] = output_list[i].sort_values(by=['user', 'rank']).groupby('user').head(10)

In [29]:
user_list = output_list[0]['user'].unique()
output_list

[          user   item     score  rank
 4           11  37386  4.509069     1
 18          11   4370  4.127007     2
 9           11      2  3.983931     3
 17          11   4886  3.919075     4
 3           11   3986  3.860288     5
 ...        ...    ...       ...   ...
 627198  138493   5349  3.672185     6
 627197  138493   2467  3.624843     7
 627193  138493   4720  3.608522     8
 627191  138493     47  3.584569     9
 627195  138493    589  3.579309    10
 
 [313600 rows x 4 columns],
           user   item     score  rank
 8           11   4370  0.929520     1
 9           11   4886  0.847820     2
 11          11  40815  0.706278     3
 10          11  32587  0.688193     4
 13          11     47  0.670851     5
 ...        ...    ...       ...   ...
 627189  138493   5349  0.608948     6
 627188  138493  32587  0.584563     7
 627182  138493   8970  0.553660     8
 627195  138493  48394  0.551906     9
 627196  138493   4022  0.543722    10
 
 [313600 rows x 4 columns]]

In [30]:
merge_df = output_list[0]
for i in range(len(output_list)):
    merge_df = merge_df.merge(output_list[i], on=['user', 'item'])[['user', 'item']]
    output_list[i] = output_list[i][['user', 'item']]

In [31]:
df_pivoted = merge_df.groupby('user')['item'].apply(list).reset_index()
temp_list = []
for i in range(len(output_list)):
    temp_list.append(output_list[i].groupby('user')['item'].apply(list).reset_index())

In [32]:
df_pivoted
temp_list[1]

Unnamed: 0,user,item
0,11,"[4370, 4886, 40815, 32587, 47, 8961, 7373, 2, ..."
1,14,"[1198, 2011, 1223, 919, 4857, 588, 1907, 4963,..."
2,18,"[1193, 296, 4235, 2324, 5995, 8873, 50, 2571, ..."
3,25,"[1270, 7153, 2762, 1073, 47, 1259, 2997, 1, 19..."
4,31,"[6377, 34405, 79132, 8360, 91542, 68358, 68954..."
...,...,...
31355,138473,"[5952, 7438, 778, 593, 7361, 48394, 50, 356, 3..."
31356,138475,"[930, 2203, 2726, 3307, 1945, 5291, 8491, 8228..."
31357,138486,"[1097, 8368, 589, 5679, 8957, 588, 8636, 377, ..."
31358,138492,"[260, 296, 2502, 608, 3897, 8961, 2918, 5618, ..."


In [33]:
for user in tqdm(user_list):
    item_list = []
    pivot_list = df_pivoted[df_pivoted['user'] == user]['item'].values
    if len(pivot_list) > 0:
        pivot_count = len(pivot_list[0])
        item_list = pivot_list[0]
        for i in range(len(temp_list)):
            add_count = math.ceil((10 - pivot_count) * weight_list[i])
            temp = temp_list[i][temp_list[i]['user'] == user]['item'].values[0]
            for idx in range(len(item_list)):
                if item_list[idx] in temp:
                    temp.remove(item_list[idx])
            item_list += temp[:add_count]
    else:
        for i in range(len(temp_list)):
            add_count = int(weight_list[i] * 10)
            temp = temp_list[i][temp_list[i]['user'] == user]['item'].values[0]
            item_list += temp[:add_count]
        df_pivoted = pd.concat([df_pivoted, pd.DataFrame([[user, item_list]], columns=['user', 'item'])], axis=0)

100%|██████████| 31360/31360 [00:31<00:00, 1011.03it/s]


In [34]:
df_pivoted

Unnamed: 0,user,item
0,11,"[4370, 2, 4886, 37386, 3986, 2054, 8861, 40815..."
1,14,"[1198, 1907, 919, 4016, 6539, 2398, 2762, 2011..."
2,18,"[46578, 4235, 296, 1193, 1446, 912, 27815, 232..."
3,25,"[1259, 1270, 2997, 1073, 47, 2762, 2291, 608, ..."
4,31,"[79132, 68954, 7454, 5882, 70336, 5313, 6377, ..."
...,...,...
0,137202,"[8125, 8167, 2935, 25866, 1934, 41997, 64839, ..."
0,137280,"[3200, 6773, 3606, 1394, 5022, 1254, 1240, 260..."
0,137581,"[5291, 6852, 7116, 903, 2019, 5952, 1089, 4226..."
0,137675,"[8169, 1503, 2539, 1367, 6658, 924, 7566, 1918..."


In [35]:
export = df_pivoted.explode('item').reset_index(drop=True)

In [38]:
export = export.groupby('user').head(10)

In [39]:
export.to_csv(os.path.join(args.result_path, 'submit_data_weight.csv'), index=False)