In [18]:
import os
import argparse
import pandas as pd
import math
from tqdm import tqdm

In [19]:
parser = argparse.ArgumentParser(description="parser")
arg = parser.add_argument

arg("--file_path", "-fp", type=str, default="./ensembles/", help='앙상블 파일 열기 경로')
arg("--result_path", "-rp", type=str, default="./submit/", help='앙상블 결과 저장 경로')
arg("--files", "-f", nargs="+", default="EASE.csv,ADMMSLIM.csv,MultiDAE.csv", type=lambda s: [item for item in s.split(",")], help="앙상블 파일명(쉼표 구분)")
arg("--weight", "-w", nargs="+", default="3,2,2", type=lambda s: [float(item) for item in s.split(",")], help="앙상블 모델 가중치 설정")

args = parser.parse_args([])

In [20]:
os.makedirs(args.file_path, exist_ok=True)  # 읽어들일 파일 경로
os.makedirs(args.result_path, exist_ok=True)  # 결과 파일 저장 경로

In [21]:
args.weight

[3.0, 2.0, 2.0]

In [22]:
output_path = [args.file_path + file_name for file_name in args.files]  # 앙상블 할 파일 경로 리스트
output_list = [pd.read_csv(path) for path in output_path]  # 앙상블 할 파일 리스트
output_frame = pd.read_csv(output_path[0]).iloc[:, 0]

In [23]:
user_list = output_frame.unique().tolist()

In [24]:
weight_list = []
for weight in args.weight:
    weight = weight
    weight_list.append(weight)

In [25]:
concat_df = pd.DataFrame(columns=['user', 'item'])
# 0번파일, 1번파일을 rank로 정렬하여 weight_list 비율로 뽑는다.
for output in output_list:
    concat_df = pd.concat([concat_df, output], axis=0)

In [26]:
# duplication_list = []
concat_df['count'] = 1

In [27]:
concat_df = concat_df.groupby(['user', 'item']).count().reset_index()
concat_df = concat_df[concat_df['count'] > len(output_list) - 1]

In [28]:
concat_df

Unnamed: 0,user,item,count
0,11,2,3
6,11,4370,3
7,11,4886,3
20,14,919,3
22,14,1198,3
...,...,...,...
566203,138492,608,3
566208,138492,2502,3
566209,138492,2918,3
566211,138492,3897,3


In [29]:
concat_df = concat_df.groupby('user')['item'].apply(list).reset_index()
temp_list = []
for i in range(len(output_list)):
    temp_list.append(output_list[i].groupby('user')['item'].apply(list).reset_index())

In [33]:
concat_df

Unnamed: 0,user,item
0,11,"[2, 4370, 4886]"
1,14,"[919, 1198, 1907]"
2,18,"[296, 1193, 4235]"
3,25,"[47, 1073, 1270, 2329, 2762]"
4,31,"[68954, 79132]"
...,...,...
30097,138473,"[50, 318, 593, 5952, 7361, 7438, 48394]"
30098,138475,"[930, 2726, 3307, 8491]"
30099,138486,"[5679, 6539, 8957]"
30100,138492,"[296, 608, 2502, 2918, 3897]"


In [13]:
final_list = []
user = 11
confirm_list = concat_df[concat_df['user'] == user]['item'].values
if len(confirm_list) > 0:
    confirm_list = concat_df[concat_df['user'] == 11]['item'].values[0]
    decision_list = []
    for temp in temp_list:
        item = temp[temp['user'] == 11]['item'].values[0]
        for confirm in confirm_list:
            if confirm in item:
                item.remove(confirm)
        for weight in weight_list:
            for i in range(len(item)):
                decision_list.append([item[i], weight])

In [14]:
decision_list

[[47, 3.0],
 [40815, 3.0],
 [33004, 3.0],
 [7438, 3.0],
 [7373, 3.0],
 [8961, 3.0],
 [32587, 3.0],
 [47, 2.0],
 [40815, 2.0],
 [33004, 2.0],
 [7438, 2.0],
 [7373, 2.0],
 [8961, 2.0],
 [32587, 2.0],
 [47, 2.0],
 [40815, 2.0],
 [33004, 2.0],
 [7438, 2.0],
 [7373, 2.0],
 [8961, 2.0],
 [32587, 2.0],
 [40815, 3.0],
 [7373, 3.0],
 [8961, 3.0],
 [32587, 3.0],
 [7438, 3.0],
 [3703, 3.0],
 [47, 3.0],
 [40815, 2.0],
 [7373, 2.0],
 [8961, 2.0],
 [32587, 2.0],
 [7438, 2.0],
 [3703, 2.0],
 [47, 2.0],
 [40815, 2.0],
 [7373, 2.0],
 [8961, 2.0],
 [32587, 2.0],
 [7438, 2.0],
 [3703, 2.0],
 [47, 2.0],
 [3986, 3.0],
 [55232, 3.0],
 [1831, 3.0],
 [61350, 3.0],
 [37386, 3.0],
 [8861, 3.0],
 [2054, 3.0],
 [3986, 2.0],
 [55232, 2.0],
 [1831, 2.0],
 [61350, 2.0],
 [37386, 2.0],
 [8861, 2.0],
 [2054, 2.0],
 [3986, 2.0],
 [55232, 2.0],
 [1831, 2.0],
 [61350, 2.0],
 [37386, 2.0],
 [8861, 2.0],
 [2054, 2.0]]

In [16]:
temp_dic = {}
for k, v in decision_list:
    if k in temp_dic:
        temp_dic[k] = temp_dic[k] + v
    else:
        temp_dic[k] = v

In [17]:
temp_dic

{47: 14.0,
 40815: 14.0,
 33004: 7.0,
 7438: 14.0,
 7373: 14.0,
 8961: 14.0,
 32587: 14.0,
 3703: 7.0,
 3986: 7.0,
 55232: 7.0,
 1831: 7.0,
 61350: 7.0,
 37386: 7.0,
 8861: 7.0,
 2054: 7.0}

In [161]:
temp_list = [[k, v] for k, v in temp_dic.items()]
temp_list.sort(key=lambda x:-x[1])

In [162]:
temp_list = [k for k, v in temp_list]

In [163]:
temp_list

[47,
 40815,
 7438,
 7373,
 8961,
 32587,
 33004,
 3703,
 3986,
 55232,
 1831,
 61350,
 37386,
 8861,
 2054]

In [164]:
user, (confirm_list + temp_list)[:10]

(11, [2, 4370, 4886, 47, 40815, 7438, 7373, 8961, 32587, 33004])

In [31]:
final_list = []
for user in tqdm(user_list):
    confirm_list = concat_df[concat_df['user'] == user]['item'].values
    if len(confirm_list) > 0:
        confirm_list = concat_df[concat_df['user'] == user]['item'].values[0]
        decision_list = []
        for temp in temp_list:
            item = temp[temp['user'] == user]['item'].values[0]
            for confirm in confirm_list:
                if confirm in item:
                    item.remove(confirm)
            for weight in weight_list:
                for i in range(len(item)):
                    decision_list.append([item[i], weight])
        temp_dic = {}
        for k, v in decision_list:
            if k in temp_dic:
                temp_dic[k] = temp_dic[k] + v
            else:
                temp_dic[k] = v
        item_list = [[k, v] for k, v in temp_dic.items()]
        item_list.sort(key=lambda x:-x[1])
        item_list = [k for k, v in item_list]
        final_list.append([user, (confirm_list + item_list)[:10]])
    else:
        decision_list = []
        for temp in temp_list:
            item = temp[temp['user'] == user]['item'].values[0]
            for weight in weight_list:
                for i in range(len(item)):
                    decision_list.append([item[i], weight])
        temp_dic = {}
        for k, v in decision_list:
            if k in temp_dic:
                temp_dic[k] = temp_dic[k] + v
            else:
                temp_dic[k] = v
        item_list = [[k, v] for k, v in temp_dic.items()]
        item_list.sort(key=lambda x:-x[1])
        item_list = [k for k, v in item_list]
        final_list.append([user, (confirm_list + item_list)[:10]])

  0%|          | 66/31360 [00:00<01:06, 467.71it/s]


ValueError: operands could not be broadcast together with shapes (0,) (22,) 