# Ensemble
- 기본적으로 두 모델(파일)을 앙상블해서 최종 결과를 출력한다.

In [None]:
import os
import pandas as pd
import csv
from datetime import datetime
import pytz

In [None]:
def korea_date_time():
    """
    Build a timestamp string for the current moment in Korea Standard Time (KST).

    Returns:
        str: The current KST date and time rendered as 'YYYY-MM-DD_HH:MM:SS'.
    """
    seoul_now = datetime.now(tz=pytz.timezone("Asia/Seoul"))
    return seoul_now.strftime("%Y-%m-%d_%H:%M:%S")

## 두 파일을 읽어와 main 파일의 상위 m개와 sub 파일의 상위 (10-m)개를 겹치지 않게 앙상블해서 최종 출력을 생성
### 주의사항: 입력되는 두 파일은 예측값이 높은 순서대로 저장되어 있어야 한다.

In [None]:
def ensemble_top_rating(main_df: pd.DataFrame, sub_df: pd.DataFrame, using_topk: int) -> None:
    """
    Combine each user's top rows from main_df with non-overlapping rows from sub_df.

    For every user, the first ``using_topk`` rows of main_df are kept, and up to
    ``10 - using_topk`` rows of sub_df that were not already selected from
    main_df are appended. The combined frame is written to
    '../output/ensemble-<KST timestamp>.csv'.

    "Top" is decided purely by row order, so both frames must already list each
    user's rows from the highest to the lowest predicted value.

    Args:
        main_df (pd.DataFrame): Primary predictions; must contain a 'user' column.
        sub_df (pd.DataFrame): Secondary predictions; must contain a 'user' column.
        using_topk (int): Number of rows per user taken from main_df (0 to 10).

    Returns:
        None. Saves the ensemble result to a CSV file and prints its path.

    Raises:
        AssertionError: If using_topk is not an integer or falls outside the range of 0 to 10 (inclusive).
    """
    if not (isinstance(using_topk, int) and 0 <= using_topk <= 10):
        # Raised explicitly (instead of via `assert`) so the check survives `python -O`.
        raise AssertionError("using_topk should be an integer between 0 and 10 (inclusive)")

    # Select the top rows per user from main_df.
    main_top = main_df.groupby('user').head(using_topk)

    # Rows are matched on every column the two frames share, which is what
    # pd.merge does by default when no `on` is given.
    shared_cols = [col for col in sub_df.columns if col in main_top.columns]
    already_taken = main_top[shared_cols].drop_duplicates()

    # Anti-join done as a LEFT merge from sub_df: a left merge preserves the
    # order of sub_df's rows, whereas an outer merge is documented to sort the
    # join keys lexicographically, which would discard the ranking that the
    # following head() relies on.
    flagged = sub_df.merge(already_taken, on=shared_cols, how='left', indicator=True)
    sub_rest = flagged[flagged['_merge'] == 'left_only'].drop(columns=['_merge'])
    sub_top = sub_rest.groupby('user').head(10 - using_topk)

    # Stack both selections; the stable sort groups rows by user while keeping
    # main_df rows ahead of sub_df rows and each frame's ranking order intact.
    output = pd.concat([main_top, sub_top])
    output = output.sort_values('user', kind='stable')

    # Create the output folder
    output_folder = '../output'
    os.makedirs(output_folder, exist_ok=True)

    # Save the ensemble result to a CSV file
    date_time = korea_date_time()
    file_name = f'{output_folder}/ensemble-{date_time}.csv'
    output.to_csv(file_name, index=False)

    print(f"{file_name} is successfully saved!")


In [None]:
# Number of rows per user to keep from the main file; the rest come from the sub file.
using_topk = 5

# Both files are expected to be sorted by predicted value, highest first.
output1 = pd.read_csv('../output/output1.csv')
output2 = pd.read_csv('../output/output2.csv')

ensemble_top_rating(main_df=output1, sub_df=output2, using_topk=using_topk)

## 상호작용한 횟수를 기준으로 유저를 나눠서 다른 모델을 적용!

In [None]:
def ensemble_interaction(train_df: pd.DataFrame, less_df: pd.DataFrame, much_df: pd.DataFrame, num_interaction: int) -> None:
    """
    Pick each user's predictions from one of two models based on interaction count.

    Users are split by the number of rows they have in train_df: users with at
    most ``num_interaction`` rows take their predictions from less_df, the
    others from much_df. The combined frame is written to
    '../output/ensemble-<KST timestamp>.csv'.

    Users that do not appear in train_df belong to neither group, so their rows
    are dropped from the output.

    Args:
        train_df (pd.DataFrame): DataFrame containing training data; must contain a 'user' column.
        less_df (pd.DataFrame): DataFrame containing results of models that behave well for data with less interaction.
        much_df (pd.DataFrame): DataFrame containing results of models that behave well for data with a lot of interaction.
        num_interaction (int): Threshold for the number of interactions to split the users (inclusive for the "less" group).

    Returns:
        None. Saves the ensemble result to a CSV file and prints its path.
    """
    # Number of training rows per user, indexed by user id.
    inter_counts = train_df.groupby('user').size()

    light_users = inter_counts.index[inter_counts <= num_interaction]
    heavy_users = inter_counts.index[inter_counts > num_interaction]

    less_part = less_df[less_df['user'].isin(light_users)]
    much_part = much_df[much_df['user'].isin(heavy_users)]

    output = pd.concat([less_part, much_part])
    # The original called output.sort_index() and discarded the result, so the
    # file was never ordered. Sort by user (as ensemble_top_rating does); the
    # stable sort keeps each user's rows in their original ranking order.
    output = output.sort_values('user', kind='stable')

    # Create the output folder
    output_folder = '../output'
    os.makedirs(output_folder, exist_ok=True)

    date_time = korea_date_time()
    file_name = f'{output_folder}/ensemble-{date_time}.csv'
    output.to_csv(file_name, index=False)

    print(f"{file_name} is successfully saved!")


In [None]:
# Load the full training data and the two prediction files to ensemble.
train_df = pd.read_csv('../data/train/train_ratings.csv')
# Predictions from a model that is strong on cold-start users.
output1 = pd.read_csv('../output/output1.csv')
# Predictions from a model that works well on sequential or information-rich users.
output2 = pd.read_csv('../output/output2.csv')

# Candidate thresholds on the per-user interaction count.
split_num = [114, 265, 499, 749]

ensemble_interaction(
    train_df=train_df,
    less_df=output1,
    much_df=output2,
    num_interaction=split_num[2],
)

# 기타 함수들

In [None]:
# Check an ensemble file for duplicated rows.
def check_duplicates(df, lst):
    """
    Print the rows of df that repeat an earlier row on the columns in lst.

    Args:
        df: DataFrame to inspect.
        lst: Column labels passed to ``DataFrame.duplicated`` as ``subset``.

    Returns:
        None. Prints either a "no duplicates" message or the duplicated rows.
    """
    repeated_rows = df.loc[df.duplicated(subset=lst)]

    if not repeated_rows.empty:
        print('중복된 데이터:')
        print(repeated_rows)
        return

    print('중복된 데이터가 없습니다.')

# The path is an empty placeholder: fill in the ensemble CSV to inspect before running.
df = pd.read_csv('')
check_duplicates(df, lst=['user', 'item'])