In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

data_dir = '/data/ephemeral/home/data'

In [16]:
# 3월 이전, 3월, 3월 이후 데이터 분류
def split_by_date_march(file_path):
    try:
        df = pd.read_csv(file_path)
    except UnicodeDecodeError:
        print(f"Unicode decoding error in file: {file_path}")
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # 데이터가 비어 있으면 바로 반환
    if df.empty:
        print(f"No data in file: {file_path}")
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # ID 열을 datetime 형식으로 변환 (ID 열이 '2023-01-01 00:00:00' 형식의 값들을 가지고 있음)
    if 'ID' in df.columns:
        df['datetime'] = pd.to_datetime(df['ID'])
    else:
        print(f"No 'ID' column in file: {file_path}")
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # 2023년 데이터 필터링
    df_2023 = df[(df['datetime'] >= '2023-01-01') & (df['datetime'] < '2024-01-01')]
    if df_2023.empty:
        print(f"No 2023 data in file: {file_path}")
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # 3월 기준으로 데이터 분할
    pre_march = df_2023[df_2023['datetime'] < '2023-03-01']
    march = df_2023[(df_2023['datetime'] >= '2023-03-01') & (df_2023['datetime'] < '2023-04-01')]
    post_march = df_2023[df_2023['datetime'] >= '2023-04-01']

    return pre_march, march, post_march

In [8]:
def hierarchical_sampling(df, target_column='target', test_size=0.2):
    # 데이터에 'target' 열이 있는지 확인
    if target_column not in df.columns:
        print(f"'{target_column}' column not found in the dataframe. Skipping this dataset.")
        return pd.DataFrame(), pd.DataFrame()

    # 계층적 샘플링을 위해 train_test_split 사용
    train, test = train_test_split(df, test_size=test_size, stratify=df[target_column])
    return train, test


In [13]:
def process_files():
    files = os.listdir(data_dir)

    pre_march_list = []
    march_list = []
    post_march_list = []

    for file in tqdm(files):
        file_path = os.path.join(data_dir, file)
        
        # train.csv 파일에만 'target' 열이 있으므로 따로 처리
        if 'train.csv' in file:
            print(f"Processing target file: {file_path}")
            pre_march, march, post_march = split_by_date_march(file_path)
            
            if not pre_march.empty:
                pre_march_list.append(pre_march)
            if not march.empty:
                march_list.append(march)
            if not post_march.empty:
                post_march_list.append(post_march)

    # 데이터가 있는 경우에만 concat
    pre_march_df = pd.concat(pre_march_list) if pre_march_list else pd.DataFrame()
    march_df = pd.concat(march_list) if march_list else pd.DataFrame()
    post_march_df = pd.concat(post_march_list) if post_march_list else pd.DataFrame()

    if not pre_march_df.empty:
        pre_march_train, pre_march_test = hierarchical_sampling(pre_march_df)
    else:
        pre_march_train, pre_march_test = pd.DataFrame(), pd.DataFrame()

    if not march_df.empty:
        march_train, march_test = hierarchical_sampling(march_df)
    else:
        march_train, march_test = pd.DataFrame(), pd.DataFrame()

    if not post_march_df.empty:
        post_march_train, post_march_test = hierarchical_sampling(post_march_df)
    else:
        post_march_train, post_march_test = pd.DataFrame(), pd.DataFrame()

    return pre_march_train, pre_march_test, march_train, march_test, post_march_train, post_march_test


In [17]:
pre_march_train, pre_march_test, march_train, march_test, post_march_train, post_march_test = process_files()

print(f'Pre-March train size: {len(pre_march_train)}, test size: {len(pre_march_test)}')
print(f'March train size: {len(march_train)}, test size: {len(march_test)}')
print(f'Post-March train size: {len(post_march_train)}, test size: {len(post_march_test)}')

100%|██████████| 116/116 [00:00<00:00, 8346.44it/s]

Processing target file: /data/ephemeral/home/data/._train.csv
Unicode decoding error in file: /data/ephemeral/home/data/._train.csv
Processing target file: /data/ephemeral/home/data/train.csv
Pre-March train size: 1132, test size: 284
March train size: 595, test size: 149
Post-March train size: 5280, test size: 1320



