## Import

In [None]:
# %pip install pyarrow
# %pip install xgboost

In [1]:
import pandas as pd
import numpy as np
import gc

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

## Data Load

In [2]:
# Dataset partitions; each one is a sub-folder of ../data
data_splits = ["train", "test"]

# For every data category: folder name, file-name suffix and variable-name prefix
data_categories = {
    "회원정보": {"folder": "1.회원정보", "suffix": "회원정보", "var_prefix": "customer"},
    "신용정보": {"folder": "2.신용정보", "suffix": "신용정보", "var_prefix": "credit"},
    "승인매출정보": {"folder": "3.승인매출정보", "suffix": "승인매출정보", "var_prefix": "sales"},
    "청구정보": {"folder": "4.청구입금정보", "suffix": "청구정보", "var_prefix": "billing"},
    "잔액정보": {"folder": "5.잔액정보", "suffix": "잔액정보", "var_prefix": "balance"},
    "채널정보": {"folder": "6.채널정보", "suffix": "채널정보", "var_prefix": "channel"},
    "마케팅정보": {"folder": "7.마케팅정보", "suffix": "마케팅정보", "var_prefix": "marketing"},
    "성과정보": {"folder": "8.성과정보", "suffix": "성과정보", "var_prefix": "performance"}
}

# Months July through December of 2018
months = ['07', '08', '09', '10', '11', '12']

# Flatten the split -> category -> month nesting into one ordered work list;
# the iteration order is the same as three nested loops.
load_plan = [
    (split, info, month)
    for split in data_splits
    for info in data_categories.values()
    for month in months
]

for split, info, month in load_plan:
    folder, suffix, var_prefix = info["folder"], info["suffix"], info["var_prefix"]

    # File name pattern: 2018{month}_{split}_{suffix}.parquet
    file_path = f"../data/{split}/{folder}/2018{month}_{split}_{suffix}.parquet"
    print(file_path)

    # Each frame is published as a module-level variable named
    # {var_prefix}_{split}_{month}; later cells look these names up via globals().
    variable_name = f"{var_prefix}_{split}_{month}"
    globals()[variable_name] = pd.read_parquet(file_path)
    print(f"{variable_name} is loaded from {file_path}")

gc.collect()

../data/train/1.회원정보/201807_train_회원정보.parquet
customer_train_07 is loaded from ../data/train/1.회원정보/201807_train_회원정보.parquet
../data/train/1.회원정보/201808_train_회원정보.parquet
customer_train_08 is loaded from ../data/train/1.회원정보/201808_train_회원정보.parquet
../data/train/1.회원정보/201809_train_회원정보.parquet
customer_train_09 is loaded from ../data/train/1.회원정보/201809_train_회원정보.parquet
../data/train/1.회원정보/201810_train_회원정보.parquet
customer_train_10 is loaded from ../data/train/1.회원정보/201810_train_회원정보.parquet
../data/train/1.회원정보/201811_train_회원정보.parquet
customer_train_11 is loaded from ../data/train/1.회원정보/201811_train_회원정보.parquet
../data/train/1.회원정보/201812_train_회원정보.parquet
customer_train_12 is loaded from ../data/train/1.회원정보/201812_train_회원정보.parquet
../data/train/2.신용정보/201807_train_신용정보.parquet
credit_train_07 is loaded from ../data/train/2.신용정보/201807_train_신용정보.parquet
../data/train/2.신용정보/201808_train_신용정보.parquet
credit_train_08 is loaded from ../data/train/2.신용정보/201808_train_신

0

## Data Preprocessing

### 1. Concat & Merge

In [3]:
# Variable-name prefixes, one per data category
info_categories = "customer credit sales billing balance channel marketing performance".split()

# Zero-padded month labels for July (07) through December (12)
months = [f"{month_number:02d}" for month_number in range(7, 13)]

In [9]:
#### Train ####

# Stack the monthly frames of every category into one frame per category.
train_dfs = {}

for prefix in info_categories:
    # The load cell stored each monthly frame as a dynamically named global.
    monthly_names = [f"{prefix}_train_{month}" for month in months]
    df_list = [globals()[name] for name in monthly_names]
    train_dfs[f"{prefix}_train_df"] = pd.concat(df_list, axis=0)

    # The concatenated frame holds its own copy of the rows, so the monthly
    # originals only duplicate memory from here on. Release them now.
    # NOTE: this makes the cell one-shot; re-run the load cell before re-running it.
    del df_list
    for name in monthly_names:
        del globals()[name]
    gc.collect()
    print(f"{prefix}_train_df is created with shape: {train_dfs[f'{prefix}_train_df'].shape}")


# Expose each concatenated frame under its own name for the merge step.
customer_train_df = train_dfs["customer_train_df"]
credit_train_df   = train_dfs["credit_train_df"]
sales_train_df    = train_dfs["sales_train_df"]
billing_train_df  = train_dfs["billing_train_df"]
balance_train_df  = train_dfs["balance_train_df"]
channel_train_df  = train_dfs["channel_train_df"]
marketing_train_df= train_dfs["marketing_train_df"]
performance_train_df = train_dfs["performance_train_df"]

gc.collect()

customer_train_df is created with shape: (2400000, 78)
credit_train_df is created with shape: (2400000, 42)
sales_train_df is created with shape: (2400000, 406)
billing_train_df is created with shape: (2400000, 46)
balance_train_df is created with shape: (2400000, 82)
channel_train_df is created with shape: (2400000, 105)
marketing_train_df is created with shape: (2400000, 64)
performance_train_df is created with shape: (2400000, 49)


0

In [10]:
#### Test ####

# Same procedure as for train: one concatenated frame per category.
test_dfs = {}

for prefix in info_categories:
    # The load cell stored each monthly frame as a dynamically named global.
    monthly_names = [f"{prefix}_test_{month}" for month in months]
    df_list = [globals()[name] for name in monthly_names]
    test_dfs[f"{prefix}_test_df"] = pd.concat(df_list, axis=0)

    # The concatenated frame holds its own copy of the rows, so the monthly
    # originals only duplicate memory from here on. Release them now.
    # NOTE: this makes the cell one-shot; re-run the load cell before re-running it.
    del df_list
    for name in monthly_names:
        del globals()[name]
    gc.collect()
    print(f"{prefix}_test_df is created with shape: {test_dfs[f'{prefix}_test_df'].shape}")


# Expose each concatenated frame under its own name for the merge step.
customer_test_df = test_dfs["customer_test_df"]
credit_test_df   = test_dfs["credit_test_df"]
sales_test_df    = test_dfs["sales_test_df"]
billing_test_df  = test_dfs["billing_test_df"]
balance_test_df  = test_dfs["balance_test_df"]
channel_test_df  = test_dfs["channel_test_df"]
marketing_test_df= test_dfs["marketing_test_df"]
performance_test_df = test_dfs["performance_test_df"]

gc.collect()

customer_test_df is created with shape: (600000, 77)
credit_test_df is created with shape: (600000, 42)
sales_test_df is created with shape: (600000, 406)
billing_test_df is created with shape: (600000, 46)
balance_test_df is created with shape: (600000, 82)
channel_test_df is created with shape: (600000, 105)
marketing_test_df is created with shape: (600000, 64)
performance_test_df is created with shape: (600000, 49)


0

In [11]:
#### Train ####

# Step 1: left-join the credit table onto the customer table on the two key columns.
train_df = customer_train_df.merge(credit_train_df, on=['기준년월', 'ID'], how='left')
print("Step1 저장 완료: train_step1, shape:", train_df.shape)
del customer_train_df, credit_train_df
# `train_dfs` still references the same frames; without dropping those
# entries the `del` above frees nothing.
train_dfs.pop("customer_train_df", None)
train_dfs.pop("credit_train_df", None)
gc.collect()

# Remaining frames to merge, paired with the step label used in the log line
merge_list = [
    ("sales_train_df",    "Step2"),
    ("billing_train_df",  "Step3"),
    ("balance_train_df",  "Step4"),
    ("channel_train_df",  "Step5"),
    ("marketing_train_df","Step6"),
    ("performance_train_df", "최종")
]

# Merge the remaining frames one at a time
for df_name, step in merge_list:
    # Frames are looked up by name through globals()
    train_df = train_df.merge(globals()[df_name], on=['기준년월', 'ID'], how='left')
    print(f"{step} 저장 완료: train_{step}, shape:", train_df.shape)
    # Release the merged-in frame: remove both the global name and the
    # reference held by `train_dfs`, otherwise the memory stays allocated.
    del globals()[df_name]
    train_dfs.pop(df_name, None)
    gc.collect()

Step1 저장 완료: train_step1, shape: (2400000, 118)
Step2 저장 완료: train_Step2, shape: (2400000, 522)
Step3 저장 완료: train_Step3, shape: (2400000, 566)
Step4 저장 완료: train_Step4, shape: (2400000, 646)
Step5 저장 완료: train_Step5, shape: (2400000, 749)
Step6 저장 완료: train_Step6, shape: (2400000, 811)
최종 저장 완료: train_최종, shape: (2400000, 858)


In [12]:
#### Test ####

# Step 1: left-join the credit table onto the customer table on the two key columns.
test_df = customer_test_df.merge(credit_test_df, on=['기준년월', 'ID'], how='left')
print("Step1 저장 완료: test_step1, shape:", test_df.shape)
del customer_test_df, credit_test_df
# `test_dfs` still references the same frames; without dropping those
# entries the `del` above frees nothing.
test_dfs.pop("customer_test_df", None)
test_dfs.pop("credit_test_df", None)
gc.collect()

# Remaining frames to merge, paired with the step label used in the log line
merge_list = [
    ("sales_test_df",    "Step2"),
    ("billing_test_df",  "Step3"),
    ("balance_test_df",  "Step4"),
    ("channel_test_df",  "Step5"),
    ("marketing_test_df","Step6"),
    ("performance_test_df", "최종")
]

# Merge the remaining frames one at a time
for df_name, step in merge_list:
    # Frames are looked up by name through globals()
    test_df = test_df.merge(globals()[df_name], on=['기준년월', 'ID'], how='left')
    print(f"{step} 저장 완료: test_{step}, shape:", test_df.shape)
    # Release the merged-in frame: remove both the global name and the
    # reference held by `test_dfs`, otherwise the memory stays allocated.
    del globals()[df_name]
    test_dfs.pop(df_name, None)
    gc.collect()

Step1 저장 완료: test_step1, shape: (600000, 117)
Step2 저장 완료: test_Step2, shape: (600000, 521)
Step3 저장 완료: test_Step3, shape: (600000, 565)
Step4 저장 완료: test_Step4, shape: (600000, 645)
Step5 저장 완료: test_Step5, shape: (600000, 748)
Step6 저장 완료: test_Step6, shape: (600000, 810)
최종 저장 완료: test_최종, shape: (600000, 857)


In [13]:
# Persist the merged datasets.
# index=False: the row index carries no information, and writing it would
# come back as an extra "Unnamed: 0" column when the files are read with
# pd.read_csv later in this notebook.
train_df.to_csv("../data/all_train.csv", index=False)
test_df.to_csv("../data/all_test.csv", index=False)

### 2. Encoding

In [14]:
# Every column except the row identifier and the label is a feature.
excluded_cols = ["ID", "Segment"]
feature_cols = [name for name in train_df.columns if name not in excluded_cols]

X = train_df[feature_cols].copy()
y = train_df["Segment"].copy()

# Encode the target labels as integers
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

# Object-dtype feature columns are the ones that get label-encoded
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Work on a copy of the test frame (it still contains the ID column here)
X_test = test_df.copy()

# Fitted encoder per categorical column
encoders = {}

for col in categorical_features:
    col_encoder = LabelEncoder()
    X[col] = col_encoder.fit_transform(X[col])
    encoders[col] = col_encoder

    # Values that occur only in test are appended to the fitted classes so
    # that transform() does not reject them.
    test_only_values = set(X_test[col]) - set(col_encoder.classes_)
    if test_only_values:
        col_encoder.classes_ = np.append(col_encoder.classes_, list(test_only_values))

    X_test[col] = col_encoder.transform(X_test[col])

### Train

In [None]:
# Inspect missing values, the encoded target and the feature frame.
print(X.isnull().sum())
# print(y_encoded.isnull().sum())
print(y_encoded)
print(X)

# Replace missing values with the column mean.
# Assign the result back instead of calling fillna(..., inplace=True) on
# X[col]: the in-place form operates on a temporary Series, which emits a
# chained-assignment warning in pandas 2.x and leaves X unchanged under
# copy-on-write.
for col in X.columns:
    if X[col].isnull().any():
        mean_val = X[col].mean()
        X[col] = X[col].fillna(mean_val)

In [15]:
# Inspect missing values, the encoded target and the feature frame.
print(X.isnull().sum())
# print(y_encoded.isnull().sum())
print(y_encoded)
print(X)

# Replace missing values with 0. The same imputation is applied to the test
# features so that the model sees identically preprocessed inputs at
# training and prediction time.
X.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

기준년월                   0
남녀구분코드                 0
연령                     0
회원여부_이용가능              0
회원여부_이용가능_CA           0
                   ...  
변동률_잔액_B1M             0
변동률_잔액_일시불_B1M         0
변동률_잔액_CA_B1M          0
혜택수혜율_R3M         488746
혜택수혜율_B0M         555522
Length: 856, dtype: int64
[3 4 2 ... 2 4 4]
           기준년월  남녀구분코드  연령  회원여부_이용가능  회원여부_이용가능_CA  회원여부_이용가능_카드론  소지여부_신용  \
0        201807       2   2          1             1              0        1   
1        201807       1   1          1             1              1        1   
2        201807       1   1          1             1              0        1   
3        201807       2   2          1             1              0        1   
4        201807       2   2          1             1              1        1   
...         ...     ...  ..        ...           ...            ...      ...   
2399995  201812       2   5          1             1              1        1   
2399996  201812       2   3          1   

In [16]:
# Histogram tree method with a fixed seed (the original note: CPU-based training)
xgb_params = {"tree_method": "hist", "random_state": 42}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X, y_encoded)

### Predict

In [17]:
# ID is an identifier, not a feature. errors='ignore' keeps the cell
# re-runnable: a second run no longer raises KeyError once ID is gone.
X_test.drop(columns=['ID'], inplace=True, errors='ignore')

In [18]:
# Predict one class per test row
y_test_pred = model.predict(X_test)
# Map the integer predictions back to the original labels
y_test_pred_labels = le_target.inverse_transform(y_test_pred)

# Attach the row-level predictions to a copy of the test frame;
# assign() returns a new frame, so test_df itself is left untouched.
test_data = test_df.assign(pred_label=y_test_pred_labels)

### Submission

In [19]:
def _most_frequent_label(labels):
    """Return the value that occurs most often in `labels`."""
    return labels.value_counts().idxmax()


# Collapse the row-level predictions to one label per ID (majority vote)
predictions_by_id = test_data.groupby("ID")["pred_label"]
submission = predictions_by_id.agg(_most_frequent_label).reset_index()

submission.columns = ["ID", "Segment"]
submission.to_csv('../submit/base_submit.csv',index=False)

In [None]:
# XGBoost: hold-out validation

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# 80/20 stratified split of the training data
holdout_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, **holdout_options)

# Fit on the training part only.
# Note: this rebinds `model`, which the prediction cell also uses.
model = xgb.XGBClassifier(tree_method='hist', random_state=42)
model.fit(X_train, y_train)

# Score the held-out part with macro-averaged F1
y_val_pred = model.predict(X_val)
f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"Validation Macro F1 Score: {f1:.4f}")


Validation Macro F1 Score: 0.7824


In [27]:
# 1. Fit a plain XGBoost model on the full feature frame
xgb_model = xgb.XGBClassifier(tree_method='hist', random_state=42)
xgb_model.fit(X, y_encoded)

# 2. Read the per-feature importances
importances = xgb_model.feature_importances_
feature_names = X.columns

# 3. Keep the top_n most important features (here: 300).
#    argsort is ascending, so the last top_n positions are the largest.
top_n = 300
ascending_positions = np.argsort(importances)
top_features = feature_names[ascending_positions[-top_n:]]

# 4. Restrict X to the selected features
X_selected = X[top_features]


In [24]:
%pip install lightgbm catboost

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.7-cp39-cp39-win_amd64.whl.metadata (1.2 kB)
Collecting numpy>=1.17.0 (from lightgbm)
  Using cached numpy-1.26.4-cp39-cp39-win_amd64.whl.metadata (61 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.0.1-py3-none-any.whl.metadata (6.7 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-1.32.0-py3-none-any.whl.metadata (9.2 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 38.4 MB/s eta 0:00:00
Downloading catboost-1.2.7-cp39-cp39-win_amd64.whl (101.8 MB)
   ---------------------------------------- 0.0/101.8 MB ? eta -:--:--
   --- ------------------------------------ 8.9/101.8 MB 46.0 MB/s eta 0:00:03
   --- ------------------------------------ 9.4/101.8 MB 45.2 MB/s eta 0:00:03
 

  You can safely remove it manually.
  You can safely remove it manually.


In [28]:
# LightGBM: hold-out validation on the selected features

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# 80/20 stratified split
X_train, X_val, y_train, y_val = train_test_split(
    X_selected,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Default LightGBM classifier with a fixed seed
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)

# Macro-averaged F1 on the held-out part
y_val_pred_lgb = lgb_model.predict(X_val)
f1_lgb = f1_score(y_val, y_val_pred_lgb, average='macro')
print(f"LightGBM Validation Macro F1 Score: {f1_lgb:.4f}")


[WinError 2] 지정된 파일을 찾을 수 없습니다
  File "c:\Users\SSAFY\Desktop\개인폴더\DACON\.venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "C:\Users\SSAFY\AppData\Local\Programs\Python\Python39\lib\subprocess.py", line 505, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\SSAFY\AppData\Local\Programs\Python\Python39\lib\subprocess.py", line 951, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\SSAFY\AppData\Local\Programs\Python\Python39\lib\subprocess.py", line 1420, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.796840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43004
[LightGBM] [Info] Number of data points in the train set: 1920000, number of used features: 300
[LightGBM] [Info] Start training from score -7.811109
[LightGBM] [Info] Start training from score -9.722904
[LightGBM] [Info] Start training from score -2.934402
[LightGBM] [Info] Start training from score -1.927457
[LightGBM] [Info] Start training from score -0.222076
LightGBM Validation Macro F1 Score: 0.5077


In [44]:
# ----------------------------------------------
# 1. Free memory before loading
# ----------------------------------------------
import gc
import pandas as pd
import numpy as np

gc.collect()

# ----------------------------------------------
# 2. Load the merged datasets saved earlier in this notebook
# ----------------------------------------------
train_df = pd.read_csv('../data/all_train.csv')
test_df = pd.read_csv('../data/all_test.csv')

# ----------------------------------------------
# 3. Split features and target
# ----------------------------------------------
target_col = 'Segment'  # the label column used by the encoding and submission cells

# Columns that must not be used as features:
#   - 'ID' is a row identifier (and cannot be cast to float)
#   - 'Unnamed: *' is the index column read_csv produces when the CSV was
#     written together with its index
non_feature_cols = [
    col for col in train_df.columns
    if col == 'ID' or str(col).startswith('Unnamed:')
]
X = train_df.drop(columns=[target_col] + non_feature_cols)
y = train_df[target_col]

# Same columns in the same order as X. test_df itself is left untouched so
# its ID column stays available for building a submission.
X_test_aligned = test_df.reindex(columns=X.columns)

# ----------------------------------------------
# 4. Encode strings, impute missing values, downcast to float32
# ----------------------------------------------
# String columns are turned into integer codes using the categories seen in
# train; missing values and values that only occur in test get code -1.
# NOTE(review): assumes a column that is numeric in train is also numeric in
# test; otherwise the float32 cast below fails. Confirm against the data.
for col in X.select_dtypes(include=['object']).columns:
    train_categories = pd.Categorical(X[col]).categories
    X[col] = pd.Categorical(X[col], categories=train_categories).codes
    X_test_aligned[col] = pd.Categorical(
        X_test_aligned[col], categories=train_categories
    ).codes

# Impute with the TRAIN column means on both frames (PCA cannot handle NaN).
# Columns that are entirely missing in train have no mean; use 0 for them.
fill_values = X.mean().fillna(0)
X = X.fillna(fill_values).astype('float32')
X_test_aligned = X_test_aligned.fillna(fill_values).astype('float32')

# ----------------------------------------------
# 5. PCA dimensionality reduction
# ----------------------------------------------
from sklearn.decomposition import PCA

# A float n_components keeps as many components as needed to explain that
# fraction of the variance.
# NOTE(review): features are not standardized first, so large-scale columns
# will dominate the components; confirm this is intended.
pca = PCA(n_components=0.95, random_state=42)
X_reduced = pca.fit_transform(X)
test_reduced = pca.transform(X_test_aligned)

# ----------------------------------------------
# 6. Train / validation split
# ----------------------------------------------
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42, stratify=y
)

# ----------------------------------------------
# 7. Train the LightGBM model
# ----------------------------------------------
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

model = LGBMClassifier(n_estimators=100, max_depth=8, random_state=42)
model.fit(X_train, y_train)

# ----------------------------------------------
# 8. Validation score
# ----------------------------------------------
y_pred = model.predict(X_val)
print("Validation Macro F1 Score:", f1_score(y_val, y_pred, average='macro'))

# ----------------------------------------------
# 9. Predict on the test data (if needed)
# ----------------------------------------------
# test_pred = model.predict(test_reduced)


MemoryError: Unable to allocate 18.3 MiB for an array with shape (2400000,) and data type int64

In [32]:
# Remove catboost and numpy from the kernel's environment without a confirmation prompt (-y).
# pip's recorded output notes that the kernel may need a restart before the change takes effect.
%pip uninstall catboost numpy -y


Found existing installation: catboost 1.2.7
Uninstalling catboost-1.2.7:
  Successfully uninstalled catboost-1.2.7
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Note: you may need to restart the kernel to use updated packages.


You can safely remove it manually.


In [33]:
# Install numpy pinned to 1.24.4. pip's recorded output notes that the kernel
# may need a restart before the new version is used.
%pip install numpy==1.24.4


Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp39-cp39-win_amd64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp39-cp39-win_amd64.whl (14.9 MB)
   ---------------------------------------- 0.0/14.9 MB ? eta -:--:--
   ---------------- ----------------------- 6.3/14.9 MB 55.3 MB/s eta 0:00:01
   ------------------------------- -------- 11.8/14.9 MB 30.8 MB/s eta 0:00:01
   ---------------------------------------- 14.9/14.9 MB 26.7 MB/s eta 0:00:00
Installing collected packages: numpy
Successfully installed numpy-1.24.4
Note: you may need to restart the kernel to use updated packages.


In [34]:
%pip install catboost


Collecting catboost
  Using cached catboost-1.2.7-cp39-cp39-win_amd64.whl.metadata (1.2 kB)
Using cached catboost-1.2.7-cp39-cp39-win_amd64.whl (101.8 MB)
Installing collected packages: catboost
Successfully installed catboost-1.2.7
Note: you may need to restart the kernel to use updated packages.


In [37]:
import numpy
import catboost

# Print the installed version of each package, numpy first
for package in (numpy, catboost):
    print(package.__version__)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [30]:
# CatBoost: hold-out validation on the selected features

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# 80/20 stratified split
X_train, X_val, y_train, y_val = train_test_split(
    X_selected,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Silent CatBoost classifier with a fixed seed
cat_model = CatBoostClassifier(verbose=0, random_state=42)
cat_model.fit(X_train, y_train)

# Macro-averaged F1 on the held-out part
y_val_pred_cat = cat_model.predict(X_val)
f1_cat = f1_score(y_val, y_val_pred_cat, average='macro')
print(f"CatBoost Validation Macro F1 Score: {f1_cat:.4f}")


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject