<a href="https://colab.research.google.com/github/Yeyeong99/Aiffel/blob/main/contest/%EC%8B%A0%EC%9A%A9%EC%B9%B4%EB%93%9C_%EA%B3%A0%EA%B0%9D_%EC%84%B8%EA%B7%B8%EB%A8%BC%ED%8A%B8_%EB%B6%84%EB%A5%98/CatBoost_%ED%99%9C%EC%9A%A9_%EC%8B%A0%EC%9A%A9%EC%B9%B4%EB%93%9C_%EA%B3%A0%EA%B0%9D_%EC%84%B8%EA%B7%B8%EB%A8%BC%ED%8A%B8_%EB%B6%84%EB%A5%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import

In [1]:
import pandas as pd
import numpy as np
import gc

from sklearn.preprocessing import LabelEncoder

### Data Load

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# 데이터 분할(폴더) 구분
data_splits = ["train", "test"]

# 각 데이터 유형별 폴더명, 파일 접미사, 변수 접두어 설정
data_categories = {
    "회원정보": {"folder": "1.회원정보", "suffix": "회원정보", "var_prefix": "customer"},
    "신용정보": {"folder": "2.신용정보", "suffix": "신용정보", "var_prefix": "credit"},
    "승인매출정보": {"folder": "3.승인매출정보", "suffix": "승인매출정보", "var_prefix": "sales"},
    "청구정보": {"folder": "4.청구입금정보", "suffix": "청구정보", "var_prefix": "billing"},
    "잔액정보": {"folder": "5.잔액정보", "suffix": "잔액정보", "var_prefix": "balance"},
    "채널정보": {"folder": "6.채널정보", "suffix": "채널정보", "var_prefix": "channel"},
    "마케팅정보": {"folder": "7.마케팅정보", "suffix": "마케팅정보", "var_prefix": "marketing"},
    "성과정보": {"folder": "8.성과정보", "suffix": "성과정보", "var_prefix": "performance"}
}

# 2018년 7월부터 12월까지의 월 리스트
months = ['07', '08', '09', '10', '11', '12']

for split in data_splits:
    for category, info in data_categories.items():
        folder = info["folder"]
        suffix = info["suffix"]
        var_prefix = info["var_prefix"]

        for month in months:
            #
            # 파일명 형식: /content/drive/MyDrive/{split}/2018{month}_{split}_{suffix}.parquet
            file_path = f"/content/drive/MyDrive/{split}/{folder}/2018{month}_{split}_{suffix}.parquet"
            # 변수명 형식: {var_prefix}_{split}_{month}
            variable_name = f"{var_prefix}_{split}_{month}"
            globals()[variable_name] = pd.read_parquet(file_path)
            print(f"{variable_name} is loaded from {file_path}")

gc.collect()

customer_train_07 is loaded from /content/drive/MyDrive/train/1.회원정보/201807_train_회원정보.parquet
customer_train_08 is loaded from /content/drive/MyDrive/train/1.회원정보/201808_train_회원정보.parquet
customer_train_09 is loaded from /content/drive/MyDrive/train/1.회원정보/201809_train_회원정보.parquet
customer_train_10 is loaded from /content/drive/MyDrive/train/1.회원정보/201810_train_회원정보.parquet
customer_train_11 is loaded from /content/drive/MyDrive/train/1.회원정보/201811_train_회원정보.parquet
customer_train_12 is loaded from /content/drive/MyDrive/train/1.회원정보/201812_train_회원정보.parquet
credit_train_07 is loaded from /content/drive/MyDrive/train/2.신용정보/201807_train_신용정보.parquet
credit_train_08 is loaded from /content/drive/MyDrive/train/2.신용정보/201808_train_신용정보.parquet
credit_train_09 is loaded from /content/drive/MyDrive/train/2.신용정보/201809_train_신용정보.parquet
credit_train_10 is loaded from /content/drive/MyDrive/train/2.신용정보/201810_train_신용정보.parquet
credit_train_11 is loaded from /content/drive/MyDrive/trai

0

### Data Preprocessing(1) : Concat & Merge

In [4]:
# 데이터 유형별 설정
info_categories = ["customer", "credit", "sales", "billing", "balance", "channel", "marketing", "performance"]

# 월 설정
months = ['07', '08', '09', '10', '11', '12']

In [5]:
#### Train ####

# 각 유형별로 월별 데이터를 합쳐서 새로운 변수에 저장
train_dfs = {}

for prefix in info_categories:
    # globals()에서 동적 변수명으로 데이터프레임들을 가져와 리스트에 저장
    df_list = [globals()[f"{prefix}_train_{month}"] for month in months]
    train_dfs[f"{prefix}_train_df"] = pd.concat(df_list, axis=0)
    gc.collect()
    print(f"{prefix}_train_df is created with shape: {train_dfs[f'{prefix}_train_df'].shape}")


customer_train_df = train_dfs["customer_train_df"]
credit_train_df   = train_dfs["credit_train_df"]
sales_train_df    = train_dfs["sales_train_df"]
billing_train_df  = train_dfs["billing_train_df"]
balance_train_df  = train_dfs["balance_train_df"]
channel_train_df  = train_dfs["channel_train_df"]
marketing_train_df= train_dfs["marketing_train_df"]
performance_train_df = train_dfs["performance_train_df"]

gc.collect()

customer_train_df is created with shape: (2400000, 78)
credit_train_df is created with shape: (2400000, 42)
sales_train_df is created with shape: (2400000, 406)
billing_train_df is created with shape: (2400000, 46)
balance_train_df is created with shape: (2400000, 82)
channel_train_df is created with shape: (2400000, 105)
marketing_train_df is created with shape: (2400000, 64)
performance_train_df is created with shape: (2400000, 49)


0

In [6]:
#### Test ####

# test 데이터에 대해 train과 동일한 방법 적용
test_dfs = {}

for prefix in info_categories:
    df_list = [globals()[f"{prefix}_test_{month}"] for month in months]
    test_dfs[f"{prefix}_test_df"] = pd.concat(df_list, axis=0)
    gc.collect()
    print(f"{prefix}_test_df is created with shape: {test_dfs[f'{prefix}_test_df'].shape}")


customer_test_df = test_dfs["customer_test_df"]
credit_test_df   = test_dfs["credit_test_df"]
sales_test_df    = test_dfs["sales_test_df"]
billing_test_df  = test_dfs["billing_test_df"]
balance_test_df  = test_dfs["balance_test_df"]
channel_test_df  = test_dfs["channel_test_df"]
marketing_test_df= test_dfs["marketing_test_df"]
performance_test_df = test_dfs["performance_test_df"]

gc.collect()

customer_test_df is created with shape: (600000, 77)
credit_test_df is created with shape: (600000, 42)
sales_test_df is created with shape: (600000, 406)
billing_test_df is created with shape: (600000, 46)
balance_test_df is created with shape: (600000, 82)
channel_test_df is created with shape: (600000, 105)
marketing_test_df is created with shape: (600000, 64)
performance_test_df is created with shape: (600000, 49)


0

In [7]:
#### Train ####

train_df = customer_train_df.merge(credit_train_df, on=['기준년월', 'ID'], how='left')
print("Step1 저장 완료: train_step1, shape:", train_df.shape)
del customer_train_df, credit_train_df
gc.collect()

# 이후 merge할 데이터프레임 이름과 단계 정보를 리스트에 저장
merge_list = [
    ("sales_train_df",    "Step2"),
    ("billing_train_df",  "Step3"),
    ("balance_train_df",  "Step4"),
    ("channel_train_df",  "Step5"),
    ("marketing_train_df","Step6"),
    ("performance_train_df", "최종")
]

# 나머지 단계 merge
for df_name, step in merge_list:
    # globals()로 동적 변수 접근하여 merge 수행
    train_df = train_df.merge(globals()[df_name], on=['기준년월', 'ID'], how='left')
    print(f"{step} 저장 완료: train_{step}, shape:", train_df.shape)
    # 사용한 변수는 메모리 해제를 위해 삭제
    del globals()[df_name]
    gc.collect()

Step1 저장 완료: train_step1, shape: (2400000, 118)
Step2 저장 완료: train_Step2, shape: (2400000, 522)
Step3 저장 완료: train_Step3, shape: (2400000, 566)
Step4 저장 완료: train_Step4, shape: (2400000, 646)
Step5 저장 완료: train_Step5, shape: (2400000, 749)
Step6 저장 완료: train_Step6, shape: (2400000, 811)
최종 저장 완료: train_최종, shape: (2400000, 858)


In [8]:
#### Test ####

test_df = customer_test_df.merge(credit_test_df, on=['기준년월', 'ID'], how='left')
print("Step1 저장 완료: test_step1, shape:", test_df.shape)
del customer_test_df, credit_test_df
gc.collect()

# 이후 merge할 데이터프레임 이름과 단계 정보를 리스트에 저장
merge_list = [
    ("sales_test_df",    "Step2"),
    ("billing_test_df",  "Step3"),
    ("balance_test_df",  "Step4"),
    ("channel_test_df",  "Step5"),
    ("marketing_test_df","Step6"),
    ("performance_test_df", "최종")
]

# 나머지 단계 merge
for df_name, step in merge_list:
    # globals()로 동적 변수 접근하여 merge 수행
    test_df = test_df.merge(globals()[df_name], on=['기준년월', 'ID'], how='left')
    print(f"{step} 저장 완료: test_{step}, shape:", test_df.shape)
    # 사용한 변수는 메모리 해제를 위해 삭제
    del globals()[df_name]
    gc.collect()

Step1 저장 완료: test_step1, shape: (600000, 117)
Step2 저장 완료: test_Step2, shape: (600000, 521)
Step3 저장 완료: test_Step3, shape: (600000, 565)
Step4 저장 완료: test_Step4, shape: (600000, 645)
Step5 저장 완료: test_Step5, shape: (600000, 748)
Step6 저장 완료: test_Step6, shape: (600000, 810)
최종 저장 완료: test_최종, shape: (600000, 857)


### Data Preprocessing(2) : Encoding

In [9]:
feature_cols = [col for col in train_df.columns if col not in ["ID", "Segment"]]

X = train_df[feature_cols].copy()
y = train_df["Segment"].copy()

# 타깃 라벨 인코딩
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

In [10]:
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

X_test = test_df.copy()

encoders = {}  # 각 컬럼별 encoder 저장

for col in categorical_features:
    le_train = LabelEncoder()
    X[col] = le_train.fit_transform(X[col])
    encoders[col] = le_train
    unseen_labels_val = set(X_test[col]) - set(le_train.classes_)
    if unseen_labels_val:
        le_train.classes_ = np.append(le_train.classes_, list(unseen_labels_val))
    X_test[col] = le_train.transform(X_test[col])

In [11]:
gc.collect()

0

### Train / Validation Split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

### Train

In [14]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphviz, catboost
Successfully installed catboost-1.2.7 graphviz-0.20.3


In [15]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier(iterations=1000,  # 최대 반복 횟수
                           learning_rate=0.05,
                           depth=6,
                           early_stopping_rounds=50,  # 50번 동안 개선 없으면 조기 종료
                           verbose=25)

In [16]:
cat_model.fit(X_train, y_train,
          eval_set=[(X_val, y_val)],
          early_stopping_rounds=50,
          use_best_model=True)

0:	learn: 1.4636473	test: 1.4636336	best: 1.4636336 (0)	total: 6.62s	remaining: 1h 50m 15s
25:	learn: 0.5133175	test: 0.5134812	best: 0.5134812 (25)	total: 3m 1s	remaining: 1h 53m 27s
50:	learn: 0.3611986	test: 0.3614936	best: 0.3614936 (50)	total: 6m 6s	remaining: 1h 53m 43s
75:	learn: 0.3158872	test: 0.3164358	best: 0.3164358 (75)	total: 9m 19s	remaining: 1h 53m 28s
100:	learn: 0.2973065	test: 0.2979440	best: 0.2979440 (100)	total: 12m 29s	remaining: 1h 51m 8s
125:	learn: 0.2871015	test: 0.2878270	best: 0.2878270 (125)	total: 15m 31s	remaining: 1h 47m 42s
150:	learn: 0.2796076	test: 0.2804217	best: 0.2804217 (150)	total: 18m 37s	remaining: 1h 44m 42s
175:	learn: 0.2736720	test: 0.2745512	best: 0.2745512 (175)	total: 21m 38s	remaining: 1h 41m 18s
200:	learn: 0.2689582	test: 0.2699051	best: 0.2699051 (200)	total: 24m 40s	remaining: 1h 38m 3s
225:	learn: 0.2649273	test: 0.2659319	best: 0.2659319 (225)	total: 27m 36s	remaining: 1h 34m 32s
250:	learn: 0.2615276	test: 0.2625858	best: 0.262

<catboost.core.CatBoostClassifier at 0x7e6d8dd22e50>

### Predict

In [None]:
X_test.drop(columns=['ID'],inplace=True)

In [44]:
# row-level 예측 수행
y_test_pred = cat_model.predict(X_test)
y_test_pred

array([['E'],
       ['E'],
       ['D'],
       ...,
       ['E'],
       ['C'],
       ['E']], dtype=object)

In [47]:
y_test_df = pd.DataFrame(y_test_pred, columns=['pred_label'])
y_test_df

Unnamed: 0,pred_label
0,E
1,E
2,D
3,E
4,E
...,...
599995,E
599996,E
599997,E
599998,C


In [45]:
y_flatten = y_test_pred.flatten()
y_flatten

array(['E', 'E', 'D', ..., 'E', 'C', 'E'], dtype=object)

In [23]:
# 예측 결과를 변환
# y_test_pred_labels = le_target.inverse_transform(y_test_pred)

# row 단위 예측 결과를 test_data에 추가
test_data = test_df.copy()  # 원본 유지
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Columns: 857 entries, 기준년월 to 혜택수혜율_B0M
dtypes: float64(61), int64(747), object(49)
memory usage: 3.8+ GB


In [25]:
test_data["pred_label"] = y_test_pred.flatten()

In [48]:
test_data2 = test_df.copy()
test_data2_concat = pd.concat([test_data2, y_test_df], axis=1)

### Submission

In [51]:
test_data2_concat.head()

Unnamed: 0,기준년월,ID,남녀구분코드,연령,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,소지카드수_이용가능_신용,...,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M,pred_label
0,201807,TEST_00000,1,40대,1,1,0,1,2,2,...,0.999998,0.999998,0.999998,0.999998,0.209395,0.231043,0.0,1.33277,1.780392,E
1,201807,TEST_00001,1,60대,1,1,0,1,1,1,...,1.044473,1.991974,0.999998,0.926569,-0.269161,-0.247241,0.0,0.0,0.0,E
2,201807,TEST_00002,1,40대,1,1,1,1,2,2,...,1.053083,0.999998,0.999998,0.999998,-0.12029,0.02927,0.0,4.123738,5.115589,D
3,201807,TEST_00003,2,40대,1,1,1,1,1,1,...,1.99163,0.999998,0.999998,0.999998,0.035807,-0.013359,0.0,0.093615,0.349994,E
4,201807,TEST_00004,2,40대,1,0,1,1,1,1,...,1.053743,0.999998,0.999998,0.999998,-0.53874,-0.449378,0.0,0.0,0.0,E


In [49]:
test_data.head()

Unnamed: 0,기준년월,ID,남녀구분코드,연령,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,소지카드수_이용가능_신용,...,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M,pred_label
0,201807,TEST_00000,1,40대,1,1,0,1,2,2,...,0.999998,0.999998,0.999998,0.999998,0.209395,0.231043,0.0,1.33277,1.780392,E
1,201807,TEST_00001,1,60대,1,1,0,1,1,1,...,1.044473,1.991974,0.999998,0.926569,-0.269161,-0.247241,0.0,0.0,0.0,E
2,201807,TEST_00002,1,40대,1,1,1,1,2,2,...,1.053083,0.999998,0.999998,0.999998,-0.12029,0.02927,0.0,4.123738,5.115589,D
3,201807,TEST_00003,2,40대,1,1,1,1,1,1,...,1.99163,0.999998,0.999998,0.999998,0.035807,-0.013359,0.0,0.093615,0.349994,E
4,201807,TEST_00004,2,40대,1,0,1,1,1,1,...,1.053743,0.999998,0.999998,0.999998,-0.53874,-0.449378,0.0,0.0,0.0,E


In [26]:
submission = test_data.groupby("ID")["pred_label"] \
    .agg(lambda x: x.value_counts().idxmax()) \
    .reset_index()

submission.columns = ["ID", "Segment"]

In [28]:
submission.to_csv('/content/drive/MyDrive/creditcard_catboost.csv',index=False)