In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/health-insurance-cross-sell-prediction-data/train.csv
/kaggle/input/health-insurance-cross-sell-prediction-data/test.csv
/kaggle/input/playground-series-s4e7/sample_submission.csv
/kaggle/input/playground-series-s4e7/train.csv
/kaggle/input/playground-series-s4e7/test.csv


In [2]:
import pandas as pd
import numpy as np

# Directories of the two Kaggle inputs read below: the Playground Series S4E7
# competition files and the separate health-insurance cross-sell dataset.
PLAYGROUND_DIR = "/kaggle/input/playground-series-s4e7"
ORIGIN_DIR = "/kaggle/input/health-insurance-cross-sell-prediction-data"

# pd.read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is needed.
train = pd.read_csv(f"{PLAYGROUND_DIR}/train.csv")
test = pd.read_csv(f"{PLAYGROUND_DIR}/test.csv")
submission = pd.read_csv(f"{PLAYGROUND_DIR}/sample_submission.csv")
origin_train = pd.read_csv(f"{ORIGIN_DIR}/train.csv")

In [3]:
# Append the rows of origin_train to the competition training rows.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own row labels and the combined frame carries duplicates.
# NOTE: re-running this cell appends origin_train again, so run it once per
# kernel session.
train = pd.concat([train, origin_train], ignore_index=True)

In [4]:
def reduce_memory_usage(df):
    """Downcast signed-integer columns of ``df`` to the narrowest fitting width.

    Every column whose dtype is a NumPy signed integer is cast to the first of
    int8, int16, int32 whose range contains both the column minimum and the
    column maximum (bounds inclusive). Columns of any other dtype, and integer
    columns that need the full 64-bit range, are left untouched.

    Args:
        df: pandas DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy signed integers are candidates. Checking the dtype
        # before touching the data avoids computing min/max on columns that
        # are never downcast (e.g. floats) or that cannot be ordered at all
        # (e.g. unordered categoricals, where min() raises).
        if not (isinstance(col_type, np.dtype) and col_type.kind == 'i'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in (np.int8, np.int16, np.int32):
            limits = np.iinfo(candidate)
            # Inclusive comparison: a value equal to the type's own limit
            # (e.g. 127 for int8) still fits. For an empty column min/max are
            # NaN, every comparison is False, and the column is left as is.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
    return df

def preprocess_data(df):
    """Add interaction features and integer-encode the raw columns.

    Steps, in order:
      1. Drop the 'id' column when present.
      2. Add four 'Previously_Insured_<col>' interaction columns combining
         'Previously_Insured' with 'Annual_Premium', 'Vehicle_Age',
         'Vehicle_Damage' and 'Vintage' (using the raw, un-encoded values).
      3. Cast 'Region_Code', 'Policy_Sales_Channel' and 'Annual_Premium' to
         int, and any bool column to int.
      4. Map the 'Gender', 'Vehicle_Age' and 'Vehicle_Damage' strings to
         integers; values missing from a map become NaN.
      5. Downcast integer columns with reduce_memory_usage.

    The interaction columns hold the combined value itself, formatted as
    "<previously_insured>_<value>", instead of pd.factorize codes. factorize
    numbers values in order of first appearance within the frame it is given,
    so calling this function separately on two frames assigned different
    codes to the same combination. The string key is identical in every
    frame. The trade-off is that these four columns are stored as strings
    (object dtype) rather than integers.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        A new DataFrame with the engineered and encoded columns.
    """
    # Drop 'id' if it exists. drop() returns a new DataFrame object, so the
    # interaction columns added below are no longer attached to the caller's
    # frame as a side effect.
    df = df.drop(columns=['id'], errors='ignore')

    insured = df['Previously_Insured'].astype(str)
    for other in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage'):
        # The separator keeps the two parts of the key unambiguous.
        df[f'Previously_Insured_{other}'] = insured + '_' + df[other].astype(str)

    # Data type conversion.
    df['Region_Code'] = df['Region_Code'].astype(int)
    df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype(int)
    df['Annual_Premium'] = df['Annual_Premium'].astype(int)

    for col in df.columns:
        if df[col].dtype == bool:
            df[col] = df[col].astype(int)

    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    df['Vehicle_Age'] = df['Vehicle_Age'].map({'> 2 Years': 2, '1-2 Year': 1, '< 1 Year': 0})
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'Yes': 1, 'No': 0})

    df = reduce_memory_usage(df)

    return df

In [5]:
# Run both frames through the same preprocessing; each name is rebound to the
# frame returned by preprocess_data.
# NOTE(review): re-running this cell would feed the already-encoded columns
# back through preprocess_data's string -> int maps; presumably it is meant
# to run once per kernel session — confirm.
train = preprocess_data(train)
test = preprocess_data(test)

In [6]:
# Columns excluded from the categorical-feature list (this includes the
# 'Response' target). Every other column of `train` stays in cat_features.
remove_list = ['Age', 'Annual_Premium', 'Vintage', 'Vehicle_Age', 'Response']
cat_features = train.columns.tolist()
for excluded in remove_list:
    cat_features.remove(excluded)

In [7]:
from catboost import CatBoostClassifier, Pool
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import gc

random_state = 777
num_dataset = 0

# Split the training frame into the 'Response' target and the predictors.
target = train['Response']
features = train.drop(columns=['Response'])

# Cast the categorical columns to strings, in both the training features and
# the test frame, before they are handed to CatBoost.
for name in cat_features:
    features[name] = features[name].astype(str)
    test[name] = test[name].astype(str)

# Hold out 1% of the rows, stratified on the target, as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.01,
    stratify=target,
    random_state=random_state,
)

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
valid_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

# These intermediate frames are not needed by the code below once the pools
# exist; drop the names and trigger a collection to release their memory.
del features, target
del X_train, y_train
del train
gc.collect()

# CatBoost model configuration, collected in one mapping.
catboost_params = dict(
    loss_function='Logloss',
    eval_metric='Logloss',
    learning_rate=0.05,
    iterations=1500,
    fold_permutation_block=1,
    random_strength=0,
    l2_leaf_reg=5,
    task_type='GPU',
    random_seed=random_state,
    verbose=500,
    allow_writing_files=False,
)
model = CatBoostClassifier(**catboost_params)

# Train on the training pool, evaluating on the validation pool with an
# early-stopping window of 200 rounds.
model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=200)

# Score the validation pool: the probability column at index 1 (presumably
# the positive class) and the predicted labels.
y_pred_prob = model.predict_proba(valid_pool)[:, 1]
y_pred = model.predict(valid_pool)

# Label-based metrics use the predicted labels; AUC uses the probabilities.
accuracy_all, recall, precision, f1 = (
    scorer(y_val, y_pred)
    for scorer in (accuracy_score, recall_score, precision_score, f1_score)
)
auc = roc_auc_score(y_val, y_pred_prob)

# Print every metric with four decimal places.
for label, value in (
    ('Accuracy', accuracy_all),
    ('Recall', recall),
    ('Precision', precision),
    ('F1 Score', f1),
    ('AUC', auc),
):
    print(f'{label}: {value:.4f}')

0:	learn: 0.6085881	test: 0.6081490	best: 0.6081490 (0)	total: 6.46s	remaining: 2h 41m 16s
500:	learn: 0.2435909	test: 0.2407489	best: 0.2407489 (500)	total: 8m 25s	remaining: 16m 48s
1000:	learn: 0.2428963	test: 0.2401716	best: 0.2401716 (1000)	total: 16m 53s	remaining: 8m 25s
1499:	learn: 0.2425411	test: 0.2399265	best: 0.2399265 (1499)	total: 25m 13s	remaining: 0us
bestTest = 0.2399265023
bestIteration = 1499


In [None]:
# Predict on the test frame and save the result: the probability column at
# index 1 becomes the submission's 'Response' column, written to submission.csv.
test_pool = Pool(data=test, cat_features=cat_features)
response_proba = model.predict_proba(test_pool)[:, 1]
submission['Response'] = response_proba
submission.to_csv(path_or_buf='submission.csv', index=False)