In [4]:
import re

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the three competition CSVs: the labelled training rows, the unlabelled
# test rows, and the submission template that is filled in at the end.
df_train = pd.read_csv("data-files/train.csv")
df_test = pd.read_csv("data-files/test.csv")
df_sample = pd.read_csv("data-files/sample_submission.csv")

In [5]:
# ===== Convert company valuation to a number ===== #
def parse_valuation(v):
    """Convert one raw valuation entry into a float.

    Accepted inputs:
      * missing values (anything ``pd.isna`` reports)   -> ``np.nan``
      * plain numbers or numeric strings ("2500", 2500.0, "2.5") -> that value
      * ranges such as "2500-3500" or "1,500-2,500"     -> midpoint of the
        first two numbers
      * a number with surrounding text ("6000이상")      -> the first number
      * text containing no number at all, or ""         -> ``np.nan``

    Thousands separators (",") are removed before parsing. A sign is only
    honoured when the whole entry is a plain number; inside the text fallback
    "-" is treated as a range separator.

    Parameters
    ----------
    v : Any
        Raw cell value from the valuation column.

    Returns
    -------
    float
        Parsed valuation, or ``np.nan`` when nothing numeric can be extracted.
    """
    if pd.isna(v):
        return np.nan

    text = str(v).replace(",", "").strip()

    # Fast path: the entry is already a well-formed number. This keeps the
    # decimal point intact (filtering on str.isdigit turned "2500.0" into 25000).
    try:
        return float(text)
    except ValueError:
        pass

    # Fallback: pull out every (optionally decimal) number embedded in the text.
    numbers = [float(token) for token in re.findall(r"\d+(?:\.\d+)?", text)]
    if not numbers:
        return np.nan

    # "low-high" band -> midpoint. An open-ended band such as "2500-" has only
    # one number and falls through to the single-value case below.
    if '-' in text and len(numbers) >= 2:
        return (numbers[0] + numbers[1]) / 2

    return numbers[0]

# Overwrite the raw valuation column with its parsed numeric value in both
# the training and the test frame.
for frame in (df_train, df_test):
    frame['기업가치(백억원)'] = frame['기업가치(백억원)'].apply(parse_valuation)

In [6]:
# ===== Log transform (heavily skewed features) ===== #
# Each listed column is replaced in place by log1p of itself, in both frames.
# NOTE: because the columns are overwritten, re-running this cell applies
# log1p a second time to already-transformed values.
log_features = ['총 투자금(억원)', '연매출(억원)', 'SNS 팔로워 수(백만명)', '기업가치(백억원)', '고객수(백만명)']
for frame in (df_train, df_test):
    for feature_name in log_features:
        frame[feature_name] = np.log1p(frame[feature_name])

In [7]:
# ===== Derived features ===== #
# Adds four columns to both frames, in this order:
#   회사나이     : 2025 minus the founding year
#   순이익       : revenue column minus total-investment column
#   투자금_비율  : revenue column divided by total-investment column
#   가치_비율    : valuation column divided by total-investment column
# The 1e-6 added to each denominator guards against division by zero.
# NOTE(review): when the cells run in the order shown, the revenue, investment
# and valuation columns already hold log1p values, so these are differences and
# ratios of logs rather than of the raw amounts - confirm this is intended.
for frame in (df_train, df_test):
    revenue = frame['연매출(억원)']
    investment = frame['총 투자금(억원)']
    safe_investment = investment + 1e-6

    frame['회사나이'] = 2025 - frame['설립연도']
    frame['순이익'] = revenue - investment
    frame['투자금_비율'] = revenue / safe_investment
    frame['가치_비율'] = frame['기업가치(백억원)'] / safe_investment

In [8]:
# ===== Categorical encoding ===== #
# Missing categories become the literal label "Unknown". One LabelEncoder per
# column is fitted on train and test values together, so every label that
# occurs in either frame has a code, and both frames share the same mapping.
categorical_cols = ['국가', '투자단계', '분야']
for cat_col in categorical_cols:
    train_labels = df_train[cat_col].fillna("Unknown")
    test_labels = df_test[cat_col].fillna("Unknown")

    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_labels, test_labels]))

    df_train[cat_col] = encoder.transform(train_labels)
    df_test[cat_col] = encoder.transform(test_labels)

In [9]:
# ===== Select model features ===== #
# The final column order is: log-transformed numeric columns, derived columns,
# then label-encoded categorical columns.
numeric_features = [
    '총 투자금(억원)',
    '연매출(억원)',
    'SNS 팔로워 수(백만명)',
    '기업가치(백억원)',
    '고객수(백만명)',
]
derived_features = ['회사나이', '순이익', '투자금_비율', '가치_비율']
encoded_features = ['국가', '투자단계', '분야']

features = numeric_features + derived_features + encoded_features
target = '성공확률'

# Any remaining missing feature value is filled with 0 before modelling.
X_test = df_test[features].fillna(0)
X = df_train[features].fillna(0)
y = df_train[target]

In [10]:
# ===== K-fold cross-validation ===== #
# For each fold a fresh GradientBoostingRegressor is fitted on the training
# part; its predictions on the held-out part are stored as out-of-fold (OOF)
# predictions, and its predictions on X_test are averaged across folds.
n_splits = 5  # single source of truth for the fold count and the averaging below
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))         # one held-out prediction per training row
test_preds = np.zeros(len(X_test))   # running mean of the per-fold test predictions

for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
    print(f"===== Fold {fold} =====")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = GradientBoostingRegressor(random_state=42)
    model.fit(X_train, y_train)

    val_pred = model.predict(X_val)
    oof_preds[val_idx] = val_pred
    # Divide by the actual number of folds so the average stays correct if
    # n_splits is changed.
    test_preds += model.predict(X_test) / n_splits

    # Mean absolute error on this fold's held-out rows.
    val_loss = np.mean(np.abs(y_val - val_pred))
    print(f"Fold {fold} Validation MAE: {val_loss:.4f}")

===== Fold 1 =====
Fold 1 Validation MAE: 0.2069
===== Fold 2 =====
Fold 2 Validation MAE: 0.2069
===== Fold 3 =====
Fold 3 Validation MAE: 0.2029
===== Fold 4 =====
Fold 4 Validation MAE: 0.2031
===== Fold 5 =====
Fold 5 Validation MAE: 0.2060


In [11]:
# ===== Overall validation result ===== #
# Mean absolute error between the targets and the out-of-fold predictions
# collected across all folds.
oof_abs_errors = np.abs(y - oof_preds)
total_val_loss = np.mean(oof_abs_errors)
print(f"\n===== Overall Validation MAE: {total_val_loss:.4f} =====")


===== Overall Validation MAE: 0.2052 =====


In [13]:
# ===== Save the submission file ===== #
# Put the fold-averaged test predictions into the submission template and
# write it out as CSV without the index column.
submission_path = 'submission.csv'
df_sample['성공확률'] = test_preds
df_sample.to_csv(submission_path, index=False)
print("제출 파일 저장 완료: submission.csv")

제출 파일 저장 완료: submission.csv
