In [2]:
import re

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the three CSV inputs: the training set, the test set, and the sample
# submission whose target column is overwritten at the end of the notebook.
df_train, df_test, df_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data-files/train.csv",
        "data-files/test.csv",
        "data-files/sample_submission.csv",
    )
)

In [5]:
# ===== 2. Company valuation preprocessing ===== #
def parse_valuation(v):
    """Convert a raw valuation cell into a single float.

    Accepted inputs:
      * missing values (``None`` / ``NaN``)  -> ``np.nan``
      * real numbers (``int``, ``float``, numpy scalars) -> returned as ``float``
        unchanged, so running the conversion twice on a column is harmless
      * range strings such as ``"2500-3500"`` -> midpoint of the two bounds
      * single-number strings, optionally with surrounding text
        (e.g. ``"6000이상"``) or a decimal point (``"1.5"``) -> that number
      * strings containing no number at all -> ``np.nan``

    Thousands separators (``,``) are stripped before parsing. In strings, ``-``
    is treated as a range separator, never as a minus sign. If more than two
    numbers are present, only the first two are used as the range bounds.

    Parameters
    ----------
    v : object
        Raw cell value.

    Returns
    -------
    float
        Parsed valuation, or ``np.nan`` when nothing can be parsed.
    """
    if pd.isna(v):
        return np.nan

    # Numeric input must not go through str(): "3000.0" stripped to its digits
    # would become 30000.0.
    if isinstance(v, (int, float, np.number)):
        return float(v)

    text = str(v).replace(",", "")
    # Pull out every decimal number; any separator or trailing unit text is ignored.
    bounds = [float(token) for token in re.findall(r"\d*\.?\d+", text)]

    if not bounds:
        return np.nan
    if len(bounds) == 1:
        return bounds[0]

    low, high = bounds[0], bounds[1]
    return (low + high) / 2

# Replace the raw valuation column with its parsed numeric value in both frames.
for frame in (df_train, df_test):
    frame['기업가치(백억원)'] = frame['기업가치(백억원)'].apply(parse_valuation)

# ===== 3. Columns to log-transform ===== #
log_features = [
    '총 투자금(억원)',
    '연매출(억원)',
    'SNS 팔로워 수(백만명)',
    '기업가치(백억원)',
    '고객수(백만명)',
]

# Overwrite each listed column with log(1 + x), for train and test alike.
# The columns are replaced in place, so re-running this cell applies the
# transform a second time.
for frame in (df_train, df_test):
    for col in log_features:
        frame[col] = np.log1p(frame[col])

# ===== 4. Derived features ===== #
# NOTE(review): the investment, revenue and valuation columns read here were
# already replaced by their log1p values in step 3, so '순이익' is a difference
# of logs and the two ratios are ratios of logs — confirm this is intended.
for frame in (df_train, df_test):
    invested = frame['총 투자금(억원)']
    revenue = frame['연매출(억원)']

    # Company age relative to the hard-coded reference year 2025.
    frame['회사나이'] = 2025 - frame['설립연도']

    frame['순이익'] = revenue - invested

    # The 1e-6 term keeps the denominators away from exact zero.
    frame['투자금_비율'] = revenue / (invested + 1e-6)
    frame['가치_비율'] = frame['기업가치(백억원)'] / (invested + 1e-6)

# ===== 5. Categorical encoding ===== #
categorical_cols = ['국가', '투자단계', '분야']

for col in categorical_cols:
    # Missing categories get the explicit "Unknown" label.
    train_filled = df_train[col].fillna("Unknown")
    test_filled = df_test[col].fillna("Unknown")

    # Fit one encoder on train and test together so that every label seen in
    # either frame has an integer code.
    all_data = pd.concat([train_filled, test_filled])
    le = LabelEncoder().fit(all_data)

    df_train[col] = le.transform(train_filled)
    df_test[col] = le.transform(test_filled)

# ===== 6. Features to use ===== #
target = '성공확률'
features = [
    '총 투자금(억원)',
    '연매출(억원)',
    'SNS 팔로워 수(백만명)',
    '기업가치(백억원)',
    '고객수(백만명)',
    '회사나이',
    '순이익',
    '투자금_비율',
    '가치_비율',
    '국가',
    '투자단계',
    '분야',
]

# Design matrices: any value still missing in a selected column becomes 0.
X_test = df_test[features].fillna(0)
X = df_train[features].fillna(0)
y = df_train[target]

# ===== 7. Train / validation split ===== #
# Hold out 20% of the training rows; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ===== 8. Model training ===== #
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Score the hold-out split, which was previously created but never used, so the
# model's quality is visible before a submission is written. The fitted model
# itself is unchanged (it is still trained on X_train only).
val_preds = model.predict(X_val)
val_mae = np.abs(y_val.to_numpy() - val_preds).mean()
val_r2 = model.score(X_val, y_val)
print(f"Validation MAE: {val_mae:.4f} | R^2: {val_r2:.4f}")

# ===== 9. Predict and save ===== #
# Predict on the test features and write the predictions into the target
# column of the sample-submission frame.
preds = model.predict(X_test)
df_sample['성공확률'] = preds

# Write the submission without the DataFrame index column.
df_sample.to_csv(path_or_buf="submission.csv", index=False)