In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df_sample = pd.read_csv("data-files/sample_submission.csv")
df_test = pd.read_csv("data-files/test.csv")
df_train = pd.read_csv("data-files/train.csv")

In [3]:
# 결측치 처리
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [4]:
df_train.columns

Index(['ID', '설립연도', '국가', '분야', '투자단계', '직원 수', '인수여부', '상장여부', '고객수(백만명)',
       '총 투자금(억원)', '연매출(억원)', 'SNS 팔로워 수(백만명)', '기업가치(백억원)', '성공확률'],
      dtype='object')

In [6]:
# 기업가치 컬럼 숫자로 바꾸기
def parse_valuation(v):
    if pd.isna(v):
        return np.nan
    v = str(v).replace(",","") # 콤마 제거(just in case)
    if '-' in v: # 범위형이면 평균 처리
        low, high = v.split('-')
        return (float(low) + float(high)) / 2
    else:
        # 숫자만 남기고 문자 삭제 (ex: 6000이상 -> 6000)
        num = ''.join(filter(str.isdigit,v))
        return float(num)

df_train['기업가치(백억원)'] = df_train['기업가치(백억원)'].apply(parse_valuation)
df_test['기업가치(백억원)'] = df_test['기업가치(백억원)'].apply(parse_valuation)

In [7]:
# feature 가공
# 순이익 column 추가
df_train['순이익'] = df_train['연매출(억원)'] - df_train['총 투자금(억원)']
df_test['순이익'] = df_test['연매출(억원)'] - df_test['총 투자금(억원)']

In [8]:
# 사용할 feature 설정
features = ['총 투자금(억원)', '연매출(억원)', '기업가치(백억원)', '순이익', 'SNS 팔로워 수(백만명)', '고객수(백만명)', '직원 수']

X = df_train[features]
y = df_train['성공확률']
X_test = df_test[features]

In [9]:
# 스케일링(표준화)
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [10]:
# train / validation 분리하기
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# MLP
class SimpleMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(len(features), 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
model = SimpleMLP()

In [22]:
# 학습 준비
criterion = nn.BCEWithLogitsLoss() # 시그모이드 없이 로짓 + BCE 한방 처리
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [23]:
# Tensor로 변환
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)
X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val.values).view(-1, 1)
X_test_tensor = torch.FloatTensor(X_test)

In [24]:
# 학습
epochs = 500
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    output = model(X_train_tensor)
    # print(type(output))
    loss = criterion(output, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 50 == 0:
        model.eval()
        val_output = model(X_val_tensor)
        val_loss = criterion(val_output, y_val_tensor)
        print(f"Epoch {epoch}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

Epoch 0, Train Loss: 0.6907, Val Loss: 0.6889
Epoch 50, Train Loss: 0.6873, Val Loss: 0.6918
Epoch 100, Train Loss: 0.6837, Val Loss: 0.6934
Epoch 150, Train Loss: 0.6784, Val Loss: 0.6948
Epoch 200, Train Loss: 0.6733, Val Loss: 0.6963
Epoch 250, Train Loss: 0.6689, Val Loss: 0.6980
Epoch 300, Train Loss: 0.6652, Val Loss: 0.6996
Epoch 350, Train Loss: 0.6622, Val Loss: 0.7025
Epoch 400, Train Loss: 0.6598, Val Loss: 0.7039
Epoch 450, Train Loss: 0.6577, Val Loss: 0.7060


In [25]:
# 예측
model.eval()
preds = model(X_test_tensor).detach().numpy().flatten()
preds = 1 / (1 + np.exp(-preds))  # BCEWithLogitsLoss 썼으니까 마지막에 sigmoid 적용

In [26]:
#결과 저장
df_sample['성공확률'] = preds
df_sample.to_csv('submission.csv', index=False)

print("제출 파일 저장 완료: submission.csv")

제출 파일 저장 완료: submission.csv
