In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df_sample = pd.read_csv("data-files/sample_submission.csv")
df_test = pd.read_csv("data-files/test.csv")
df_train = pd.read_csv("data-files/train.csv")

In [6]:
# 기업가치 전처리
def parse_valuation(v):
    if pd.isna(v):
        return np.nan
    v = str(v).replace(",", "")
    if '-' in v:
        low, high = v.split('-')
        return (float(low) + float(high)) / 2
    else:
        return float(''.join(filter(str.isdigit, v)))
    
df_train['기업가치(백억원)'] = df_train['기업가치(백억원)'].apply(parse_valuation)
df_test['기업가치(백억원)'] = df_test['기업가치(백억원)'].apply(parse_valuation)

In [8]:
# Feature 설정 + Log 변환
features = ['총 투자금(억원)', '연매출(억원)', 'SNS 팔로워 수(백만명)', '기업가치(백억원)']
target = '성공확률'

for col in features:
    df_train[col] = df_train[col].apply(lambda x: np.log1p(x) if pd.notna(x) else 0)
    df_test[col] = df_test[col].apply(lambda x: np.log1p(x) if pd.notna(x) else 0)

df_train[features] = df_train[features].fillna(0)
df_test[features] = df_test[features].fillna(0)

In [9]:
# 데이터 나누기
X = df_train[features]
y = df_train[target]
X_test = df_test[features]

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# 모델 정의 (회귀)
class SimpleMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(len(features), 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

model = SimpleMLP()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [11]:
# 텐서 변환
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)
X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val.values).view(-1, 1)
X_test_tensor = torch.FloatTensor(X_test)

In [12]:
# 학습
epochs = 300
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    output = model(X_train_tensor)
    loss = criterion(output, y_train_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 50 == 0:
        model.eval()
        val_output = model(X_val_tensor)
        val_loss = criterion(val_output, y_val_tensor)
        print(f"Epoch {epoch}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

Epoch 0, Train Loss: 0.4416, Val Loss: 0.4122
Epoch 50, Train Loss: 0.0652, Val Loss: 0.0657
Epoch 100, Train Loss: 0.0596, Val Loss: 0.0605
Epoch 150, Train Loss: 0.0584, Val Loss: 0.0595
Epoch 200, Train Loss: 0.0581, Val Loss: 0.0594
Epoch 250, Train Loss: 0.0579, Val Loss: 0.0594


In [14]:
# 예측 및 저장
model.eval()
preds = model(X_test_tensor).detach().numpy().flatten()
df_sample['성공확률'] = preds
df_sample.to_csv('submission.csv', index=False)