In [1]:
# 기본 수학 / 배열 / 데이터프레임 처리
import numpy as np
import pandas as pd

# PyTorch - 딥러닝 프레임워크
import torch
import torch.nn as nn
import torch.optim as optim

# 데이터 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df_sample = pd.read_csv("data-files/sample_submission.csv")
df_test = pd.read_csv("data-files/test.csv")
df_train = pd.read_csv("data-files/train.csv")

In [3]:
df_train.columns

Index(['ID', '설립연도', '국가', '분야', '투자단계', '직원 수', '인수여부', '상장여부', '고객수(백만명)',
       '총 투자금(억원)', '연매출(억원)', 'SNS 팔로워 수(백만명)', '기업가치(백억원)', '성공확률'],
      dtype='object')

In [4]:
# 문자열 범위 처리 함수
def handle_range(x):
    if isinstance(x, str):
        x = x.replace(',', '')  # 쉼표 제거
        if '-' in x:
            low, high = x.split('-')
            return (float(low) + float(high)) / 2
        else:
            # '6000이상' 같은 경우 숫자만 추출
            num = ''.join(filter(str.isdigit, x))
            return float(num)
    else:
        return float(x)

# 총 투자금, 연매출, 기업가치 세 컬럼에 적용
for col in ['총 투자금(억원)', '연매출(억원)', '기업가치(백억원)']:
    df_train[col] = df_train[col].apply(handle_range)
    df_test[col] = df_test[col].apply(handle_range)

In [5]:
# 각 컬럼별 평균값으로 NaN 채우기
for col in ['총 투자금(억원)', '연매출(억원)', '기업가치(백억원)']:
    mean_value = df_train[col].mean()
    df_train[col] = df_train[col].fillna(mean_value)
    df_test[col] = df_test[col].fillna(mean_value)

In [6]:
# 순이익이 많을 수록 성공률 높을 것이라 예상

df_train['순이익'] = df_train['연매출(억원)'] - df_train['총 투자금(억원)']
df_test['순이익'] = df_test['연매출(억원)'] - df_test['총 투자금(억원)']

# 사용할 feature 고르기
features = ['순이익', '기업가치(백억원)']

X = df_train[features]
y = df_train['성공확률']
X_test = df_test[features]

# 스케일링
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 간단히 재정의
class SimpleMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 32)  # 입력이 2개니까
        self.fc2 = nn.Linear(32, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = SimpleMLP()

# 손실함수 + optimizer
criterion = nn.BCEWithLogitsLoss()  # !!! BCEWithLogitsLoss
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# 학습
epochs = 300
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    output = model(torch.FloatTensor(X_train))
    loss = criterion(output, torch.FloatTensor(y_train.values).view(-1, 1))
    loss.backward()
    optimizer.step()
    
    if epoch % 50 == 0:
        model.eval()
        val_output = model(torch.FloatTensor(X_val))
        val_loss = criterion(val_output, torch.FloatTensor(y_val.values).view(-1, 1))
        print(f"Epoch {epoch}, Train Loss: {loss.item()}, Val Loss: {val_loss.item()}")

Epoch 0, Train Loss: 0.691472589969635, Val Loss: 0.6914348602294922
Epoch 50, Train Loss: 0.6900725960731506, Val Loss: 0.6899082660675049
Epoch 100, Train Loss: 0.6899009943008423, Val Loss: 0.6902512311935425
Epoch 150, Train Loss: 0.6897849440574646, Val Loss: 0.6903378963470459
Epoch 200, Train Loss: 0.6896829009056091, Val Loss: 0.6903636455535889
Epoch 250, Train Loss: 0.689591646194458, Val Loss: 0.6904119253158569


In [7]:
# # 예측
# model.eval()
# preds = model(torch.FloatTensor(X_test))
# preds = torch.sigmoid(preds)  # BCEWithLogitsLoss 썼으면 반드시 sigmoid 필요
# preds = preds.detach().numpy().flatten()

In [8]:
# # sample_submission 채우기
# df_sample['성공확률'] = preds
# df_sample.to_csv('submission.csv', index=False)

In [9]:
# model.eval()

# # X_test를 torch로 변환
# X_test_tensor = torch.FloatTensor(X_test)

# # 모델에 넣고 예측
# preds = model(X_test_tensor)

# # BCEWithLogitsLoss 썼으니까 반드시 sigmoid 통과시켜야 함
# preds = torch.sigmoid(preds)

# # detach하고 numpy로 변환
# preds = preds.detach().numpy().flatten()

# # sample_submission에 넣기
# df_sample['성공확률'] = preds
# df_sample.to_csv('submission.csv', index=False)

In [10]:
# # (1) 모델을 평가 모드로
# model.eval()

# # (2) X_test를 Tensor로 변환
# X_test_tensor = torch.FloatTensor(X_test)

# # (3) 모델에 넣고 예측
# preds = model(X_test_tensor)

# # (4) 반드시 sigmoid 통과시키기
# preds = torch.sigmoid(preds)

# # (5) detach -> numpy 변환 -> flatten
# preds = preds.detach().numpy().flatten()

# # (6) preds 내용 한번 체크
# print("Preds min:", preds.min(), "Preds max:", preds.max())

# # (7) sample_submission에 넣기
# df_sample['성공확률'] = preds

# # (8) 저장
# df_sample.to_csv('submission.csv', index=False)

In [11]:
# 1. 모델 평가모드로
model.eval()

# 2. X_test 변환
X_test_tensor = torch.FloatTensor(X_test)

# 3. 예측
preds = model(X_test_tensor)

# 4. sigmoid 통과
preds = torch.sigmoid(preds)

# 5. detach + numpy 변환 + flatten
preds = preds.detach().numpy().flatten()

# 6. 예측 결과 직접 확인
print("Preds min:", preds.min(), "Preds max:", preds.max())

Preds min: 0.4866236 Preds max: 0.6022961


In [12]:
print(X_train.shape)

(3500, 2)


In [13]:
import torch

# numpy → Tensor 변환
X_train_tensor = torch.FloatTensor(X_train)

# NaN / Inf 검사
print("NaN 개수:", torch.isnan(X_train_tensor).sum().item())
print("Inf 개수:", torch.isinf(X_train_tensor).sum().item())

# min, max 값도 체크
print("최소값:", X_train_tensor.min().item())
print("최대값:", X_train_tensor.max().item())

NaN 개수: 0
Inf 개수: 0
최소값: -2.312941551208496
최대값: 2.656660556793213


In [14]:
print(torch.isnan(X_train_tensor).sum())
print(torch.isinf(X_train_tensor).sum())
print(X_train_tensor.min(), X_train_tensor.max())

tensor(0)
tensor(0)
tensor(-2.3129) tensor(2.6567)


In [15]:
print("NaN 개수:", torch.isnan(X_train_tensor).sum().item())
print("Inf 개수:", torch.isinf(X_train_tensor).sum().item())
print("최소값:", X_train_tensor.min().item())
print("최대값:", X_train_tensor.max().item())

NaN 개수: 0
Inf 개수: 0
최소값: -2.312941551208496
최대값: 2.656660556793213


In [16]:
# 순이익/기업가치 비율
df_train['순이익대비기업가치'] = df_train['순이익'] / (df_train['기업가치(백억원)'] + 1e-6)  # 0 나누기 방지
df_test['순이익대비기업가치'] = df_test['순이익'] / (df_test['기업가치(백억원)'] + 1e-6)

In [17]:
# 새로 쓸 feature 리스트
features = ['순이익', '기업가치(백억원)', '순이익대비기업가치']

X = df_train[features]
y = df_train['성공확률']
X_test = df_test[features]

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
import numpy as np

class SimpleMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(3, 32)  # 입력 차원 3개로 변경!!
        self.fc2 = nn.Linear(32, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold = 0
fold_losses = []

for train_idx, val_idx in kf.split(X):
    fold += 1
    print(f"\n===== Fold {fold} =====")

    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    model = SimpleMLP()
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0005)

    X_train_tensor = torch.FloatTensor(X_train_fold)
    y_train_tensor = torch.FloatTensor(y_train_fold.values).view(-1, 1)
    X_val_tensor = torch.FloatTensor(X_val_fold)
    y_val_tensor = torch.FloatTensor(y_val_fold.values).view(-1, 1)

    epochs = 300
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)
        loss.backward()
        optimizer.step()

    model.eval()
    val_output = model(X_val_tensor)
    val_loss = criterion(val_output, y_val_tensor)
    print(f"Fold {fold} Validation Loss: {val_loss.item():.4f}")
    fold_losses.append(val_loss.item())

print(f"\n===== 평균 Validation Loss: {np.mean(fold_losses):.4f} =====")



===== Fold 1 =====
Fold 1 Validation Loss: 0.6905

===== Fold 2 =====
Fold 2 Validation Loss: 0.6900

===== Fold 3 =====
Fold 3 Validation Loss: 0.6897

===== Fold 4 =====
Fold 4 Validation Loss: 0.6916

===== Fold 5 =====
Fold 5 Validation Loss: 0.6905

===== 평균 Validation Loss: 0.6904 =====


In [21]:
# 최종 모델 학습 (전체 데이터로)
model = SimpleMLP()
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

X_tensor = torch.FloatTensor(X)
y_tensor = torch.FloatTensor(y.values).view(-1, 1)

epochs = 300
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    output = model(X_tensor)
    loss = criterion(output, y_tensor)
    loss.backward()
    optimizer.step()

In [22]:
# Test 데이터 예측
model.eval()
X_test_tensor = torch.FloatTensor(X_test)
preds = model(X_test_tensor)
preds = torch.sigmoid(preds)  # BCEWithLogitsLoss 썼으니까 반드시 sigmoid
preds = preds.detach().numpy().flatten()

In [23]:
# sample_submission 채워서 저장
df_sample['성공확률'] = preds
df_sample.to_csv('submission.csv', index=False)
print("submission.csv 저장 완료")

submission.csv 저장 완료
