In [1]:
# 기본 수학 / 배열 / 데이터프레임 처리
import numpy as np
import pandas as pd

# PyTorch - 딥러닝 프레임워크
import torch
import torch.nn as nn
import torch.optim as optim

# 데이터 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
df_sample = pd.read_csv("data-files/sample_submission.csv")
df_test = pd.read_csv("data-files/test.csv")
df_train = pd.read_csv("data-files/train.csv")

In [3]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1755 entries, 0 to 1754
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      1755 non-null   object
 1   성공확률    1755 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 27.6+ KB


In [4]:
df_sample.head()

Unnamed: 0,ID,성공확률
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4376 entries, 0 to 4375
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              4376 non-null   object 
 1   설립연도            4376 non-null   int64  
 2   국가              4376 non-null   object 
 3   분야              3519 non-null   object 
 4   투자단계            4376 non-null   object 
 5   직원 수            4202 non-null   float64
 6   인수여부            4376 non-null   object 
 7   상장여부            4376 non-null   object 
 8   고객수(백만명)        3056 non-null   float64
 9   총 투자금(억원)       4376 non-null   float64
 10  연매출(억원)         4376 non-null   float64
 11  SNS 팔로워 수(백만명)  4376 non-null   float64
 12  기업가치(백억원)       3156 non-null   object 
 13  성공확률            4376 non-null   float64
dtypes: float64(6), int64(1), object(7)
memory usage: 478.8+ KB


In [6]:
df_train.head()

Unnamed: 0,ID,설립연도,국가,분야,투자단계,직원 수,인수여부,상장여부,고객수(백만명),총 투자금(억원),연매출(억원),SNS 팔로워 수(백만명),기업가치(백억원),성공확률
0,TRAIN_0000,2009,CT005,이커머스,Series A,4126.0,No,No,56.0,3365.0,4764.0,4.71,,0.3
1,TRAIN_0001,2023,CT006,핀테크,Seed,4167.0,Yes,No,80.0,4069.0,279.0,1.0,2500-3500,0.8
2,TRAIN_0002,2018,CT007,기술,Series A,3132.0,Yes,Yes,54.0,6453.0,12141.0,4.0,3500-4500,0.5
3,TRAIN_0003,2016,CT006,,Seed,3245.0,Yes,Yes,,665.0,10547.0,2.97,,0.7
4,TRAIN_0004,2020,CT002,에듀테크,Seed,1969.0,No,Yes,94.0,829.0,9810.0,1.0,1500-2500,0.1


In [7]:
df_train.columns

Index(['ID', '설립연도', '국가', '분야', '투자단계', '직원 수', '인수여부', '상장여부', '고객수(백만명)',
       '총 투자금(억원)', '연매출(억원)', 'SNS 팔로워 수(백만명)', '기업가치(백억원)', '성공확률'],
      dtype='object')

In [8]:
df_sample.columns

Index(['ID', '성공확률'], dtype='object')

In [9]:
df_test.columns

Index(['ID', '설립연도', '국가', '분야', '투자단계', '직원 수', '인수여부', '상장여부', '고객수(백만명)',
       '총 투자금(억원)', '연매출(억원)', 'SNS 팔로워 수(백만명)', '기업가치(백억원)'],
      dtype='object')

In [10]:
# 문자열 범위 처리 함수
def handle_range(x):
    if isinstance(x, str):
        x = x.replace(',', '')  # 쉼표 제거
        if '-' in x:
            low, high = x.split('-')
            return (float(low) + float(high)) / 2
        else:
            # '6000이상' 같은 경우 숫자만 추출
            num = ''.join(filter(str.isdigit, x))
            return float(num)
    else:
        return float(x)

# 총 투자금, 연매출, 기업가치 세 컬럼에 적용
for col in ['총 투자금(억원)', '연매출(억원)', '기업가치(백억원)']:
    df_train[col] = df_train[col].apply(handle_range)
    df_test[col] = df_test[col].apply(handle_range)

In [11]:
# 각 컬럼별 평균값으로 NaN 채우기
for col in ['총 투자금(억원)', '연매출(억원)', '기업가치(백억원)']:
    mean_value = df_train[col].mean()
    df_train[col] = df_train[col].fillna(mean_value)
    df_test[col] = df_test[col].fillna(mean_value)

In [12]:
# 순이익이 많을 수록 성공률 높을 것이라 예상

df_train['순이익'] = df_train['연매출(억원)'] - df_train['총 투자금(억원)']
df_test['순이익'] = df_test['연매출(억원)'] - df_test['총 투자금(억원)']

# 사용할 feature 고르기
features = ['순이익', '기업가치(백억원)']

X = df_train[features]
y = df_train['성공확률']
X_test = df_test[features]

# 스케일링
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 간단히 재정의
class SimpleMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 32)  # 입력이 2개니까
        self.fc2 = nn.Linear(32, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = SimpleMLP()

# 손실함수 + optimizer
criterion = nn.BCEWithLogitsLoss()  # !!! BCEWithLogitsLoss
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# 학습
epochs = 300
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    output = model(torch.FloatTensor(X_train))
    loss = criterion(output, torch.FloatTensor(y_train.values).view(-1, 1))
    loss.backward()
    optimizer.step()
    
    if epoch % 50 == 0:
        model.eval()
        val_output = model(torch.FloatTensor(X_val))
        val_loss = criterion(val_output, torch.FloatTensor(y_val.values).view(-1, 1))
        print(f"Epoch {epoch}, Train Loss: {loss.item()}, Val Loss: {val_loss.item()}")

Epoch 0, Train Loss: 0.697776734828949, Val Loss: 0.7031059861183167
Epoch 50, Train Loss: 0.6914556622505188, Val Loss: 0.6928426027297974
Epoch 100, Train Loss: 0.690337598323822, Val Loss: 0.6904001235961914
Epoch 150, Train Loss: 0.6901139616966248, Val Loss: 0.6900559663772583
Epoch 200, Train Loss: 0.6899905204772949, Val Loss: 0.6900949478149414
Epoch 250, Train Loss: 0.6898946762084961, Val Loss: 0.6901707053184509


In [13]:
# # 예측
# model.eval()
# preds = model(torch.FloatTensor(X_test))
# preds = torch.sigmoid(preds)  # BCEWithLogitsLoss 썼으면 반드시 sigmoid 필요
# preds = preds.detach().numpy().flatten()

In [14]:
# # sample_submission 채우기
# df_sample['성공확률'] = preds
# df_sample.to_csv('submission.csv', index=False)

In [15]:
# model.eval()

# # X_test를 torch로 변환
# X_test_tensor = torch.FloatTensor(X_test)

# # 모델에 넣고 예측
# preds = model(X_test_tensor)

# # BCEWithLogitsLoss 썼으니까 반드시 sigmoid 통과시켜야 함
# preds = torch.sigmoid(preds)

# # detach하고 numpy로 변환
# preds = preds.detach().numpy().flatten()

# # sample_submission에 넣기
# df_sample['성공확률'] = preds
# df_sample.to_csv('submission.csv', index=False)

In [16]:
# # (1) 모델을 평가 모드로
# model.eval()

# # (2) X_test를 Tensor로 변환
# X_test_tensor = torch.FloatTensor(X_test)

# # (3) 모델에 넣고 예측
# preds = model(X_test_tensor)

# # (4) 반드시 sigmoid 통과시키기
# preds = torch.sigmoid(preds)

# # (5) detach -> numpy 변환 -> flatten
# preds = preds.detach().numpy().flatten()

# # (6) preds 내용 한번 체크
# print("Preds min:", preds.min(), "Preds max:", preds.max())

# # (7) sample_submission에 넣기
# df_sample['성공확률'] = preds

# # (8) 저장
# df_sample.to_csv('submission.csv', index=False)

In [17]:
# 1. 모델 평가모드로
model.eval()

# 2. X_test 변환
X_test_tensor = torch.FloatTensor(X_test)

# 3. 예측
preds = model(X_test_tensor)

# 4. sigmoid 통과
preds = torch.sigmoid(preds)

# 5. detach + numpy 변환 + flatten
preds = preds.detach().numpy().flatten()

# 6. 예측 결과 직접 확인
print("Preds min:", preds.min(), "Preds max:", preds.max())

Preds min: 0.5034465 Preds max: 0.58558786


In [18]:
print(X_train.shape)

(3500, 2)


In [19]:
import torch

# numpy → Tensor 변환
X_train_tensor = torch.FloatTensor(X_train)

# NaN / Inf 검사
print("NaN 개수:", torch.isnan(X_train_tensor).sum().item())
print("Inf 개수:", torch.isinf(X_train_tensor).sum().item())

# min, max 값도 체크
print("최소값:", X_train_tensor.min().item())
print("최대값:", X_train_tensor.max().item())

NaN 개수: 0
Inf 개수: 0
최소값: -2.312941551208496
최대값: 2.656660556793213


In [20]:
print(torch.isnan(X_train_tensor).sum())
print(torch.isinf(X_train_tensor).sum())
print(X_train_tensor.min(), X_train_tensor.max())

tensor(0)
tensor(0)
tensor(-2.3129) tensor(2.6567)


In [21]:
print("NaN 개수:", torch.isnan(X_train_tensor).sum().item())
print("Inf 개수:", torch.isinf(X_train_tensor).sum().item())
print("최소값:", X_train_tensor.min().item())
print("최대값:", X_train_tensor.max().item())

NaN 개수: 0
Inf 개수: 0
최소값: -2.312941551208496
최대값: 2.656660556793213


In [22]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
import torch.nn as nn
import torch.optim as optim

class SimpleMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 32)
        self.fc2 = nn.Linear(32, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = SimpleMLP()

# 손실함수와 optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Tensor 변환
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)
X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val.values).view(-1, 1)

In [24]:
epochs = 300
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    output = model(X_train_tensor)
    loss = criterion(output, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if epoch % 50 == 0:
        model.eval()
        val_output = model(X_val_tensor)
        val_loss = criterion(val_output, y_val_tensor)
        print(f"Epoch {epoch}, Train Loss: {loss.item()}, Val Loss: {val_loss.item()}")


Epoch 0, Train Loss: 0.6991621255874634, Val Loss: 0.7038846611976624
Epoch 50, Train Loss: 0.6927038431167603, Val Loss: 0.6928607821464539
Epoch 100, Train Loss: 0.6911978721618652, Val Loss: 0.6904335618019104
Epoch 150, Train Loss: 0.690545916557312, Val Loss: 0.6899673342704773
Epoch 200, Train Loss: 0.6901723742485046, Val Loss: 0.6899319291114807
Epoch 250, Train Loss: 0.6899355053901672, Val Loss: 0.6900262832641602


In [25]:
model.eval()
X_test_tensor = torch.FloatTensor(X_test)
preds = model(X_test_tensor)
preds = torch.sigmoid(preds)
preds = preds.detach().numpy().flatten()

df_sample['성공확률'] = preds
df_sample.to_csv('submission.csv', index=False)