In [1]:
# library import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

In [2]:
# gpu 사용하기
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cpu


In [3]:
# 데이터 가져오기
train_data = pd.read_csv("train.csv")
train_data_drop = train_data.dropna()

# 데이터 쪼개기 => 결측치 제거한 값으로
idx1to26_drop = train_data_drop.iloc[:, 1:27]
idx28to40_drop = train_data_drop.iloc[:, 28:41]
idx41to56_drop = train_data_drop.iloc[:, 41:57]
idx57to68_drop = train_data_drop.iloc[:, 57:69]
target_drop = train_data_drop.iloc[:, 69]

In [4]:
from sklearn.model_selection import train_test_split
main_train_input, main_test_input, main_train_target, main_test_target = train_test_split(idx1to26_drop, target_drop)

In [5]:
# CustomDataset
class CustomDataset(Dataset):
  def __init__(self, input, target):
    self.input = input
    self.target = target
  
  def __len__(self):
    return len(self.target)

  def __getitem__(self, idx):
    sample = torch.tensor(self.input.iloc[idx, :]).float()
    label = torch.tensor(self.target.iloc[idx]).float()
    return sample, label

In [6]:
train_dataset = CustomDataset(main_train_input, main_train_target)
test_dataset = CustomDataset(main_test_input, main_test_target)

train_loader = DataLoader(train_dataset, batch_size=128)
test_loader = DataLoader(test_dataset, batch_size=128)

In [7]:
class NN(nn.Module):
  def __init__(self):
    # nn 모듈을 사용하는 경우에는 모두 init안에 정의를 해 주어야 한다.
    
    # 신경망의 구조
    # 여러분의 아이디어가 필요한 부분입니다!
    # 어쩌면 선형 모델을 이용한건 0.72가 최선일 수 있습니다.
    # 조금 더 시도해 보다가 안된다면, 그냥 Ensemble Learning의 1개의 요소로만 쓰고, 다른 모델에 더 투자해 보죠!
    
    # 고려할 부분
    # 신경망의 개수
    # 줄어드는 차원의 방식
    # dropout
    super(NN, self).__init__()
    self.fc1 = nn.Linear(in_features=26, out_features=20)
    self.fc2 = nn.Linear(in_features=20, out_features=16)
    self.fc3 = nn.Linear(in_features=16, out_features=8)
    self.fc4 = nn.Linear(in_features=8, out_features=4)
    self.fc5 = nn.Linear(in_features=4, out_features=1)
    self.drop = nn.Dropout(0.3)

  def forward(self, input_data):
    out = F.relu(self.fc1(input_data))
    out = self.drop(out)
    out = F.relu(self.fc2(out))
    out = F.relu(self.fc3(out))
    out = F.relu(self.fc4(out))
    out = torch.sigmoid(self.fc5(out))
    return out

# 모델 객체 생성
model_main = NN().to(device)
print(model_main)

NN(
  (fc1): Linear(in_features=26, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=8, bias=True)
  (fc4): Linear(in_features=8, out_features=4, bias=True)
  (fc5): Linear(in_features=4, out_features=1, bias=True)
  (drop): Dropout(p=0.3, inplace=False)
)


In [8]:
# loss = Binary Cross Entropy Loss
# optimizer = adam

learning_rate = 0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model_main.parameters(), lr=learning_rate)

In [9]:
# epoch는 200으로 한다.
epochs = 200
aggregated_losses = []
for i in range(epochs):
    for x, y in train_loader:
        x = x.to(device); y = y.to(device)
        output = model_main(x).view(-1)
        loss = criterion(output, y)
        
        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [10]:
# 모델 평가 코드
! pip install torchmetrics
import torchmetrics

metrics = torchmetrics.Accuracy()

predict = torch.round(model_main(torch.Tensor(main_test_input.to_numpy()).to(device)))
target = torch.tensor(main_test_target.to_numpy())

acc = torchmetrics.functional.accuracy(predict, target.to(device))
print(acc.item())

0.7193037867546082


In [11]:
# 제출용 csv 파일 만들기
test_data = pd.read_csv('test.csv')
test_data_rpm = test_data.fillna(test_data.mean())
idx1to26_rpm = test_data_rpm.iloc[:, 1:27]

submission = pd.read_csv('sample_submission.csv')
predict = torch.round(model_main(torch.Tensor(idx1to26_rpm.to_numpy()).to(device)))
submission["nerdiness"] = predict.detach().numpy().astype(np.int32)

submission.to_csv("submission1.csv", index = False)

  test_data_rpm = test_data.fillna(test_data.mean())
