# 라이브러리 불러오기

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
import math
from tqdm import tqdm
import torch
from torch import nn, optim                         
from torch.utils.data import Dataset, DataLoader     
from torch.optim.lr_scheduler import _LRScheduler
from sklearn.preprocessing import MinMaxScaler
from MLP import TrainDataset, MLP, train, evaluation, CosineAnnealingWarmUpRestarts

# 랜덤 시드 설정

In [2]:
# Seed every random number generator this pipeline relies on.
# NumPy alone is not enough: model weight initialisation and the shuffling
# done by DataLoader(shuffle=True) draw from PyTorch's generator, so it is
# seeded as well to make runs repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# 데이터 불러오기

In [3]:
# Read the three competition CSV files in one pass: the training set, the
# test set and the submission template that is filled in at the end.
train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

# 대회에 필요한 데이터만 활용하기

In [4]:
# Restrict both frames to the columns this baseline uses. The training
# selection is the test selection plus the 'deposit' target column.
columns_needed_test = [
    'area_m2',
    'contract_year_month',
    'contract_day',
    'contract_type',
    'floor',
    'latitude',
    'longitude',
]
columns_needed = [*columns_needed_test, 'deposit']
train_data = train_data[columns_needed]
test_data = test_data[columns_needed_test]

# Holdout 데이터셋 설정 (예: 2023년 7월부터 12월까지의 데이터)

In [5]:
# Carve a holdout split out of the training data: rows whose
# contract_year_month falls inside [holdout_start, holdout_end] (both ends
# inclusive) go to the holdout set, every other row stays in training.
holdout_start = 202307
holdout_end = 202312
contract_month = train_data['contract_year_month']
in_holdout_window = (contract_month >= holdout_start) & (contract_month <= holdout_end)
holdout_data = train_data[in_holdout_window]
train_data = train_data[~in_holdout_window]

# 학습 데이터와 정답 데이터 분리

In [6]:
def _features_and_target(frame):
    """Split *frame* into NumPy features and a (n_rows, 1) 'deposit' target."""
    features = frame.drop(columns=['deposit']).to_numpy()
    target = frame['deposit'].to_numpy().reshape((-1, 1))
    return features, target


# Separate the inputs from the 'deposit' target for both labelled splits;
# the test frame is converted to a NumPy array as-is.
X_train, y_train = _features_and_target(train_data)
X_holdout, y_holdout = _features_and_target(holdout_data)
X_test = test_data.copy().to_numpy()

In [7]:
# Scaling: fit the min-max scaler on the training features only, then apply
# that same fitted transform to the training, holdout and test features.
scaler = MinMaxScaler().fit(X_train)
X_train, X_holdout, X_test = (
    scaler.transform(features) for features in (X_train, X_holdout, X_test)
)

# MLP 모델 훈련

In [8]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 256

# Training batches are drawn in shuffled order.
trainsets = TrainDataset(X_train, y_train)
trainloader = DataLoader(
    dataset=trainsets,
    batch_size=batch_size,
    shuffle=True,
)

# Holdout batches keep their original order.
validsets = TrainDataset(X_holdout, y_holdout)
validloader = DataLoader(
    dataset=validsets,
    batch_size=batch_size,
    shuffle=False,
)

In [9]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# The model's input size is the number of feature columns.
n_features = X_train.shape[1]
model = MLP(n_features)

# L1 loss with reduction='sum' adds up the absolute errors of a batch
# instead of averaging them.
criterion = nn.L1Loss(reduction='sum')

# The optimizer starts at lr=0.
# NOTE(review): presumably the scheduler takes over the learning rate
# (eta_max=0.1) -- confirm in the MLP module.
optimizer = optim.Adam(params=model.parameters(), lr=0)
scheduler = CosineAnnealingWarmUpRestarts(
    optimizer,
    T_0=50,
    T_mult=1,
    eta_max=0.1,
    T_up=10,
    gamma=0.5,
)
epochs = 140

cuda


In [None]:
# Name handed to train(); the evaluation cell later loads
# 'checkpoint/<checkpoint_name>_parameters.pt'.
# NOTE(review): presumably train() writes that file -- confirm in MLP module.
checkpoint_name = 'MLP-baseline'
train(
    model,
    trainloader,
    validloader,
    criterion,
    optimizer,
    epochs,
    device,
    checkpoint_name,
    scheduler,
)

# Holdout 데이터셋에 대한 성능 확인

In [None]:
# Rebuild the network with the same input width and restore the saved
# parameters. map_location='cpu' lets the checkpoint load even when it holds
# CUDA tensors and no GPU is available in this session; the freshly built
# model lives on the CPU at this point anyway.
mlp_model = MLP(X_train.shape[1])
checkpoint_path = f'checkpoint/{checkpoint_name}_parameters.pt'
state_dict = torch.load(checkpoint_path, map_location='cpu')
mlp_model.load_state_dict(state_dict)

In [None]:
# Score the restored model on the holdout split.
# X_holdout is a NumPy array, so it is converted to a float32 tensor on the
# same device as the model's parameters before the forward pass.
mlp_model.eval()
model_device = next(mlp_model.parameters()).device
holdout_tensor = torch.as_tensor(X_holdout, dtype=torch.float32).to(model_device)
with torch.no_grad():
    # Gradients are not needed for evaluation. The prediction is moved back
    # to a CPU NumPy array so scikit-learn can consume it.
    mlp_holdout_pred = mlp_model(holdout_tensor).cpu().numpy()
mlp_holdout_mae = mean_absolute_error(y_holdout, mlp_holdout_pred)
print("Holdout 데이터셋 성능:")
print(f"MLP MAE: {mlp_holdout_mae:.2f}")

# Sample Submission 제출하기

In [27]:
# Predict on the test features: build a float tensor on the target device,
# move the model there, switch it to evaluation mode and run one forward
# pass with gradient tracking disabled.
X_test_tensor = torch.FloatTensor(X_test).to(device)
mlp_model.to(device).eval()
with torch.no_grad():
    mlp_test_pred = mlp_model(X_test_tensor)

In [None]:
import os

# Flatten the prediction tensor to a plain Python list and store it in the
# 'deposit' column of the submission template.
mlp_test_pred = mlp_test_pred.flatten().tolist()
sample_submission['deposit'] = mlp_test_pred

# to_csv does not create missing directories, so make sure 'output/' exists.
os.makedirs('output', exist_ok=True)
sample_submission.to_csv('output/sample_submission_output.csv', index=False, encoding='utf-8-sig')