##### 베이스라인은 참고용 코드입니다. 여러분만의 고도화된 모델을 만들어 주세요.

## Import

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Hyperparameter Setting

In [3]:
# Global hyperparameters, looked up by key in the cells below.
CFG = dict(
    EPOCHS=10,           # number of passes over the training loader
    LEARNING_RATE=1e-3,  # learning rate handed to the optimizer
    BATCH_SIZE=128,      # samples per DataLoader batch
    SEED=41,             # random seed value
)

### 데이터 불러오기

In [4]:
# Load the training measurements and the submission template from the local data/ folder.
train_data = pd.read_csv("data/train_data.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")

In [5]:
# train data는 134개의 터빈의 1~200일 동안의 데이터를 포함하고 있습니다.

In [5]:
# Preview the first three rows of the training table.
train_data.head(3)

Unnamed: 0,TurbID,Day,Tmstamp,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv
0,1,1,00:00,,,,,,,,,,
1,1,1,00:10,6.17,-3.99,30.73,41.8,25.92,1.0,1.0,1.0,-0.25,494.66
2,1,1,00:20,6.27,-2.18,30.6,41.63,20.91,1.0,1.0,1.0,-0.24,509.76


In [6]:
# Preview the last three rows of the training table.
train_data.tail(3)

Unnamed: 0,TurbID,Day,Tmstamp,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv
3859197,134,200,23:30,4.23,-0.08,8.74,10.22,172.35,0.01,0.01,0.01,-111.2,220.6
3859198,134,200,23:40,4.36,-6.12,8.64,10.1,172.35,0.01,0.01,0.01,-108.04,222.15
3859199,134,200,23:50,4.79,-3.65,8.53,10.01,168.58,0.01,0.01,0.01,-119.76,264.17


In [8]:
# 결론적으로 134개 터빈의 201~202일, 즉 2일간의 Patv값을 예측해야 합니다.

In [7]:
# Preview the first three rows of the submission template.
sample_submission.head(3)

Unnamed: 0,index,TurbID,Day,Tmstamp,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv
0,0,1,201,00:00,0,0,0,0,0,0,0,0,0,9999
1,1,1,201,00:10,0,0,0,0,0,0,0,0,0,9999
2,2,1,201,00:20,0,0,0,0,0,0,0,0,0,9999


### 데이터 전처리

In [8]:
# Count the missing values in each column of train_data.
train_data.isnull().sum()

TurbID         0
Day            0
Tmstamp        0
Wspd       45587
Wdir       45587
Etmp       45587
Itmp       45587
Ndir       45587
Pab1       45587
Pab2       45587
Pab3       45587
Prtv       45587
Patv       45587
dtype: int64

In [9]:
# Fill missing values by copying the next valid observation backwards.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated in pandas 2.x; the filled values are the same.
# NOTE(review): the fill runs over the whole table, so a gap at the very end of
# one turbine's rows would be filled from the following turbine - confirm that
# this is acceptable.
train_data = train_data.bfill()

print(train_data.isnull().sum())

TurbID     0
Day        0
Tmstamp    0
Wspd       0
Wdir       0
Etmp       0
Itmp       0
Ndir       0
Pab1       0
Pab2       0
Pab3       0
Prtv       0
Patv       0
dtype: int64


In [10]:
def make_train_data(data, input_days=5, label_days=2, last_day=200):
    """Build sliding-window samples for every turbine.

    For each turbine and each start day ``j`` (1, 2, ...), the rows of the
    ``input_days`` consecutive days beginning at ``j`` form one input sample,
    and the ``Patv`` values of the following ``label_days`` days form its
    label. With the default arguments this yields 5-day inputs, 2-day labels
    and start days 1 through 194.

    Args:
        data: DataFrame with at least ``TurbID``, ``Day`` and ``Patv`` columns.
            Every column other than ``TurbID`` and ``Day`` is kept as a
            feature.
        input_days: Number of consecutive days in each input window.
        label_days: Number of consecutive days in each label window.
        last_day: Highest day number a label window may reach.

    Returns:
        Tuple ``(train_x, train_y)``: two equally long lists of NumPy arrays,
        one input array and one label array per window.

    Raises:
        ValueError: If ``input_days`` or ``label_days`` is smaller than 1.
    """
    if input_days < 1 or label_days < 1:
        raise ValueError("input_days and label_days must both be at least 1")

    # Last start day whose input window plus label window still ends on last_day.
    final_start = last_day - input_days - label_days + 1

    train_x, train_y = [], []
    for turbine_id in tqdm(sorted(pd.unique(data["TurbID"]))):
        turbine_rows = data[data["TurbID"] == turbine_id]
        days = turbine_rows["Day"]
        # The identifier columns are not features; drop them once per turbine
        # instead of once per window.
        features = turbine_rows.drop(["TurbID", "Day"], axis=1)
        target = turbine_rows["Patv"]

        for start in range(1, final_start + 1):
            label_start = start + input_days
            in_window = days.isin(list(range(start, label_start)))
            in_label = days.isin(list(range(label_start, label_start + label_days)))

            train_x.append(np.array(features[in_window]))
            train_y.append(np.array(target[in_label]))

    return train_x, train_y

In [11]:
# Encode each Tmstamp value as a 1-based integer, numbered in order of first appearance.
tms_list = list(pd.unique(train_data["Tmstamp"]))

# A dict lookup costs O(1) per row, whereas list.index() rescans tms_list for
# every row of the table; the resulting codes are identical.
tms_to_code = {stamp: code for code, stamp in enumerate(tms_list, start=1)}
train_data["Tmstamp"] = train_data["Tmstamp"].map(tms_to_code)

train_x, train_y = make_train_data(train_data)

100%|████████████████████████████████████████████████████████████████████████████████| 134/134 [00:23<00:00,  5.73it/s]


In [12]:
# Stack the per-window arrays: labels become one 2-D array, inputs become
# (samples, rows per window, features).
train_y = np.array(train_y)
train_x = np.array(train_x).reshape(-1, *train_x[0].shape)

In [13]:
# Display the shapes of the stacked input and label arrays.
train_x.shape, train_y.shape

((25996, 720, 11), (25996, 288))

### Custom Dataset

In [14]:
class CustomDataset(Dataset):
    """Index-based dataset over an input collection and an optional label collection.

    Items are converted to ``torch.Tensor`` on access. When ``Y`` is ``None``
    only the input tensor is returned; otherwise an ``(input, label)`` pair.
    """

    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of input samples."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the sample at ``index`` as tensor(s)."""
        features = torch.Tensor(self.X[index])
        if self.Y is None:
            return features
        return features, torch.Tensor(self.Y[index])

In [15]:
# Wrap the training arrays in a dataset and serve them in shuffled mini-batches.
train_dataset = CustomDataset(train_x, train_y)
train_loader = DataLoader(
    train_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=True,
    num_workers=0,
)

### 모델 선언

In [16]:
class BaseModel(nn.Module):
    """Single-direction GRU followed by a two-layer fully connected head.

    Takes batch-first sequences with 11 features per step and returns 288
    values per sequence. The head ends in a ReLU, so outputs are never
    negative.
    """

    def __init__(self):
        super().__init__()
        self.gru = nn.GRU(
            input_size=11,
            hidden_size=256,
            batch_first=True,
            bidirectional=False,
        )
        head_layers = [
            nn.Linear(256, 516),
            nn.ReLU(),
            nn.Linear(516, 288),
            nn.ReLU(),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Feed the GRU output of the final time step through the head."""
        sequence_out, _ = self.gru(x)
        final_step = sequence_out[:, -1, :]
        return self.classifier(final_step)

### 모델 학습

In [18]:
def train(model, optimizer, train_loader, device):
    """Fit ``model`` for ``CFG['EPOCHS']`` epochs using an MSE loss.

    After every epoch the mean training loss and mean training MAE are
    printed. Whenever an epoch's mean training MAE is lower than the best
    value seen so far, the model weights are written to ``./best_model.pth``.

    Args:
        model: Network to optimise; it is moved to ``device`` first.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the updates.
        train_loader: Iterable yielding ``(X, Y)`` tensor batches.
        device: Device on which the model and the batches are placed.
    """
    model.to(device)
    mse_loss = nn.MSELoss().to(device)
    mae_metric = nn.L1Loss().to(device)
    lowest_mae = 9999999

    for epoch in range(1, CFG['EPOCHS'] + 1):
        model.train()
        batch_losses, batch_maes = [], []

        for X, Y in tqdm(iter(train_loader)):
            X, Y = X.to(device), Y.to(device)

            optimizer.zero_grad()
            output = model(X)
            loss = mse_loss(output, Y)

            # MAE is only monitored, so it is computed without tracking gradients.
            with torch.no_grad():
                mae = mae_metric(output, Y)

            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())
            batch_maes.append(mae.item())

        epoch_loss = np.mean(batch_losses)
        epoch_mae = np.mean(batch_maes)
        print(f'Epoch : [{epoch}] Train Loss : [{epoch_loss:.5f}] Train MAE : [{epoch_mae:.5f}]')

        # Keep the weights of the epoch with the lowest mean training MAE so far.
        if lowest_mae > epoch_mae:
            lowest_mae = epoch_mae
            torch.save(model.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
            print('Model Saved.')

In [19]:
# Apply the configured seed before the model is built so that weight
# initialisation and DataLoader shuffling draw from seeded generators;
# CFG['SEED'] is not applied anywhere else in the visible notebook.
np.random.seed(CFG['SEED'])
torch.manual_seed(CFG['SEED'])

model = BaseModel()
# No model.eval() here: train() switches the model to training mode at the
# start of every epoch.
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
train(model, optimizer, train_loader, device)

100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:08<00:00, 23.76it/s]


Epoch : [1] Train Loss : [200248.41690] Train MAE : [339.42891]
Model Saved.


100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:07<00:00, 26.38it/s]


Epoch : [2] Train Loss : [167606.88925] Train MAE : [322.96548]
Model Saved.


100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:07<00:00, 26.54it/s]


Epoch : [3] Train Loss : [158414.08333] Train MAE : [311.89384]
Model Saved.


100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:07<00:00, 26.35it/s]


Epoch : [4] Train Loss : [150656.22128] Train MAE : [302.14064]
Model Saved.


100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:07<00:00, 26.44it/s]


Epoch : [5] Train Loss : [142849.72557] Train MAE : [291.87270]
Model Saved.


100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:07<00:00, 26.33it/s]


Epoch : [6] Train Loss : [136940.76225] Train MAE : [284.33144]
Model Saved.


100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:07<00:00, 26.20it/s]


Epoch : [7] Train Loss : [131553.04148] Train MAE : [276.79657]
Model Saved.


100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:07<00:00, 26.10it/s]


Epoch : [8] Train Loss : [127704.65958] Train MAE : [271.68076]
Model Saved.


100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:07<00:00, 25.67it/s]


Epoch : [9] Train Loss : [125673.76842] Train MAE : [268.74715]
Model Saved.


100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:07<00:00, 26.07it/s]

Epoch : [10] Train Loss : [122302.83517] Train MAE : [264.00558]
Model Saved.





### 201일, 202일 Patv 추론

In [26]:
# Take days 196-200 of the training table as the inference input.
test_data_list = list(range(196, 201))

test_data = train_data[train_data["Day"].isin(test_data_list)].drop(["TurbID", "Day"], axis = 1)

# Reshape to the same (rows per window, features) layout as the training samples.
test_data = np.array(test_data).reshape(-1, *train_x[0].shape)

In [27]:
# Display the shape of the inference input array.
test_data.shape

(134, 720, 11)

In [28]:
# Unlabeled dataset (Y is None); order is preserved because shuffling is off.
test_dataset = CustomDataset(test_data, None)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    num_workers=0,
)

In [29]:
def predict(model, test_loader, device):
    """Run ``model`` over every batch of ``test_loader`` and collect the outputs.

    The model is moved to ``device`` and put in evaluation mode; inference
    runs with gradient tracking disabled.

    Args:
        model: Network used for inference.
        test_loader: Iterable yielding input tensor batches.
        device: Device on which the model and the batches are placed.

    Returns:
        NumPy array holding the outputs of all batches, in loader order.
    """
    model.to(device)
    model.eval()
    collected = []
    with torch.no_grad():
        for batch in tqdm(iter(test_loader)):
            batch_out = model(batch.to(device))
            collected.extend(batch_out.cpu().tolist())

    return np.array(collected)

### Model Load

In [30]:
# Rebuild the architecture and restore the weights saved during training.
model = BaseModel()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a CUDA run can still be loaded on a CPU-only machine.
best_checkpoint = torch.load('./best_model.pth', map_location=device)
model.load_state_dict(best_checkpoint)
model.eval()

BaseModel(
  (gru): GRU(11, 256, batch_first=True)
  (classifier): Sequential(
    (0): Linear(in_features=256, out_features=516, bias=True)
    (1): ReLU()
    (2): Linear(in_features=516, out_features=288, bias=True)
    (3): ReLU()
  )
)

In [31]:
# Predict for every test sample and flatten the outputs into a single 1-D array.
preds = predict(model, test_loader, device).reshape(-1)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 79.01it/s]


### Submit

In [32]:
# Write the flattened predictions into the Patv column and save the result
# as ./submit.csv without the DataFrame index.
sample_submission["Patv"] = preds
sample_submission.to_csv("./submit.csv", index = False)