In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
import numpy as np
import random

def set_seed(val):
    torch.manual_seed(val)
    torch.cuda.manual_seed(val)
    torch.cuda.manual_seed_all(val)
    np.random.seed(val)
    random.seed(val)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_seed(42)

In [3]:
# Load data
merged_data = pd.read_csv('../data/merged_data.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')

merged_data['contract_date'] = pd.to_datetime(merged_data['contract_year_month'].astype(str) + merged_data['contract_day'].astype(str), format='%Y%m%d')
merged_data['contract_date'] = (merged_data['contract_date'] - pd.Timestamp('2019-01-01')).dt.days

In [4]:
train_data = merged_data[merged_data['_type'] == 'train'].drop(columns=['_type'])
test_data = merged_data[merged_data['_type'] == 'test'].drop(columns=['_type', 'deposit'])

In [5]:
class TrainDataset(Dataset):
    def __init__(self, data, scaler=None):
        self.data = data
        self.X = self.data.drop(columns=['deposit']).values
        self.y = self.data['deposit'].values
        self.scaler = scaler
        
        # batch_normalization으로 대체
        # if scaler is None:
        #     self.scaler = StandardScaler()
        #     self.X = self.scaler.fit_transform(self.X)
        # else:
        #     self.scaler = scaler
        #     self.X = self.scaler.transform(self.X)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.float32)
    
class TestDataset(Dataset):
    def __init__(self, data, scaler):
        self.data = data
        self.X = self.data.values
        
        # batch_normalization으로 대체
        # self.scaler = scaler
        # self.X = self.scaler.transform(self.X)
    
    def __len__(self):  
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.float32)

In [6]:
columns = ['latitude', 'longitude', 'cluster_kmeans', 'contract_date', 'deposit_mean', 'interest_rate', 'area_m2', 'floor', 'built_year', 'age', 'nearest_subway_distance_km', 'contract_type', ]
train_data = train_data[columns + ['deposit']]
test_data = test_data[columns]

In [7]:
train_data.head()

Unnamed: 0,latitude,longitude,cluster_kmeans,contract_date,deposit_mean,interest_rate,area_m2,floor,built_year,age,nearest_subway_distance_km,contract_type,deposit
0,37.054314,127.045216,0.0,175,31111.287554,1.92,84.9981,9,2019,0,0.716953,2,17000.0
1,37.054314,127.045216,0.0,450,32148.24631,1.63,84.9981,20,2019,1,0.716953,2,23000.0
2,37.054314,127.045216,0.0,452,32148.24631,1.63,84.9981,8,2019,1,0.716953,2,23000.0
3,36.964647,127.055847,1.0,195,31621.427691,1.94,59.34,1,1986,33,3.89728,2,5000.0
4,36.97239,127.084514,1.0,101,30447.425958,2.04,59.81,6,1995,24,2.039685,2,1800.0


In [8]:
from tqdm import tqdm
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

train_dataset = TrainDataset(train_data)
val_dataset = TrainDataset(val_data, scaler=train_dataset.scaler)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False)

# Define model
class Model(nn.Module):
    def __init__(self, input_size, location_feat_size, time_feat_size, location_emb_size=8, time_emb_size=4, num_heads=4):
        super().__init__()
        # 위치 정보를 embedding
        self.location_feat_size = location_feat_size
        self.time_feat_size = time_feat_size
        self.location_emb_size = location_emb_size
        self.time_emb_size = time_emb_size
        
        self.location_embedding = nn.Linear(location_feat_size, location_emb_size) 
        self.time_embedding = nn.Linear(time_feat_size, time_emb_size)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=(location_emb_size + time_emb_size), nhead=num_heads)
        
        # 시간 + 위치 정보를 이용한 trend
        self.fc_trend = nn.Sequential(
            nn.BatchNorm1d(location_emb_size + time_emb_size),
            nn.TransformerEncoder(self.encoder_layer, 2),
            nn.Linear(location_emb_size + time_emb_size, 1)
        )
        
        # 전체 feature를 이용한 residual
        self.fc_residual = nn.Sequential( 
            nn.BatchNorm1d(input_size),
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )
        self.out_layer = nn.Linear(2, 1)
        
    def forward(self, x):
        # location은 1 ~ 4번째 feature
        location = x[:, :self.location_feat_size]
        location = self.location_embedding(location)
        
        time = x[:, self.location_feat_size:self.location_feat_size + self.time_feat_size]
        time = self.time_embedding(time)
        
        trend = torch.cat([location.view(-1, self.location_emb_size), time.view(-1, self.time_emb_size)], dim=1)
        trend = self.fc_trend(trend)
        
        residual = x[:, :]
        residual = self.fc_residual(residual)
        return self.out_layer(torch.cat([trend, residual], dim=1))
        
        
    

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = Model(train_dataset.X.shape[1], 3, 3).to(device)
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

cuda




In [9]:
def train_model(model, train_loader, val_loader, criterion, optimizer, device, n_epochs, patience):
    best_val_loss = float('inf')
    counter = 0
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.2)
    for epoch in range(n_epochs):
        train_losses = []
        for data in tqdm(train_loader):
            optimizer.zero_grad()
            input_data, target = data
            input_data = input_data.to(device)
            target = target.to(device)
            input_data.to(device)
            output = model(input_data)
            loss = criterion(output, target.view(-1, 1))
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        # Evaluate model
        model.eval()

        val_losses = []
        val_preds = []

        with torch.no_grad():
            for i, data in enumerate(val_loader):
                input_data, target = data
                input_data = input_data.to(device)
                target = target.to(device)
                output = model(input_data)
                # MAE Loss
                loss = criterion(output, target.view(-1, 1))
                val_losses.append(loss.item())

        val_loss = sum(val_losses) / len(val_losses)
        print(f'Epoch {epoch}, Train Loss: {sum(train_losses) / len(train_losses)}, Val Loss: {val_loss}')
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print('Early stopping')
                break
        scheduler.step()
    
    return model

model.train()
model = train_model(model, train_loader, val_loader, criterion, optimizer, device, 100, 10)

100%|██████████| 5629/5629 [01:02<00:00, 90.10it/s] 


Epoch 0, Train Loss: 8196.18968507811, Val Loss: 8494.43469689109


100%|██████████| 5629/5629 [00:58<00:00, 95.97it/s] 


Epoch 1, Train Loss: 6630.5750952968165, Val Loss: 6603.5643286271525


100%|██████████| 5629/5629 [00:59<00:00, 94.72it/s] 


Epoch 2, Train Loss: 6210.182074864819, Val Loss: 5893.8629126115275


100%|██████████| 5629/5629 [00:58<00:00, 97.00it/s] 


Epoch 3, Train Loss: 5971.465205905662, Val Loss: 5723.914515755393


100%|██████████| 5629/5629 [00:57<00:00, 98.33it/s] 


Epoch 4, Train Loss: 5844.044268780395, Val Loss: 6134.9350901517


100%|██████████| 5629/5629 [00:58<00:00, 96.78it/s] 


Epoch 5, Train Loss: 5951.059501654726, Val Loss: 6061.31996605613


100%|██████████| 5629/5629 [00:58<00:00, 96.72it/s] 


Epoch 6, Train Loss: 5952.458088397556, Val Loss: 5655.932946985418


100%|██████████| 5629/5629 [00:59<00:00, 95.09it/s] 


Epoch 7, Train Loss: 5777.852764162845, Val Loss: 5687.5625624223185


100%|██████████| 5629/5629 [01:00<00:00, 93.38it/s] 


Epoch 8, Train Loss: 5634.677658040394, Val Loss: 5493.114978790283


100%|██████████| 5629/5629 [00:58<00:00, 96.48it/s] 


Epoch 9, Train Loss: 5596.574261861704, Val Loss: 5449.258501573043


100%|██████████| 5629/5629 [00:58<00:00, 96.75it/s] 


Epoch 10, Train Loss: 5193.162354686668, Val Loss: 5159.787808158181


100%|██████████| 5629/5629 [00:59<00:00, 94.93it/s] 


Epoch 11, Train Loss: 5153.374607093636, Val Loss: 5144.87651720914


100%|██████████| 5629/5629 [00:58<00:00, 95.59it/s] 


Epoch 12, Train Loss: 5130.955131819456, Val Loss: 5121.479549754749


100%|██████████| 5629/5629 [01:00<00:00, 92.38it/s] 


Epoch 13, Train Loss: 5114.345979404229, Val Loss: 5078.981213829734


100%|██████████| 5629/5629 [01:00<00:00, 93.48it/s] 


Epoch 14, Train Loss: 5101.330456935485, Val Loss: 5161.7626048001375


100%|██████████| 5629/5629 [00:56<00:00, 99.39it/s] 


Epoch 15, Train Loss: 5089.756331695377, Val Loss: 5122.78257595409


100%|██████████| 5629/5629 [00:57<00:00, 97.84it/s] 


Epoch 16, Train Loss: 5078.367513006376, Val Loss: 5027.623749472878


100%|██████████| 5629/5629 [00:57<00:00, 97.25it/s] 


Epoch 17, Train Loss: 5072.378507575168, Val Loss: 5061.655965111472


100%|██████████| 5629/5629 [00:58<00:00, 95.85it/s] 


Epoch 18, Train Loss: 5062.450348155201, Val Loss: 5030.614100542935


100%|██████████| 5629/5629 [00:59<00:00, 95.37it/s] 


Epoch 19, Train Loss: 5051.716404749331, Val Loss: 5037.456029545177


100%|██████████| 5629/5629 [00:57<00:00, 97.59it/s] 


Epoch 20, Train Loss: 4955.815617791584, Val Loss: 4982.010883504694


100%|██████████| 5629/5629 [00:57<00:00, 98.68it/s] 


Epoch 21, Train Loss: 4948.974578060463, Val Loss: 4957.41301276467


100%|██████████| 5629/5629 [00:57<00:00, 97.47it/s] 


Epoch 22, Train Loss: 4945.52230362749, Val Loss: 4950.281240463257


100%|██████████| 5629/5629 [00:56<00:00, 99.75it/s] 


Epoch 23, Train Loss: 4943.490449326312, Val Loss: 4950.6314300190315


100%|██████████| 5629/5629 [00:59<00:00, 94.60it/s] 


Epoch 24, Train Loss: 4940.018096246142, Val Loss: 4952.120396180587


100%|██████████| 5629/5629 [00:57<00:00, 97.38it/s] 


Epoch 25, Train Loss: 4939.847039891424, Val Loss: 4944.847608739679


100%|██████████| 5629/5629 [00:55<00:00, 101.35it/s]


Epoch 26, Train Loss: 4936.1618111062, Val Loss: 4938.983571486039


100%|██████████| 5629/5629 [00:58<00:00, 96.22it/s] 


Epoch 27, Train Loss: 4935.155651510662, Val Loss: 4964.644718343561


100%|██████████| 5629/5629 [00:56<00:00, 99.43it/s] 


Epoch 28, Train Loss: 4933.478637283279, Val Loss: 4945.5268870267


100%|██████████| 5629/5629 [00:58<00:00, 95.59it/s] 


Epoch 29, Train Loss: 4931.449315122441, Val Loss: 4953.796244361184


100%|██████████| 5629/5629 [00:57<00:00, 98.45it/s] 


Epoch 30, Train Loss: 4909.808480419132, Val Loss: 4927.731859207153


100%|██████████| 5629/5629 [00:56<00:00, 99.73it/s] 


Epoch 31, Train Loss: 4907.806970858917, Val Loss: 4924.344836495139


100%|██████████| 5629/5629 [00:56<00:00, 99.01it/s] 


Epoch 32, Train Loss: 4906.972236453036, Val Loss: 4923.690094687722


100%|██████████| 5629/5629 [00:56<00:00, 99.38it/s] 


Epoch 33, Train Loss: 4906.91544151069, Val Loss: 4923.242385344071


100%|██████████| 5629/5629 [00:56<00:00, 99.40it/s] 


Epoch 34, Train Loss: 4906.080199783279, Val Loss: 4923.340218110518


100%|██████████| 5629/5629 [00:56<00:00, 99.14it/s] 


Epoch 35, Train Loss: 4905.874412050042, Val Loss: 4926.039667129517


100%|██████████| 5629/5629 [00:56<00:00, 99.81it/s] 


Epoch 36, Train Loss: 4905.616796935721, Val Loss: 4923.098996075717


100%|██████████| 5629/5629 [00:57<00:00, 97.28it/s] 


Epoch 37, Train Loss: 4905.335985989824, Val Loss: 4926.409358804876


100%|██████████| 5629/5629 [00:56<00:00, 99.46it/s] 


Epoch 38, Train Loss: 4904.831395590913, Val Loss: 4922.737690838901


100%|██████████| 5629/5629 [00:56<00:00, 99.00it/s] 


Epoch 39, Train Loss: 4904.705868578525, Val Loss: 4921.155223152854


100%|██████████| 5629/5629 [00:56<00:00, 100.52it/s]


Epoch 40, Train Loss: 4899.521230519062, Val Loss: 4919.068464279175


100%|██████████| 5629/5629 [00:56<00:00, 99.54it/s] 


Epoch 41, Train Loss: 4899.096456191916, Val Loss: 4919.099783463912


100%|██████████| 5629/5629 [00:56<00:00, 99.86it/s] 


Epoch 42, Train Loss: 4898.982111418685, Val Loss: 4920.017231507735


100%|██████████| 5629/5629 [00:56<00:00, 99.57it/s] 


Epoch 43, Train Loss: 4898.8069596255855, Val Loss: 4919.522745999423


100%|██████████| 5629/5629 [00:56<00:00, 98.97it/s] 


Epoch 44, Train Loss: 4898.865471966463, Val Loss: 4918.965304808183


100%|██████████| 5629/5629 [01:00<00:00, 92.33it/s] 


Epoch 45, Train Loss: 4898.7482272155075, Val Loss: 4919.037095156583


100%|██████████| 5629/5629 [00:57<00:00, 98.53it/s] 


Epoch 46, Train Loss: 4898.539885265616, Val Loss: 4918.802045475353


100%|██████████| 5629/5629 [00:56<00:00, 99.35it/s] 


Epoch 47, Train Loss: 4898.535324619854, Val Loss: 4918.590598886663


100%|██████████| 5629/5629 [00:55<00:00, 100.68it/s]


Epoch 48, Train Loss: 4898.429060168325, Val Loss: 4918.947677612305


100%|██████████| 5629/5629 [00:56<00:00, 100.26it/s]


Epoch 49, Train Loss: 4898.396862621649, Val Loss: 4918.466516668146


100%|██████████| 5629/5629 [00:56<00:00, 100.22it/s]


Epoch 50, Train Loss: 4897.279394852202, Val Loss: 4918.253587375988


100%|██████████| 5629/5629 [00:56<00:00, 99.78it/s] 


Epoch 51, Train Loss: 4897.1783958403885, Val Loss: 4918.433774081143


100%|██████████| 5629/5629 [00:56<00:00, 99.18it/s] 


Epoch 52, Train Loss: 4897.174376866728, Val Loss: 4918.34296243841


100%|██████████| 5629/5629 [00:56<00:00, 99.47it/s] 


Epoch 53, Train Loss: 4897.10683991037, Val Loss: 4918.244430368597


100%|██████████| 5629/5629 [00:57<00:00, 97.58it/s] 


Epoch 54, Train Loss: 4897.129916555866, Val Loss: 4918.238059650768


100%|██████████| 5629/5629 [00:56<00:00, 99.32it/s] 


Epoch 55, Train Loss: 4897.111518007334, Val Loss: 4918.201437169855


100%|██████████| 5629/5629 [00:55<00:00, 100.60it/s]


Epoch 56, Train Loss: 4897.0600896914275, Val Loss: 4918.197960246693


100%|██████████| 5629/5629 [00:56<00:00, 98.77it/s] 


Epoch 57, Train Loss: 4897.031394732149, Val Loss: 4918.161724263971


100%|██████████| 5629/5629 [00:56<00:00, 99.23it/s] 


Epoch 58, Train Loss: 4897.0240516466065, Val Loss: 4918.305554996838


100%|██████████| 5629/5629 [00:56<00:00, 99.40it/s] 


Epoch 59, Train Loss: 4897.081766984797, Val Loss: 4918.16259765625


100%|██████████| 5629/5629 [00:58<00:00, 96.02it/s] 


Epoch 60, Train Loss: 4896.751552845407, Val Loss: 4918.21984204379


100%|██████████| 5629/5629 [00:56<00:00, 99.66it/s] 


Epoch 61, Train Loss: 4896.757765528194, Val Loss: 4918.24247637662


100%|██████████| 5629/5629 [00:57<00:00, 98.12it/s] 


Epoch 62, Train Loss: 4896.760639829732, Val Loss: 4918.184198206121


100%|██████████| 5629/5629 [00:57<00:00, 98.70it/s] 


Epoch 63, Train Loss: 4896.750073341943, Val Loss: 4918.163158069958


100%|██████████| 5629/5629 [00:57<00:00, 98.66it/s] 


Epoch 64, Train Loss: 4896.766645801874, Val Loss: 4918.230917497115


100%|██████████| 5629/5629 [00:57<00:00, 98.29it/s] 


Epoch 65, Train Loss: 4896.724766251291, Val Loss: 4918.210935592651


100%|██████████| 5629/5629 [00:56<00:00, 99.00it/s] 


Epoch 66, Train Loss: 4896.7044607688695, Val Loss: 4918.157781774347


100%|██████████| 5629/5629 [00:57<00:00, 98.50it/s] 


Epoch 67, Train Loss: 4896.730863521357, Val Loss: 4918.156154459173


100%|██████████| 5629/5629 [00:56<00:00, 99.00it/s] 


Epoch 68, Train Loss: 4896.730333472933, Val Loss: 4918.145912690597


100%|██████████| 5629/5629 [00:56<00:00, 99.38it/s] 


Epoch 69, Train Loss: 4896.744998305025, Val Loss: 4918.165717731823


100%|██████████| 5629/5629 [00:56<00:00, 99.09it/s] 


Epoch 70, Train Loss: 4896.661982468717, Val Loss: 4918.158741690896


100%|██████████| 5629/5629 [00:58<00:00, 96.48it/s] 


Epoch 71, Train Loss: 4896.658628516943, Val Loss: 4918.157067905773


100%|██████████| 5629/5629 [00:56<00:00, 99.40it/s] 


Epoch 72, Train Loss: 4896.650227806754, Val Loss: 4918.163762179288


100%|██████████| 5629/5629 [00:56<00:00, 100.21it/s]


Epoch 73, Train Loss: 4896.639027438127, Val Loss: 4918.146807237105


100%|██████████| 5629/5629 [00:57<00:00, 98.42it/s] 


Epoch 74, Train Loss: 4896.667969660811, Val Loss: 4918.15793835033


100%|██████████| 5629/5629 [00:57<00:00, 98.51it/s] 


Epoch 75, Train Loss: 4896.645421589093, Val Loss: 4918.174746600064


100%|██████████| 5629/5629 [00:57<00:00, 98.68it/s] 


Epoch 76, Train Loss: 4896.6698071128585, Val Loss: 4918.148500095715


100%|██████████| 5629/5629 [00:56<00:00, 98.89it/s] 


Epoch 77, Train Loss: 4896.692788687073, Val Loss: 4918.1515698866415


100%|██████████| 5629/5629 [00:57<00:00, 98.35it/s] 


Epoch 78, Train Loss: 4896.660907668783, Val Loss: 4918.144081462513


100%|██████████| 5629/5629 [00:56<00:00, 100.12it/s]


Epoch 79, Train Loss: 4896.69290761292, Val Loss: 4918.149802988226


100%|██████████| 5629/5629 [00:56<00:00, 100.01it/s]


Epoch 80, Train Loss: 4896.625745867175, Val Loss: 4918.149498852817


100%|██████████| 5629/5629 [00:57<00:00, 98.34it/s] 


Epoch 81, Train Loss: 4896.638987275715, Val Loss: 4918.148633089932


100%|██████████| 5629/5629 [00:59<00:00, 93.88it/s] 


Epoch 82, Train Loss: 4896.639411886964, Val Loss: 4918.149257659912


100%|██████████| 5629/5629 [00:59<00:00, 95.19it/s] 


Epoch 83, Train Loss: 4896.674630809411, Val Loss: 4918.1485467390585


100%|██████████| 5629/5629 [00:56<00:00, 98.83it/s] 


Epoch 84, Train Loss: 4896.653122189498, Val Loss: 4918.149154663086


100%|██████████| 5629/5629 [00:57<00:00, 98.26it/s] 


Epoch 85, Train Loss: 4896.651212913525, Val Loss: 4918.149893153797


100%|██████████| 5629/5629 [00:57<00:00, 98.42it/s] 


Epoch 86, Train Loss: 4896.655882596334, Val Loss: 4918.1494451002645


100%|██████████| 5629/5629 [00:56<00:00, 98.76it/s] 


Epoch 87, Train Loss: 4896.667454315473, Val Loss: 4918.149345571344


100%|██████████| 5629/5629 [00:59<00:00, 94.78it/s] 


Epoch 88, Train Loss: 4896.674307645119, Val Loss: 4918.150370164351
Early stopping


In [12]:
torch.save(model.state_dict(), './checkpoint/custom_mlp.pth')

In [10]:
checkpoint = torch.load('./checkpoint/custom_mlp.pth')
model.load_state_dict(checkpoint)

# Test model
test_dataset = TestDataset(test_data, None)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

model.eval()
test_preds = []

with torch.no_grad():
    for i, data in enumerate(test_loader):
        input_data = data
        input_data = input_data.to(device)
        output = model(input_data)
        test_preds.append(output.cpu().numpy())
        
test_preds = np.concatenate(test_preds).flatten()

In [22]:
sample_submission['deposit'] = test_preds
sample_submission['deposit'] = sample_submission['deposit'].apply(lambda x: 1000 if x < 0 else x)
sample_submission.to_csv('mlp_submission.csv', index=False)

In [23]:
sample_submission.describe()

Unnamed: 0,index,deposit
count,150172.0,150172.0
mean,75085.5,41654.463958
std,43351.06665,25865.791051
min,0.0,695.305908
25%,37542.75,24341.23877
50%,75085.5,35964.191406
75%,112628.25,52266.271484
max,150171.0,715633.375
