In [1]:
import os
import glob
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# -----------------------------
# Settings
# -----------------------------
# Input locations (paths under /content/drive) and the name of the submission CSV written at the end.
TRACK_DIR = '/content/drive/MyDrive/25년 해군 AI 경진대회/nk_dataset/mmsi_tracks'
MAPPING_FILE = '/content/drive/MyDrive/25년 해군 AI 경진대회/nk_dataset/Anonymized_MMSI_list-매핑용.csv'
VALID_FILE = '/content/drive/MyDrive/25년 해군 AI 경진대회/2번문제/Anonymized_MMSI_list_validation(2번문제)_수정.csv'
OUTPUT_FILE = 'submission_nkmodel2.csv'

SEQ_LEN = 30           # sequence length (rows per sliding window)
STEP = 5               # sliding-window stride
RESAMPLE_FREQ = '10s'   # resampling period
MAX_SEQ_PER_VESSEL = 100  # upper bound on windows kept per vessel when building the dataset
LATENT_DIM = 16  # hidden size of the GRU encoder
BATCH_SIZE = 128  # batch size for every DataLoader
EPOCHS = 20  # maximum number of training epochs
LR = 1e-3  # Adam learning rate
PATIENCE = 3  # epochs without validation improvement before early stopping
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # GPU when available, otherwise CPU


In [2]:
# -----------------------------
# Load mapping
# -----------------------------
mapping_df = pd.read_csv(MAPPING_FILE)
# Lookup from anonymized MMSI to MMSI; both sides are kept as strings.
mapping = dict(zip(mapping_df['Anonymized_MMSI'].astype(str), mapping_df['MMSI'].astype(str)))
# glob returns paths in arbitrary, filesystem-dependent order; sort so that the
# dataset is built in the same order on every run.
files = sorted(glob.glob(os.path.join(TRACK_DIR, '*.csv')))
# Anonymized id of each track file = file name without the .csv extension.
anon_ids = [os.path.splitext(os.path.basename(f))[0] for f in files]

In [3]:
# -----------------------------
# Dataset definition
# -----------------------------
class TrackSeqDataset(Dataset):
    """Sliding windows cut from per-vessel track CSV files.

    Each CSV must provide the columns ``timestamp``, ``sog``, ``cog``,
    ``latitude`` and ``longitude``. The vessel id is the file name without
    its extension. Every track is resampled to ``resample_freq`` (mean per
    bin), gaps are interpolated, and remaining NaN rows are dropped. A single
    ``StandardScaler`` is fitted on the rows of all vessels and exposed as
    ``self.scaler``.

    Args:
        files: paths of the track CSV files.
        seq_len: number of rows per window.
        step: stride between consecutive window starts.
        resample_freq: pandas offset alias used for resampling (e.g. ``'10s'``).
        max_per_id: maximum number of windows kept per vessel; vessels with
            more windows are randomly subsampled.
        seed: optional seed for the subsampling. ``None`` (default) draws from
            the global ``random`` state.

    Raises:
        ValueError: if no files are given, or no file has any usable row.
    """

    def __init__(self, files, seq_len, step, resample_freq, max_per_id, seed=None):
        self.seq_len = seq_len
        self.step = step
        self.sequences = []
        self.seq_ids = []
        # A private generator keeps a seeded run independent of the global state.
        rng = random if seed is None else random.Random(seed)

        # 1) Load the raw data of every vessel
        raw_data = {}
        for f in files:
            aid = os.path.splitext(os.path.basename(f))[0]
            df = pd.read_csv(f, parse_dates=['timestamp'])
            df = df.sort_values('timestamp').set_index('timestamp')
            df = df[['sog','cog','latitude','longitude']]
            df = df.resample(resample_freq).mean().interpolate().dropna()
            raw_data[aid] = df.values
        if not raw_data:
            raise ValueError("No track files to load; check the track directory path.")

        # 2) Fit the global scaler
        concat = np.vstack(list(raw_data.values()))
        if len(concat) == 0:
            raise ValueError("Every track file is empty after resampling; nothing to fit the scaler on.")
        self.scaler = StandardScaler().fit(concat)

        # 3) Build the windows and subsample
        for aid, arr in raw_data.items():
            # Tracks shorter than one window yield nothing, and a zero-row
            # array would make scaler.transform raise, so skip them up front.
            if len(arr) < seq_len:
                continue
            normed = self.scaler.transform(arr)
            seqs = [normed[i:i+seq_len] for i in range(0, len(normed) - seq_len + 1, step)]
            if len(seqs) > max_per_id:
                seqs = rng.sample(seqs, max_per_id)
            self.sequences.extend(seqs)
            self.seq_ids.extend([aid] * len(seqs))

    def __len__(self):
        """Return the total number of windows over all vessels."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(window as float32 tensor, vessel id)`` for window ``idx``."""
        return torch.tensor(self.sequences[idx], dtype=torch.float32), self.seq_ids[idx]

In [4]:
# -----------------------------
# GRU Autoencoder
# -----------------------------
class GRUAutoencoder(nn.Module):
    """Sequence autoencoder: GRU encoder, GRU decoder, linear output layer.

    The encoder's final hidden state is the latent code. It is fed to the
    decoder at every time step, and the decoder outputs are projected back to
    the feature dimension by a linear layer.

    The linear layer matters: a GRU hidden state is a convex blend of tanh
    outputs and therefore stays inside (-1, 1). Using it directly as the
    reconstruction cannot reproduce standardized inputs whose magnitude
    exceeds 1.

    Note: the parameter layout includes ``output_proj``, so checkpoints saved
    from a decoder without that projection cannot be loaded into this class.

    Args:
        seq_len: window length; accepted for interface compatibility, the
            network itself works for any sequence length.
        n_feat: number of features per time step.
        latent_dim: size of the latent code (GRU hidden size).
    """

    def __init__(self, seq_len, n_feat, latent_dim):
        super().__init__()
        self.encoder = nn.GRU(n_feat, latent_dim, batch_first=True)
        self.decoder = nn.GRU(latent_dim, latent_dim, batch_first=True)
        self.output_proj = nn.Linear(latent_dim, n_feat)

    def forward(self, x):
        """Reconstruct ``x`` of shape (batch, time, n_feat); output has the same shape."""
        _, h = self.encoder(x)
        # h is (num_layers, batch, latent); take the last layer and repeat it
        # along the time axis -> (batch, time, latent).
        latent = h[-1].unsqueeze(1).repeat(1, x.size(1), 1)
        dec_out, _ = self.decoder(latent)
        return self.output_proj(dec_out)

In [5]:
# -----------------------------
# Training with early stopping
# -----------------------------
def train_model(dataset):
    """Train a GRUAutoencoder on ``dataset`` and return the best model.

    The dataset is split 80/20 into train and validation parts. Training runs
    for at most ``EPOCHS`` epochs and stops early after ``PATIENCE`` epochs
    without a new best validation loss. The best weights are kept in memory
    and also written to ``best_gru_ae.pth``.

    Args:
        dataset: dataset yielding ``(window, vessel_id)`` pairs and exposing a
            fitted ``scaler`` attribute.

    Returns:
        ``(model, scaler)``: the model loaded with the best weights, and
        ``dataset.scaler``.

    Raises:
        ValueError: if the dataset has fewer than two samples.
        RuntimeError: if no epoch produced a finite, improving validation loss.
    """
    n_total = len(dataset)
    if n_total < 2:
        raise ValueError(f"Need at least 2 sequences to train and validate, got {n_total}.")

    # Train/val split. Keep at least one validation sample so the validation
    # loader is never empty (which would divide by zero below).
    val_size = max(1, int(0.2 * n_total))
    train_size = n_total - val_size
    train_ds, val_ds = random_split(dataset, [train_size, val_size])
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                              num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False,
                            num_workers=4, pin_memory=True)

    model = GRUAutoencoder(SEQ_LEN, 4, LATENT_DIM).to(DEVICE)
    optim = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = nn.MSELoss()
    best_val = float('inf')
    best_state = None
    patience_cnt = 0

    for epoch in range(1, EPOCHS+1):
        model.train()
        train_loss = 0
        for seqs, _ in train_loader:
            seqs = seqs.to(DEVICE)
            optim.zero_grad()
            recon = model(seqs)
            loss = criterion(recon, seqs)
            loss.backward()
            optim.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for seqs, _ in val_loader:
                seqs = seqs.to(DEVICE)
                recon = model(seqs)
                val_loss += criterion(recon, seqs).item()
        val_loss /= len(val_loader)

        print(f"Epoch {epoch}: Train Loss={train_loss:.6f}, Val Loss={val_loss:.6f}")
        # Early stopping
        if val_loss < best_val:
            best_val = val_loss
            # Snapshot the weights: state_dict() returns live references that
            # later optimizer steps would overwrite.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
            torch.save(model.state_dict(), 'best_gru_ae.pth')
            patience_cnt = 0
        else:
            patience_cnt += 1
            if patience_cnt >= PATIENCE:
                print("Early stopping...")
                break

    # Without this check a run whose loss is NaN from the start would fall
    # back to whatever checkpoint file an earlier run left on disk.
    if best_state is None:
        raise RuntimeError("Validation loss never improved (NaN loss or EPOCHS < 1); no model to return.")
    model.load_state_dict(best_state)
    return model, dataset.scaler

In [6]:
# -----------------------------
# Reconstruction error
# -----------------------------
def compute_errors(model, dataset):
    """Return the mean reconstruction error of each vessel in ``dataset``.

    Every window is passed through ``model``; its error is the mean squared
    difference over time steps and features. Window errors are grouped by the
    vessel id the dataset yields and averaged per vessel.

    Returns:
        dict mapping vessel id -> mean window error.
    """
    model.eval()
    batches = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False,
                         num_workers=4, pin_memory=True)

    per_vessel = {}
    with torch.no_grad():
        for windows, vessel_ids in batches:
            windows = windows.to(DEVICE)
            squared_diff = (model(windows) - windows) ** 2
            # One scalar per window: average over the time and feature axes.
            window_mse = squared_diff.mean(dim=(1, 2)).cpu().numpy()
            for pos, vessel in enumerate(vessel_ids):
                if vessel not in per_vessel:
                    per_vessel[vessel] = []
                per_vessel[vessel].append(window_mse[pos])

    averaged = {}
    for vessel, values in per_vessel.items():
        averaged[vessel] = np.mean(values)
    return averaged

In [7]:
# -----------------------------
# Anomaly detection and saving predictions
# -----------------------------
def predict_and_save(model, scaler, threshold):
    """Score every vessel listed in ``VALID_FILE`` and write the submission CSV.

    For each anonymized id the matching track CSV in ``TRACK_DIR`` is
    resampled, scaled with ``scaler`` and cut into windows of ``SEQ_LEN`` rows
    with stride ``STEP``. A vessel is flagged ``TRUE`` when the mean
    reconstruction error over its windows exceeds ``threshold``. Vessels
    without a track file, or with a track shorter than one window, are
    written as ``FALSE``.

    Args:
        model: trained autoencoder.
        scaler: fitted scaler providing ``transform``.
        threshold: mean-error cut-off above which a vessel is flagged.
    """
    model.eval()
    results = []
    val_df = pd.read_csv(VALID_FILE)
    for aid in val_df['Anonymized_MMSI'].astype(str):
        pred = False
        f = os.path.join(TRACK_DIR, f"{aid}.csv")
        if os.path.exists(f):
            df = pd.read_csv(f, parse_dates=['timestamp']).sort_values('timestamp').set_index('timestamp')
            arr = df[['sog','cog','latitude','longitude']]
            arr = arr.resample(RESAMPLE_FREQ).mean().interpolate().dropna().values
            # Check the length before scaling: scaler.transform raises on a
            # zero-row array, and fewer than SEQ_LEN rows give no window anyway.
            if len(arr) >= SEQ_LEN:
                normed = scaler.transform(arr)
                seqs = [normed[i:i+SEQ_LEN] for i in range(0, len(normed)-SEQ_LEN+1, STEP)]
                seqs_t = torch.tensor(np.stack(seqs), dtype=torch.float32).to(DEVICE)
                with torch.no_grad():
                    recon = model(seqs_t)
                errs = ((recon - seqs_t)**2).mean(dim=(1,2)).cpu().numpy()
                pred = bool(errs.mean() > threshold)
        results.append({'Anonymized_MMSI': aid, 'result': 'TRUE' if pred else 'FALSE'})
    pd.DataFrame(results).to_csv(OUTPUT_FILE, index=False)
    print(f"Saved submission to {OUTPUT_FILE}")

In [None]:
# -----------------------------
# Main
# -----------------------------
if __name__ == '__main__':
    # Seed every RNG the pipeline draws from (window subsampling, train/val
    # split, weight initialisation, batch shuffling) so a rerun gives the same
    # threshold and submission as far as the libraries allow.
    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    dataset = TrackSeqDataset(files, SEQ_LEN, STEP, RESAMPLE_FREQ, MAX_SEQ_PER_VESSEL)
    model, scaler = train_model(dataset)
    errs = compute_errors(model, dataset)
    # Flag vessels whose mean error is above the 95th percentile of the
    # per-vessel errors seen on the training tracks.
    thr = np.percentile(list(errs.values()), 95)
    print(f"Threshold: {thr:.6f}")
    predict_and_save(model, scaler, thr)

  df = pd.read_csv(f, parse_dates=['timestamp'])
