In [1]:
import os
import glob
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_recall_curve

# -----------------------------
# Configuration
# -----------------------------
# Folder containing the AIS track CSV files.
TRACK_DIR = '/content/drive/MyDrive/25년 해군 AI 경진대회/nk_dataset/mmsi_tracks'
# CSV file that maps anonymized MMSI identifiers to MMSI values.
MAPPING_FILE = '/content/drive/MyDrive/25년 해군 AI 경진대회/nk_dataset/Anonymized_MMSI_list-매핑용.csv'

SEQ_LEN = 50        # number of time steps per input window
BATCH_SIZE = 64
LATENT_DIM = 32     # latent (encoder hidden-state) dimension
EPOCHS = 30
LR = 1e-3
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Weight initialisation and DataLoader shuffling both draw from the global
# RNGs; seed them once here so a "Restart & Run All" reproduces the same run.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [2]:
# -----------------------------
# Data loading and ID mapping
# -----------------------------
mapping_df = pd.read_csv(MAPPING_FILE)
# Lookup from anonymized MMSI to MMSI, with both sides converted to strings.
mapping = dict(zip(mapping_df['Anonymized_MMSI'].astype(str), mapping_df['MMSI'].astype(str)))
# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# the tracks are always processed in the same order.
files = sorted(glob.glob(os.path.join(TRACK_DIR, '*.csv')))

In [3]:
# -----------------------------
# Dataset definition: input sequences for the LSTM autoencoder
# -----------------------------
class TrackSeqDataset(Dataset):
    """Sliding-window dataset over resampled, scaled track CSV files.

    Each file is read with ``timestamp`` parsed as a date, sorted by it,
    reduced to the ``sog``/``cog``/``latitude``/``longitude`` columns,
    resampled to 5-second bins (mean per bin), interpolated, and stripped of
    rows that still contain NaN. Every run of ``seq_len`` consecutive rows of
    the scaled result becomes one sample.

    Args:
        files: Iterable of track CSV paths. The file name without extension
            is used as the track id.
        seq_len: Number of rows per window.
        scaler: Optional object with ``fit``/``transform``. A scaler that is
            already fitted is used as-is; an unfitted one (or the default
            ``StandardScaler``) is fitted on all rows of all given tracks.

    Attributes:
        scaler: The scaler used to transform the tracks.
        ids: Track id of every file, in input order.
        sequences: List of ``(seq_len, 4)`` arrays, one per window.
        seq_ids: Track id of each entry in ``sequences``.

    Raises:
        ValueError: If the scaler needs fitting but ``files`` is empty.
    """

    # Columns used as model features, in this order.
    FEATURES = ['sog', 'cog', 'latitude', 'longitude']

    def __init__(self, files, seq_len, scaler=None):
        self.seq_len = seq_len
        # Test for None explicitly instead of relying on the scaler's truthiness.
        self.scaler = scaler if scaler is not None else StandardScaler()
        self.ids = []
        tracks = []
        for f in files:
            tracks.append(self._load_track(f))
            self.ids.append(os.path.splitext(os.path.basename(f))[0])

        # Only fit when needed: re-fitting a scaler the caller already fitted
        # would silently replace (and mutate) the statistics they passed in.
        if not self._is_fitted(self.scaler):
            if not tracks:
                raise ValueError("Cannot fit the scaler: no track files were given.")
            self.scaler.fit(np.vstack(tracks))

        self.sequences = []
        self.seq_ids = []
        for arr, aid in zip(tracks, self.ids):
            n_windows = len(arr) - seq_len + 1
            if n_windows <= 0:
                # Too short for even one window; skipping also avoids calling
                # transform() on an empty array.
                continue
            normed = self.scaler.transform(arr)
            for i in range(n_windows):
                self.sequences.append(normed[i:i + seq_len])
                self.seq_ids.append(aid)

    @classmethod
    def _load_track(cls, path):
        """Read one track CSV and return its resampled feature rows as an array."""
        df = pd.read_csv(path, parse_dates=['timestamp'])
        df = df.sort_values('timestamp').set_index('timestamp')
        # NOTE(review): cog is presumably a heading angle; averaging and
        # interpolating it linearly ignores wrap-around - confirm.
        df = df[cls.FEATURES].resample('5s').mean().interpolate().dropna()
        return df.values

    @staticmethod
    def _is_fitted(scaler):
        """Return True if the scaler carries any learned (trailing-underscore) attribute.

        This follows the scikit-learn convention that attributes set by
        ``fit`` end with an underscore.
        """
        attrs = getattr(scaler, '__dict__', {})
        return any(k.endswith('_') and not k.startswith('__') for k in attrs)

    def __len__(self):
        """Return the number of windows."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(window as float32 tensor, track id)`` for window ``idx``."""
        return torch.tensor(self.sequences[idx], dtype=torch.float32), self.seq_ids[idx]

In [4]:
# -----------------------------
# LSTM autoencoder model
# -----------------------------
class LSTMAutoencoder(nn.Module):
    """Sequence autoencoder: LSTM encoder, LSTM decoder, linear output layer.

    The encoder's final hidden state is repeated once per time step and fed
    to the decoder; a linear layer maps each decoder output back to feature
    space.

    An LSTM's output is ``o * tanh(c)`` and therefore lies in (-1, 1). Using
    the decoder's hidden state directly as the reconstruction would make any
    standardized value beyond +/-1 unreachable, so the decoder keeps
    ``latent_dim`` hidden units and an unbounded linear projection produces
    the ``n_feat`` outputs.

    Note: the ``state_dict`` layout (decoder shapes plus ``output_layer.*``)
    differs from a model whose decoder emitted ``n_feat`` units directly, so
    checkpoints saved from that layout will not load into this class.

    Args:
        seq_len: Not used by the model; kept so existing calls keep working.
        n_feat: Number of features per time step.
        latent_dim: Hidden size of the encoder and decoder LSTMs.
    """

    def __init__(self, seq_len, n_feat, latent_dim):
        super().__init__()
        self.encoder = nn.LSTM(n_feat, latent_dim, batch_first=True)
        self.decoder = nn.LSTM(latent_dim, latent_dim, batch_first=True)
        self.output_layer = nn.Linear(latent_dim, n_feat)

    def forward(self, x):
        """Reconstruct ``x`` of shape (batch, time, n_feat); output has the same shape."""
        _, (h, _) = self.encoder(x)
        # Last layer's final hidden state, repeated for every time step of x.
        latent = h[-1].unsqueeze(1).repeat(1, x.size(1), 1)
        decoded, _ = self.decoder(latent)
        return self.output_layer(decoded)

In [5]:
# -----------------------------
# Training function
# -----------------------------
def train_autoencoder(dataset):
    """Train an ``LSTMAutoencoder`` to reconstruct the windows in ``dataset``.

    Uses the module-level ``BATCH_SIZE``, ``SEQ_LEN``, ``LATENT_DIM``, ``LR``,
    ``EPOCHS`` and ``DEVICE`` settings, Adam as optimizer and mean squared
    error as the reconstruction loss. Prints the average batch loss after
    every epoch.

    Args:
        dataset: Dataset whose items are ``(window tensor, id)`` pairs; the
            id is ignored during training.

    Returns:
        The trained model, located on ``DEVICE``.

    Raises:
        ValueError: If ``dataset`` contains no windows.
    """
    if len(dataset) == 0:
        raise ValueError("Cannot train on an empty dataset (no sequences).")

    # Take the feature count from the data instead of hard-coding it, so the
    # model always matches whatever columns the dataset provides.
    n_feat = dataset[0][0].shape[-1]

    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    model = LSTMAutoencoder(SEQ_LEN, n_feat, LATENT_DIM).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = nn.MSELoss()
    model.train()
    for ep in range(1, EPOCHS+1):
        total_loss = 0
        for seqs, _ in loader:
            seqs = seqs.to(DEVICE)
            optimizer.zero_grad()
            recon = model(seqs)
            loss = loss_fn(recon, seqs)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {ep}/{EPOCHS} | Loss: {total_loss/len(loader):.6f}")
    return model

In [6]:
# -----------------------------
# Reconstruction error computation
# -----------------------------
def compute_errors(model, dataset):
    """Return the mean reconstruction error of every track in ``dataset``.

    The model is switched to eval mode and run without gradients over all
    windows in batches of ``BATCH_SIZE`` on ``DEVICE``. The squared error of
    each window is averaged over its time and feature axes, and the window
    errors are then averaged per track id.

    Args:
        model: Callable module mapping a batch of windows to reconstructions.
        dataset: Dataset exposing ``seq_ids`` and yielding ``(window, id)``.

    Returns:
        Dict mapping each id found in ``dataset.seq_ids`` to its mean error.
    """
    model.eval()
    eval_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    # One (initially empty) error list per distinct track id.
    per_track = {}
    for track_id in set(dataset.seq_ids):
        per_track[track_id] = []

    with torch.no_grad():
        for batch, batch_ids in eval_loader:
            batch = batch.to(DEVICE)
            diff = model(batch) - batch
            window_mse = (diff ** 2).mean(dim=(1, 2)).cpu().numpy()
            for track_id, mse in zip(batch_ids, window_mse):
                per_track[track_id].append(mse)

    mean_err = {}
    for track_id, values in per_track.items():
        mean_err[track_id] = np.mean(values)
    return mean_err

In [7]:
# -----------------------------
# Anomaly detection function
# -----------------------------
def detect_anomalies(mean_errs, threshold=None, percentile=95):
    """Flag the ids whose mean error is strictly above a threshold.

    Args:
        mean_errs: Dict mapping id to mean reconstruction error.
        threshold: Cut-off value. When None, it is set to the given
            percentile of all error values and the chosen value is printed.
        percentile: Percentile used when ``threshold`` is None.

    Returns:
        Tuple ``(anomalies, threshold)``: the flagged ids in the dict's
        iteration order, and the threshold that was applied.
    """
    scores = np.array([score for score in mean_errs.values()])
    if threshold is None:
        threshold = np.percentile(scores, percentile)
        print(f"Threshold set to {percentile}th percentile: {threshold:.6f}")

    flagged = []
    for track_id, score in mean_errs.items():
        if score > threshold:
            flagged.append(track_id)
    return flagged, threshold

In [8]:
# -----------------------------
# Validation-set prediction and submission file creation
# -----------------------------
def _mean_window_error(model, arr, seq_len, batch_size, device):
    """Return the mean per-window reconstruction MSE of one scaled track.

    Windows are built and pushed through the model ``batch_size`` at a time,
    so a long track never has to fit on the device in one piece. The caller
    must ensure ``len(arr) >= seq_len``.
    """
    n_windows = len(arr) - seq_len + 1
    chunk_errs = []
    with torch.no_grad():
        for start in range(0, n_windows, batch_size):
            stop = min(start + batch_size, n_windows)
            windows = np.stack([arr[i:i + seq_len] for i in range(start, stop)])
            batch = torch.tensor(windows, dtype=torch.float32).to(device)
            recon = model(batch)
            chunk_errs.append(((recon - batch) ** 2).mean(dim=(1, 2)).cpu().numpy())
    return np.mean(np.concatenate(chunk_errs))


def predict_and_save(validation_file, track_dir, model, scaler, seq_len, threshold, output_file, batch_size=256):
    """Score every id in the validation list and write the predictions as CSV.

    For each ``Anonymized_MMSI`` in ``validation_file`` the track
    ``<track_dir>/<id>.csv`` is preprocessed (sorted by ``timestamp``,
    resampled to 5-second means, interpolated, NaN rows dropped), scaled with
    ``scaler``, split into windows of ``seq_len`` rows and reconstructed by
    ``model``. The id is marked ``'TRUE'`` when its mean window error is
    strictly above ``threshold`` and ``'FALSE'`` otherwise. Ids without a
    track file, or whose track is too short for one window, are ``'FALSE'``.

    Args:
        validation_file: CSV with an ``Anonymized_MMSI`` column.
        track_dir: Folder containing the per-id track CSV files.
        model: ``nn.Module`` mapping a batch of windows to reconstructions.
            It is switched to eval mode.
        scaler: Fitted object with a ``transform`` method.
        seq_len: Number of rows per window.
        threshold: Error value above which an id is flagged.
        output_file: Destination CSV (columns ``Anonymized_MMSI``, ``result``).
        batch_size: Number of windows sent through the model at once.
    """
    val_df = pd.read_csv(validation_file)

    # Inference must not depend on whoever last toggled train/eval mode.
    model.eval()
    # Run on whatever device the model already lives on; a parameter-free
    # model falls back to the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    results = []
    for aid in val_df['Anonymized_MMSI'].astype(str):
        fpath = os.path.join(track_dir, f"{aid}.csv")
        mean_err = np.nan
        if os.path.exists(fpath):
            df = pd.read_csv(fpath, parse_dates=['timestamp']).sort_values('timestamp').set_index('timestamp')
            df = df[['sog','cog','latitude','longitude']].resample('5s').mean().interpolate().dropna()
            # Only scale tracks that can yield a window; this also keeps
            # empty tracks away from scaler.transform().
            if len(df) >= seq_len:
                arr = scaler.transform(df.values)
                mean_err = _mean_window_error(model, arr, seq_len, batch_size, device)
        # NaN (no score available) compares False against any threshold.
        pred = bool(mean_err > threshold)
        results.append({'Anonymized_MMSI': aid, 'result': 'TRUE' if pred else 'FALSE'})
    out_df = pd.DataFrame(results)
    out_df.to_csv(output_file, index=False)
    print(f"Saved predictions to {output_file}")

In [None]:
# -----------------------------
# Main execution
# -----------------------------
if __name__ == '__main__':
    # Build the training dataset and train the autoencoder
    dataset = TrackSeqDataset(files, SEQ_LEN)
    ae_model = train_autoencoder(dataset)
    torch.save(ae_model.state_dict(), 'lstm_ae_nk_detection.pth')

    # Compute errors on the training set and detect anomalies (this decides the threshold)
    mean_errors = compute_errors(ae_model, dataset)
    anomalies, thr = detect_anomalies(mean_errors)
    print(f"Detected anomalies (training): {anomalies}")

    # Predict on the validation set and save the results
    # The scaler fitted on the training tracks and the training-derived threshold are reused here.
    validation_file = '/content/drive/MyDrive/25년 해군 AI 경진대회/2번문제/Anonymized_MMSI_list_validation(2번문제)_수정.csv'
    output_file = 'submission_nkmodel_ver1.csv'
    predict_and_save(validation_file, TRACK_DIR, ae_model, dataset.scaler, SEQ_LEN, thr, output_file)

  df = pd.read_csv(f, parse_dates=['timestamp'])
