In [29]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold
import glob
import os
from datetime import datetime
from tqdm import tqdm

# Mixed Precision Training 지원 확인
try:
    from torch.cuda.amp import autocast, GradScaler
    AMP_AVAILABLE = True
except ImportError:
    AMP_AVAILABLE = False
    print("Warning: Mixed Precision Training not available. Using standard training.")

In [67]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold
import glob
import os
from datetime import datetime
from tqdm import tqdm
import pickle
from multiprocessing import Pool, cpu_count
from functools import partial
import warnings
warnings.filterwarnings('ignore')

# Mixed Precision Training 지원 확인
try:
    from torch.cuda.amp import autocast, GradScaler
    AMP_AVAILABLE = True
except ImportError:
    AMP_AVAILABLE = False
    print("Warning: Mixed Precision Training not available. Using standard training.")


class VesselBehaviorAnalyzer:
    """선박 행동 패턴 분석기 - 속도 최적화 버전"""
    
    def __init__(self, data_path, meta_path):
        self.data_path = data_path
        self.meta_data = pd.read_csv(meta_path)
        self.nll_latitude = 38.0  # 북방한계선 대략적 위도
        
    def load_and_preprocess(self, mmsi):
        """데이터 로드 및 전처리 - 최적화된 버전"""
        file_path = os.path.join(self.data_path, f"{mmsi}.csv")

        # 파일 크기 확인
        file_size = os.path.getsize(file_path) / (1024 * 1024)  # MB

        # 대용량 파일 처리
        if file_size > 50:  # 50MB 이상
            print(f"Large file detected: {mmsi} ({file_size:.1f} MB), using chunk reading...")
            chunks = []
            for chunk in pd.read_csv(file_path, chunksize=100000):
                chunks.append(chunk)
            df = pd.concat(chunks, ignore_index=True)
        else:
            df = pd.read_csv(file_path)

        # 데이터 포인트가 너무 많으면 샘플링
        MAX_POINTS = 1000000
        if len(df) > MAX_POINTS:
            print(f"Too many points: {mmsi} ({len(df)} points), sampling to {MAX_POINTS}...")
            step = len(df) // MAX_POINTS
            df = df.iloc[::step]

        # 시간대 처리
        df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
        df = df.sort_values('timestamp')

        # 중복 타임스탬프 처리 - drop_duplicates가 groupby보다 빠름
        df = df.drop_duplicates(subset=['timestamp'], keep='first')

        # 30초 간격 리샘플링으로 변경 (5초 -> 30초)
        df.set_index('timestamp', inplace=True)

        # 데이터가 충분히 조밀하면 리샘플링 생략
        time_diff = df.index.to_series().diff().dt.total_seconds()
        median_diff = time_diff.median()

        if median_diff > 60:  # 중간값이 60초 초과면 리샘플링
            df = df.resample('30s').mean()  # 30초 간격으로 변경
            df = df.interpolate(method='linear', limit=3)

        df = df.dropna()

        return df
    
    def extract_behavioral_features(self, df):
        """행동 패턴 기반 특징 추출 - 벡터화 최적화"""
        features = {}
        
        # NumPy 배열로 변환하여 더 빠른 연산
        lat = df['latitude'].values
        lon = df['longitude'].values
        sog = df['sog'].values
        cog = df['cog'].values
        
        # 1. 속도 패턴 분석 - 벡터화
        features['avg_speed'] = np.mean(sog)
        features['speed_std'] = np.std(sog) if len(sog) > 1 else 0
        features['speed_q25'] = np.percentile(sog, 25)
        features['speed_q75'] = np.percentile(sog, 75)
        features['zero_speed_ratio'] = np.mean(sog < 0.5)
        features['high_speed_ratio'] = np.mean(sog > 15)
        
        # 속도 변화율
        if len(sog) > 1:
            speed_change = np.abs(np.diff(sog))
            features['avg_speed_change'] = np.mean(speed_change)
            features['sudden_speed_changes'] = int(np.sum(speed_change > 5))
        else:
            features['avg_speed_change'] = 0
            features['sudden_speed_changes'] = 0
        
        # 2. 방향 패턴 분석 - 벡터화
        if len(cog) > 1:
            cog_diff = np.diff(cog)
            # 각도 차이를 -180 ~ 180 범위로 정규화
            cog_diff = (cog_diff + 180) % 360 - 180
            
            features['avg_course_change'] = np.mean(np.abs(cog_diff))
            features['zigzag_score'] = np.std(cog_diff)
            features['sharp_turns'] = int(np.sum(np.abs(cog_diff) > 45))
            features['u_turns'] = int(np.sum(np.abs(cog_diff) > 150))
        else:
            features['avg_course_change'] = 0
            features['zigzag_score'] = 0
            features['sharp_turns'] = 0
            features['u_turns'] = 0
        
        # 3. 공간 패턴 분석
        features['lat_range'] = lat.max() - lat.min()
        features['lon_range'] = lon.max() - lon.min()
        features['max_latitude'] = lat.max()
        features['min_latitude'] = lat.min()
        
        # NLL 관련 특징
        features['north_of_nll_ratio'] = np.mean(lat > self.nll_latitude)
        features['nll_crossings'] = self._count_nll_crossings_fast(lat)
        features['max_north_distance'] = lat.max() - self.nll_latitude
        
        # 4. 이동 패턴 분석 - 벡터화된 거리 계산
        features['total_distance'] = self._calculate_trajectory_distance_vectorized(lat, lon)
        features['displacement'] = self._calculate_displacement_fast(lat, lon)
        features['trajectory_efficiency'] = (
            features['displacement'] / features['total_distance'] 
            if features['total_distance'] > 0 else 0
        )
        
        # 5. 정박/대기 패턴 - 벡터화
        stationary_features = self._find_stationary_periods_vectorized(sog)
        features.update(stationary_features)
        
        # 6. 활동 시간 패턴
        if hasattr(df.index, 'hour'):
            hours = df.index.hour
            features['night_activity_ratio'] = np.mean((hours >= 20) | (hours <= 5))
            features['dawn_activity_ratio'] = np.mean((hours >= 3) & (hours <= 6))
        else:
            features['night_activity_ratio'] = 0
            features['dawn_activity_ratio'] = 0
        
        # 7. 궤적 복잡도 - 제거됨 (heading_entropy, location_revisits)
        # features['heading_entropy'] = self._calculate_heading_entropy_fast(cog)
        # features['location_revisits'] = self._count_location_revisits_fast(lat, lon)
        
        return features
    
    def _count_nll_crossings_fast(self, lat):
        """NLL 경계선 통과 횟수 - 벡터화"""
        north_flags = lat > self.nll_latitude
        if len(north_flags) > 1:
            crossings = np.sum(np.diff(north_flags.astype(int)) != 0)
        else:
            crossings = 0
        return int(crossings)
    
    def _calculate_trajectory_distance_vectorized(self, lat, lon):
        """벡터화된 총 이동 거리 계산"""
        if len(lat) < 2:
            return 0
        
        # 라디안 변환
        lat_rad = np.radians(lat)
        lon_rad = np.radians(lon)
        
        # 연속된 점들 간의 차이
        dlat = lat_rad[1:] - lat_rad[:-1]
        dlon = lon_rad[1:] - lon_rad[:-1]
        
        # Haversine 공식 벡터화
        a = (np.sin(dlat/2)**2 + 
             np.cos(lat_rad[:-1]) * np.cos(lat_rad[1:]) * 
             np.sin(dlon/2)**2)
        c = 2 * np.arcsin(np.sqrt(np.minimum(1, a)))  # 수치 안정성
        
        distances = 6371 * c  # 지구 반지름 (km)
        return float(np.sum(distances))
    
    def _calculate_displacement_fast(self, lat, lon):
        """시작점-끝점 직선 거리 - 벡터화"""
        if len(lat) < 1:
            return 0
            
        lat1, lon1 = np.radians(lat[0]), np.radians(lon[0])
        lat2, lon2 = np.radians(lat[-1]), np.radians(lon[-1])
        
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arcsin(np.sqrt(np.minimum(1, a)))
        
        return float(6371 * c)
    
    def _find_stationary_periods_vectorized(self, sog):
        """정박 기간 찾기 - 벡터화"""
        stationary = sog < 0.5
        
        if len(stationary) == 0:
            return {
                'num_stops': 0,
                'avg_stop_duration': 0,
                'longest_stop': 0
            }
        
        # 상태 변화 감지
        padded = np.concatenate([[False], stationary, [False]])
        diff = np.diff(padded.astype(int))
        starts = np.where(diff == 1)[0]
        ends = np.where(diff == -1)[0]
        
        if len(starts) == 0 or len(ends) == 0:
            return {
                'num_stops': 0,
                'avg_stop_duration': 0,
                'longest_stop': 0
            }
        
        # 시작과 끝 매칭
        if ends[0] < starts[0]:
            ends = ends[1:]
        if len(starts) > len(ends):
            starts = starts[:len(ends)]
        
        durations = (ends - starts) * 30  # 30초 간격으로 변경
        
        return {
            'num_stops': int(len(starts)),
            'avg_stop_duration': float(np.mean(durations)) if len(durations) > 0 else 0,
            'longest_stop': float(np.max(durations)) if len(durations) > 0 else 0
        }
    
    def _calculate_heading_entropy_fast(self, cog):
        """방향 엔트로피 계산 - 최적화"""
        if len(cog) == 0 or np.all(np.isnan(cog)):
            return 0
            
        # NaN 제거
        cog_clean = cog[~np.isnan(cog)]
        if len(cog_clean) == 0:
            return 0
        
        # 8방향으로 양자화
        bins = np.arange(0, 361, 45)
        hist, _ = np.histogram(cog_clean, bins=bins)
        
        # 확률 계산
        hist = hist + 1e-10  # 0 방지
        hist = hist / hist.sum()
        
        # 엔트로피
        entropy = -np.sum(hist * np.log(hist))
        return float(entropy)
    
    def _count_location_revisits_fast(self, lat, lon):
        """위치 재방문 횟수 - 최적화"""
        if len(lat) == 0:
            return 0
            
        # 그리드 양자화
        lat_grid = np.round(lat * 100).astype(int)
        lon_grid = np.round(lon * 100).astype(int)
        
        # 유니크 위치와 카운트
        locations = np.column_stack([lat_grid, lon_grid])
        unique_locs, counts = np.unique(locations, axis=0, return_counts=True)
        
        return int(np.sum(counts > 1))


class SequenceFeatureExtractor:
    """시퀀스 데이터에서 추가 특징 추출"""
    
    def __init__(self, window_size=20):
        self.window_size = window_size
        
    def extract_sequence_features(self, df):
        """슬라이딩 윈도우 기반 시퀀스 특징 - rolling window 제거"""
        features = []
        
        # 기본 특징
        base_features = ['latitude', 'longitude', 'sog', 'cog']
        
        # 파생 특징 추가
        df['lat_diff'] = df['latitude'].diff()
        df['lon_diff'] = df['longitude'].diff()
        df['speed_acc'] = df['sog'].diff()
        df['course_change'] = df['cog'].diff().abs()
        
        # rolling window 제거 - 불필요한 연산 제거
        # df['sog_ma'] = df['sog'].rolling(window=5, min_periods=1).mean()
        # df['cog_ma'] = df['cog'].rolling(window=5, min_periods=1).mean()
        # df['sog_std'] = df['sog'].rolling(window=10, min_periods=1).std()
        # df['cog_std'] = df['cog'].rolling(window=10, min_periods=1).std()
        
        # 북방 이동 지표
        df['northward'] = (df['lat_diff'] > 0).astype(int)
        df['near_nll'] = (df['latitude'] > 37.5).astype(int)
        
        feature_cols = base_features + ['lat_diff', 'lon_diff', 'speed_acc', 
                                       'course_change', 'northward', 'near_nll']
        
        return df[feature_cols].fillna(0).values


def process_single_vessel(args):
    """병렬 처리를 위한 단일 선박 처리 함수"""
    mmsi, data_path, meta_path, label = args
    
    try:
        # 분석기 생성
        analyzer = VesselBehaviorAnalyzer(data_path, meta_path)
        seq_extractor = SequenceFeatureExtractor()
        
        # 데이터 로드 및 전처리
        vessel_data = analyzer.load_and_preprocess(mmsi)
        
        if len(vessel_data) < 50:
            return None
        
        # 특징 추출
        behavioral_features = analyzer.extract_behavioral_features(vessel_data)
        sequence_features = seq_extractor.extract_sequence_features(vessel_data)
        
        return {
            'mmsi': mmsi,
            'sequence': sequence_features,
            'features': behavioral_features,
            'label': label
        }
        
    except Exception as e:
        return None


def load_data_parallel(data_path, meta_path, valid_meta_data, n_jobs=None):
    """병렬 처리를 사용한 데이터 로드"""
    if n_jobs is None:
        n_jobs = min(cpu_count() - 1, 8)
    
    print(f"\nUsing {n_jobs} parallel workers for preprocessing...")
    
    # 병렬 처리를 위한 인자 준비
    args_list = []
    for idx, row in valid_meta_data.iterrows():
        if 'MMSI' in valid_meta_data.columns:
            mmsi = str(row['MMSI'])
        else:
            mmsi = str(row['mmsi'])
        
        if 'label' in valid_meta_data.columns:
            label = int(row['label'])  # boolean을 int로 변환
        else:
            label = int(row['result'])
        
        args_list.append((mmsi, data_path, meta_path, label))
    
    # 병렬 처리
    with Pool(processes=n_jobs) as pool:
        results = list(tqdm(
            pool.imap(process_single_vessel, args_list),
            total=len(args_list),
            desc="Processing vessels"
        ))
    
    # 유효한 결과만 필터링
    valid_results = [r for r in results if r is not None]
    
    # 결과 분리
    all_sequences = [r['sequence'] for r in valid_results]
    all_features = [r['features'] for r in valid_results]
    all_labels = [r['label'] for r in valid_results]
    all_mmsi = [r['mmsi'] for r in valid_results]
    
    return all_sequences, all_features, all_labels, all_mmsi


class VesselDataset(Dataset):
    """PyTorch Dataset for vessel trajectories"""
    
    def __init__(self, data_list, labels, seq_length=200, augment=True):
        self.data_list = data_list
        self.labels = labels
        self.seq_length = seq_length
        self.augment = augment
        
    def __len__(self):
        return len(self.data_list)
    
    def __getitem__(self, idx):
        data = self.data_list[idx]
        label = self.labels[idx]
        
        # 데이터 증강
        if self.augment and np.random.random() > 0.5:
            data = self._augment_sequence(data)
        
        # 시퀀스 길이 조정
        if len(data) > self.seq_length:
            # 랜덤 크롭
            start = np.random.randint(0, len(data) - self.seq_length)
            data = data[start:start + self.seq_length]
        else:
            # 패딩
            pad_length = self.seq_length - len(data)
            padding = np.zeros((pad_length, data.shape[1]))
            data = np.vstack([data, padding])
            
        return torch.FloatTensor(data), torch.FloatTensor([label])
    
    def _augment_sequence(self, data):
        """데이터 증강"""
        augmented = data.copy()
        
        # 1. 노이즈 추가
        if np.random.random() > 0.5:
            noise = np.random.normal(0, 0.01, augmented.shape)
            augmented[:, :4] += noise[:, :4]  # 위치와 속도에만 노이즈
            
        # 2. 시간 축 스케일링 (일부 구간 스킵)
        if np.random.random() > 0.5:
            indices = np.sort(np.random.choice(len(augmented), 
                                             int(len(augmented) * 0.9), 
                                             replace=False))
            augmented = augmented[indices]
            
        return augmented


class SimpleLSTM(nn.Module):
    """단순화된 LSTM 모델 (autocast 호환)"""
    
    def __init__(self, input_dim=10, hidden_dim=64, num_layers=2, dropout=0.2):
        super().__init__()
        
        # 입력 정규화
        self.input_norm = nn.BatchNorm1d(input_dim)
        
        # LSTM layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, 
                           num_layers=num_layers,
                           batch_first=True, 
                           bidirectional=True,
                           dropout=dropout if num_layers > 1 else 0)
        
        # Global average pooling 대신 마지막 hidden state 사용
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 1)  # logits 출력 (sigmoid 제거)
        )
        
    def forward(self, x):
        batch_size, seq_len, features = x.size()
        
        # 입력 정규화 (시퀀스 차원을 평탄화)
        x = x.reshape(-1, features)
        x = self.input_norm(x)
        x = x.reshape(batch_size, seq_len, features)
        
        # LSTM
        lstm_out, (hidden, _) = self.lstm(x)
        
        # 마지막 타임스텝의 hidden state 사용
        # hidden shape: (num_layers * 2, batch, hidden_dim)
        # 양방향 LSTM의 마지막 레이어 출력을 결합
        forward_hidden = hidden[-2, :, :]  # 마지막 레이어의 forward
        backward_hidden = hidden[-1, :, :]  # 마지막 레이어의 backward
        combined_hidden = torch.cat([forward_hidden, backward_hidden], dim=1)
        
        # Classification (logits 반환)
        logits = self.classifier(combined_hidden)
        return logits


class OptimizedFocalLoss(nn.Module):
    """클래스 분포를 반영한 최적화된 Focal Loss (logits 입력용)"""
    
    def __init__(self, alpha=0.19, gamma=2.0):
        super().__init__()
        self.alpha = alpha  # TRUE 클래스 비율 (19%)
        self.gamma = gamma
        
    def forward(self, inputs, targets):
        # inputs는 이제 logits (sigmoid 적용 전)
        # binary_cross_entropy_with_logits 사용 (autocast 안전)
        bce_loss = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        
        # 예측 확률 계산
        pt = torch.sigmoid(inputs) * targets + (1 - torch.sigmoid(inputs)) * (1 - targets)
        
        # 클래스별 가중치 적용
        alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
        focal_loss = alpha_t * (1 - pt) ** self.gamma * bce_loss
        
        return focal_loss.mean()


def find_optimal_threshold(y_true, y_pred_proba):
    """F1 Score를 최대화하는 최적 임계값 찾기"""
    thresholds = np.arange(0.3, 0.9, 0.01)
    best_f1 = 0
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_pred_proba > threshold).astype(int)
        
        # F1 계산
        tp = ((y_pred == 1) & (y_true == 1)).sum()
        fp = ((y_pred == 1) & (y_true == 0)).sum()
        fn = ((y_pred == 0) & (y_true == 1)).sum()
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    
    return best_threshold, best_f1


def train_model_with_strategy(model, train_loader, val_loader, epochs=50, device='cuda'):
    """클래스 분포를 고려한 전략적 모델 학습 (GPU 최적화 적용)"""
    model = model.to(device)
    
    # 최적화된 손실 함수 (테스트셋 TRUE 비율 19% 반영)
    criterion = OptimizedFocalLoss(alpha=0.19, gamma=2.0)
    
    # 단순화된 옵티마이저
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
    
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=5, factor=0.5, mode='max'
    )
    
    # Mixed Precision Training을 위한 GradScaler
    scaler = GradScaler() if AMP_AVAILABLE else None
    
    best_val_f1 = 0
    best_threshold = 0.5
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        train_preds = []
        train_labels = []
        
        for batch_x, batch_y in tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}'):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            optimizer.zero_grad()
            
            if AMP_AVAILABLE:
                # Mixed Precision Training
                with autocast():
                    logits = model(batch_x)  # 이제 logits 반환
                    loss = criterion(logits, batch_y)
                
                # Scaled backward pass
                scaler.scale(loss).backward()
                
                # Gradient clipping
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                
                # Optimizer step with scaler
                scaler.step(optimizer)
                scaler.update()
            else:
                # Standard training
                logits = model(batch_x)
                loss = criterion(logits, batch_y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            
            train_loss += loss.item()
            # 예측을 위해 sigmoid 적용
            train_preds.extend(torch.sigmoid(logits).detach().cpu().numpy())
            train_labels.extend(batch_y.cpu().numpy())
        
        # Validation
        model.eval()
        val_preds = []
        val_labels = []
        
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                
                if AMP_AVAILABLE:
                    # Mixed Precision for validation
                    with autocast():
                        logits = model(batch_x)
                else:
                    logits = model(batch_x)
                
                # 예측을 위해 sigmoid 적용
                val_preds.extend(torch.sigmoid(logits).cpu().numpy())
                val_labels.extend(batch_y.cpu().numpy())
        
        # 최적 임계값 찾기
        val_preds = np.array(val_preds)
        val_labels = np.array(val_labels)
        
        current_threshold, current_f1 = find_optimal_threshold(val_labels, val_preds)
        
        # 기본 임계값(0.5)로도 평가
        val_preds_binary = (val_preds > 0.5).astype(int)
        default_f1 = f1_score(val_labels, val_preds_binary)
        
        print(f'Epoch {epoch+1}: Loss={train_loss/len(train_loader):.4f}, '
              f'F1@0.5={default_f1:.4f}, F1@opt={current_f1:.4f} (threshold={current_threshold:.3f})')
        
        # 최고 성능 모델 저장
        if current_f1 > best_val_f1:
            best_val_f1 = current_f1
            best_threshold = current_threshold
            torch.save({
                'model_state_dict': model.state_dict(),
                'threshold': best_threshold,
                'f1_score': best_val_f1
            }, 'best_vessel_model_strategic.pth')
        
        scheduler.step(current_f1)
    
    print(f"\nBest validation F1: {best_val_f1:.4f} at threshold {best_threshold:.3f}")
    
    return model, best_threshold


def quick_test(data_path='data/', meta_path='meta_data.csv', test_size=10):
    """소량의 데이터로 빠른 테스트 실행"""
    print("="*50)
    print(f"QUICK TEST MODE - Testing with {test_size} samples")
    print("="*50)
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")
    
    # 데이터 분석기 초기화
    analyzer = VesselBehaviorAnalyzer(data_path, meta_path)
    seq_extractor = SequenceFeatureExtractor()
    
    # 메타데이터 로드 및 샘플링
    if os.path.exists(meta_path):
        # 샘플 데이터만 선택
        sample_meta = analyzer.meta_data.sample(n=min(test_size, len(analyzer.meta_data)), random_state=42)
        print(f"Sampled {len(sample_meta)} vessels for testing")
    else:
        print(f"Error: meta_data file not found at {meta_path}")
        return False
    
    # 데이터 로드 테스트
    success_count = 0
    all_sequences = []
    all_labels = []
    
    print("\nTesting data loading...")
    for idx, row in sample_meta.iterrows():
        # MMSI 컬럼명 처리
        if 'MMSI' in sample_meta.columns:
            mmsi = str(row['MMSI'])
        else:
            mmsi = str(row['mmsi'])
        
        # 레이블 컬럼명 처리
        if 'label' in sample_meta.columns:
            label = int(row['label'])  # boolean을 int로 변환
        else:
            label = int(row['result'])
        
        try:
            file_path = os.path.join(data_path, f"{mmsi}.csv")
            if not os.path.exists(file_path):
                print(f"  - File not found: {file_path}")
                continue
            
            # 데이터 로드
            vessel_data = analyzer.load_and_preprocess(mmsi)
            
            if len(vessel_data) < 50:
                print(f"  - MMSI {mmsi}: Too few data points ({len(vessel_data)})")
                continue
            
            # 특징 추출
            behavioral_features = analyzer.extract_behavioral_features(vessel_data)
            sequence_features = seq_extractor.extract_sequence_features(vessel_data)
            
            all_sequences.append(sequence_features)
            all_labels.append(label)
            success_count += 1
            print(f"  ✓ MMSI {mmsi}: Successfully processed ({len(vessel_data)} points)")
            
        except Exception as e:
            print(f"  ✗ MMSI {mmsi}: Error - {str(e)}")
            continue
    
    print(f"\nSuccessfully loaded: {success_count}/{len(sample_meta)} vessels")
    
    if success_count < 2:
        print("ERROR: Not enough data for testing. Need at least 2 samples.")
        return False
    
    # 간단한 모델 테스트
    print("\nTesting model training...")
    try:
        # 데이터셋 생성
        all_labels = np.array(all_labels)
        dataset = VesselDataset(all_sequences, all_labels, augment=True)
        dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
        
        # 모델 초기화 - input_dim 변경 (14 -> 10)
        model = SimpleLSTM(input_dim=10).to(device)
        criterion = OptimizedFocalLoss(alpha=0.19, gamma=2.0)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        
        # 1 에폭만 테스트
        model.train()
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            # Forward pass
            if AMP_AVAILABLE:
                try:
                    from torch.cuda.amp import autocast
                    with autocast():
                        logits = model(batch_x)
                        loss = criterion(logits, batch_y)
                except:
                    logits = model(batch_x)
                    loss = criterion(logits, batch_y)
            else:
                logits = model(batch_x)
                loss = criterion(logits, batch_y)
            
            print(f"  ✓ Forward pass successful")
            print(f"    - Input shape: {batch_x.shape}")
            print(f"    - Output shape: {logits.shape}")
            print(f"    - Loss: {loss.item():.4f}")
            
            # Backward pass
            loss.backward()
            optimizer.step()
            print(f"  ✓ Backward pass successful")
            
            break  # 한 배치만 테스트
        
        print("\n✓ All tests passed! Ready for full training.")
        return True
        
    except Exception as e:
        print(f"\n✗ Model test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False


def full_pipeline_test(data_path='data/', meta_path='meta_data.csv', test_size=20, epochs=3):
    """전체 파이프라인을 소량의 데이터로 테스트 (학습 포함)"""
    print("="*70)
    print(f"FULL PIPELINE TEST MODE")
    print(f"Testing with {test_size} samples and {epochs} epochs")
    print("="*70)
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")
    
    # 1. 데이터 로드 및 전처리 테스트
    print("\n[Step 1/5] Data Loading and Preprocessing")
    print("-"*50)
    
    analyzer = VesselBehaviorAnalyzer(data_path, meta_path)
    seq_extractor = SequenceFeatureExtractor()
    
    # 샘플 데이터 선택 (클래스 균형 유지)
    if 'label' in analyzer.meta_data.columns:
        label_col = 'label'
        true_samples = analyzer.meta_data[analyzer.meta_data[label_col] == True]
        false_samples = analyzer.meta_data[analyzer.meta_data[label_col] == False]
    else:
        label_col = 'result'
        true_samples = analyzer.meta_data[analyzer.meta_data[label_col] == True]
        false_samples = analyzer.meta_data[analyzer.meta_data[label_col] == False]
    
    # 샘플링
    true_samples = true_samples.sample(
        n=min(test_size//2, len(true_samples)), 
        random_state=42
    )
    false_samples = false_samples.sample(
        n=min(test_size//2, len(false_samples)), 
        random_state=42
    )
    sample_meta = pd.concat([true_samples, false_samples])
    
    print(f"Sampled {len(sample_meta)} vessels (TRUE: {len(true_samples)}, FALSE: {len(false_samples)})")
    
    # 데이터 로드
    all_sequences = []
    all_labels = []
    all_mmsi = []
    
    for idx, row in sample_meta.iterrows():
        mmsi = str(row.get('MMSI', row.get('mmsi')))
        label = int(row[label_col])  # boolean을 int로 변환
        
        try:
            vessel_data = analyzer.load_and_preprocess(mmsi)
            if len(vessel_data) < 50:
                continue
                
            behavioral_features = analyzer.extract_behavioral_features(vessel_data)
            sequence_features = seq_extractor.extract_sequence_features(vessel_data)
            
            all_sequences.append(sequence_features)
            all_labels.append(label)
            all_mmsi.append(mmsi)
            
        except Exception as e:
            continue
    
    print(f"Successfully loaded: {len(all_sequences)} vessels")
    
    if len(all_sequences) < 4:
        print("ERROR: Not enough data for pipeline test. Need at least 4 samples.")
        return False
    
    # 2. Train/Validation Split 테스트
    print("\n[Step 2/5] Train/Validation Split")
    print("-"*50)
    
    from sklearn.model_selection import train_test_split
    all_labels = np.array(all_labels)
    
    X_train_idx, X_val_idx, y_train, y_val = train_test_split(
        np.arange(len(all_sequences)), all_labels, 
        test_size=0.3, 
        stratify=all_labels, 
        random_state=42
    )
    
    train_sequences = [all_sequences[i] for i in X_train_idx]
    val_sequences = [all_sequences[i] for i in X_val_idx]
    
    print(f"Train set: {len(train_sequences)} vessels (TRUE: {y_train.sum()}, FALSE: {len(y_train)-y_train.sum()})")
    print(f"Val set: {len(val_sequences)} vessels (TRUE: {y_val.sum()}, FALSE: {len(y_val)-y_val.sum()})")
    
    # 3. 데이터셋 및 DataLoader 생성 테스트
    print("\n[Step 3/5] Dataset and DataLoader Creation")
    print("-"*50)
    
    train_dataset = VesselDataset(train_sequences, y_train, augment=True)
    val_dataset = VesselDataset(val_sequences, y_val, augment=False)
    
    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
    
    print(f"Train DataLoader: {len(train_loader)} batches")
    print(f"Val DataLoader: {len(val_loader)} batches")
    
    # 4. 모델 학습 테스트
    print("\n[Step 4/5] Model Training")
    print("-"*50)
    
    model = SimpleLSTM(input_dim=10).to(device)
    criterion = OptimizedFocalLoss(alpha=0.19, gamma=2.0)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            optimizer.zero_grad()
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0
        val_preds = []
        val_true = []
        
        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                logits = model(batch_x)
                loss = criterion(logits, batch_y)
                val_loss += loss.item()
                
                # sigmoid 적용하여 확률로 변환
                val_preds.extend(torch.sigmoid(logits).cpu().numpy())
                val_true.extend(batch_y.cpu().numpy())
        
        # 성능 평가
        val_preds = np.array(val_preds).flatten()
        val_true = np.array(val_true).flatten()
        val_binary = (val_preds > 0.5).astype(int)
        
        if val_true.sum() > 0 and (1-val_true).sum() > 0:  # 두 클래스 모두 있는 경우
            val_f1 = f1_score(val_true, val_binary)
        else:
            val_f1 = 0.0
        
        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss/len(train_loader):.4f}, "
              f"Val Loss: {val_loss/len(val_loader):.4f}, Val F1: {val_f1:.4f}")
    
    # 5. 예측 및 평가 테스트
    print("\n[Step 5/5] Prediction and Evaluation")
    print("-"*50)
    
    # 최종 예측
    final_preds = (val_preds > 0.5).astype(int)
    
    print(f"Predictions: TRUE={final_preds.sum()}, FALSE={len(final_preds)-final_preds.sum()}")
    print(f"Ground Truth: TRUE={val_true.sum()}, FALSE={len(val_true)-val_true.sum()}")
    
    if val_true.sum() > 0 and (1-val_true).sum() > 0:
        print("\nClassification Report:")
        print(classification_report(val_true, final_preds, 
                                  target_names=['Non-NK', 'NK vessel'], 
                                  zero_division=0))
    
    # 모델 저장 테스트
    try:
        torch.save({
            'model_state_dict': model.state_dict(),
            'test_completed': True
        }, 'test_model.pth')
        print("\n✓ Model save test successful")
        os.remove('test_model.pth')  # 테스트 파일 삭제
    except Exception as e:
        print(f"\n✗ Model save test failed: {str(e)}")
        return False
    
    print("\n" + "="*70)
    print("✓ ALL PIPELINE TESTS PASSED!")
    print("="*70)
    
    return True


def main():
    # 설정
    data_path = './tracks'              # 실제 데이터 경로로 변경하세요
    meta_path = './meta_data.csv'      # 실제 메타데이터 경로로 변경하세요
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # 테스트 모드 선택
    print("="*70)
    print("VESSEL DETECTION MODEL TRAINING")
    print("="*70)
    print("\nSelect mode:")
    print("1. Quick test (basic functionality check)")
    print("2. Full pipeline test (complete test with small data)")
    print("3. Full training (complete training with all data)")
    
    mode = input("\nEnter mode (1/2/3): ").strip()
    
    if mode == '1':
        # 빠른 테스트
        print("\nRunning quick test...")
        if not quick_test(data_path, meta_path, test_size=10):
            print("\nQuick test failed. Please check the errors above.")
            return
        print("\nQuick test completed successfully!")
        
    elif mode == '2':
        # 전체 파이프라인 테스트
        print("\nRunning full pipeline test...")
        if not full_pipeline_test(data_path, meta_path, test_size=20, epochs=3):
            print("\nPipeline test failed. Please check the errors above.")
            return
        print("\nPipeline test completed successfully!")
        
        response = input("\nContinue with full training? (y/n): ")
        if response.lower() != 'y':
            print("Training cancelled.")
            return
            
    elif mode == '3':
        # 전체 학습 진행
        pass
    else:
        print("Invalid mode selected. Exiting...")
        return
    
    # 전체 학습 코드 (mode 2에서 계속하거나 mode 3에서 바로 시작)
    print("\n" + "="*70)
    print("Starting full training...")
    print("="*70 + "\n")
    
    # 데이터 분석기 초기화
    analyzer = VesselBehaviorAnalyzer(data_path, meta_path)
    seq_extractor = SequenceFeatureExtractor()
    
    # 캐시 확인
    cache_file = 'preprocessed_data_cache.pkl'
    use_cache = False
    
    if os.path.exists(cache_file):
        print(f"\nCache file found: {cache_file}")
        response = input("Use cached preprocessed data? (y/n) [default: y]: ").strip() or 'y'
        use_cache = response.lower() == 'y'
    
    if use_cache:
        print("\nLoading preprocessed data from cache...")
        with open(cache_file, 'rb') as f:
            cache_data = pickle.load(f)
            all_sequences = cache_data['sequences']
            all_features = cache_data['features']
            all_labels = cache_data['labels']
            all_mmsi = cache_data['mmsi']
        print(f"Loaded {len(all_sequences)} vessels from cache")
    else:
        # 파일 존재 여부 체크
        print("Checking data availability...")
        available_files = set()
        for f in os.listdir(data_path):
            if f.endswith('.csv') and f != 'meta_data.csv':
                available_files.add(f.replace('.csv', ''))
        
        # MMSI 컬럼명 자동 감지
        if 'MMSI' in analyzer.meta_data.columns:
            meta_mmsi = set(analyzer.meta_data['MMSI'].astype(str))
        else:
            meta_mmsi = set(analyzer.meta_data['mmsi'].astype(str))
        
        # 실제 사용 가능한 MMSI 찾기
        valid_mmsi = available_files & meta_mmsi
        files_without_label = available_files - meta_mmsi
        meta_without_file = meta_mmsi - available_files
        
        print(f"Total MMSI in meta_data: {len(meta_mmsi)}")
        print(f"Available CSV files: {len(available_files)}")
        print(f"Valid MMSI (file + label): {len(valid_mmsi)}")
        
        if files_without_label:
            print(f"Warning: {len(files_without_label)} files without labels in meta_data")
        if meta_without_file:
            print(f"Warning: {len(meta_without_file)} MMSI in meta_data without files")
        
        # 유효한 MMSI만 필터링
        if 'MMSI' in analyzer.meta_data.columns:
            valid_meta_data = analyzer.meta_data[analyzer.meta_data['MMSI'].astype(str).isin(valid_mmsi)]
        else:
            valid_meta_data = analyzer.meta_data[analyzer.meta_data['mmsi'].astype(str).isin(valid_mmsi)]
        
        # 클래스 분포 확인
        if 'label' in valid_meta_data.columns:
            true_count = (valid_meta_data['label'] == True).sum()  # boolean True
            false_count = (valid_meta_data['label'] == False).sum()  # boolean False
        else:
            true_count = (valid_meta_data['result'] == True).sum()
            false_count = (valid_meta_data['result'] == False).sum()
        
        print(f"\nClass distribution in valid data:")
        print(f"TRUE (NK vessels): {true_count} ({true_count/len(valid_meta_data)*100:.1f}%)")
        print(f"FALSE (non-NK vessels): {false_count} ({false_count/len(valid_meta_data)*100:.1f}%)")
        
        # 병렬 처리 옵션
        use_parallel = input("\nUse parallel processing? (y/n) [default: y]: ").strip() or 'y'
        
        if use_parallel.lower() == 'y':
            # 병렬 처리
            all_sequences, all_features, all_labels, all_mmsi = load_data_parallel(
                data_path, meta_path, valid_meta_data
            )
        else:
            # 기존 순차 처리 (최적화된 버전)
            all_sequences = []
            all_features = []
            all_labels = []
            all_mmsi = []
            failed_mmsi = []
            
            print("\nLoading and processing vessel data...")
            for idx, row in tqdm(valid_meta_data.iterrows(), total=len(valid_meta_data)):
                # MMSI 컬럼명 처리
                if 'MMSI' in valid_meta_data.columns:
                    mmsi = str(row['MMSI'])
                else:
                    mmsi = str(row['mmsi'])
                
                # 레이블 컬럼명 처리
                if 'label' in valid_meta_data.columns:
                    label = int(row['label'])  # boolean을 int로 변환
                else:
                    label = int(row['result'])
                
                try:
                    # 파일 존재 확인
                    file_path = os.path.join(data_path, f"{mmsi}.csv")
                    if not os.path.exists(file_path):
                        failed_mmsi.append(mmsi)
                        continue
                        
                    # 데이터 로드 및 전처리
                    vessel_data = analyzer.load_and_preprocess(mmsi)
                    
                    # 최소 데이터 포인트 체크 (너무 짧은 궤적 제외)
                    if len(vessel_data) < 50:
                        print(f"Skipping MMSI {mmsi}: insufficient data points ({len(vessel_data)})")
                        continue
                    
                    # 행동 특징 추출
                    behavioral_features = analyzer.extract_behavioral_features(vessel_data)
                    
                    # 시퀀스 특징 추출
                    sequence_features = seq_extractor.extract_sequence_features(vessel_data)
                    
                    all_sequences.append(sequence_features)
                    all_features.append(behavioral_features)
                    all_labels.append(label)
                    all_mmsi.append(mmsi)
                    
                except Exception as e:
                    print(f"\nError processing MMSI {mmsi}: {e}")
                    failed_mmsi.append(mmsi)
                    continue
            
            if failed_mmsi:
                print(f"Failed to process: {len(failed_mmsi)} vessels")
        
        print(f"\nSuccessfully processed: {len(all_sequences)} vessels")
        
        # 캐시 저장
        print("\nSaving preprocessed data to cache...")
        cache_data = {
            'sequences': all_sequences,
            'features': all_features,
            'labels': all_labels,
            'mmsi': all_mmsi
        }
        with open(cache_file, 'wb') as f:
            pickle.dump(cache_data, f)
        print(f"Cache saved to {cache_file}")
    
    if len(all_sequences) == 0:
        print("No data was successfully loaded. Exiting...")
        return
    
    # NumPy 배열로 변환
    all_labels = np.array(all_labels)
    all_mmsi = np.array(all_mmsi)
    
    # Train/Test 분리
    print("\n" + "="*50)
    print("Train/Test Split")
    print("="*50)
    
    # sklearn의 train_test_split 사용
    from sklearn.model_selection import train_test_split
    
    # 전체 데이터의 20%를 test set으로 분리
    X_temp = np.arange(len(all_sequences))
    X_train_idx, X_test_idx, y_train, y_test = train_test_split(
        X_temp, all_labels, 
        test_size=0.2, 
        stratify=all_labels, 
        random_state=42
    )
    
    # 실제 데이터 분리
    train_sequences = [all_sequences[i] for i in X_train_idx]
    train_labels = y_train
    train_mmsi = [all_mmsi[i] for i in X_train_idx]
    
    test_sequences = [all_sequences[i] for i in X_test_idx]
    test_labels = y_test
    test_mmsi = [all_mmsi[i] for i in X_test_idx]
    
    print(f"Train set size: {len(train_sequences)} vessels")
    print(f"Test set size: {len(test_sequences)} vessels")
    print(f"Train set - TRUE: {(train_labels == 1).sum()}, FALSE: {(train_labels == 0).sum()}")
    print(f"Test set - TRUE: {(test_labels == 1).sum()}, FALSE: {(test_labels == 0).sum()}")
    
    # Test MMSI 리스트 저장
    with open('test_mmsi_list.txt', 'w') as f:
        for mmsi in test_mmsi:
            f.write(f"{mmsi}\n")
    print("\nTest MMSI list saved to 'test_mmsi_list.txt'")
    
    # K-Fold Cross Validation (train set에서만)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    best_models = []
    fold_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(train_sequences, train_labels)):
        print(f"\n{'='*50}")
        print(f"Fold {fold + 1}/5")
        print(f"{'='*50}")
        
        # 데이터셋 생성
        fold_train_sequences = [train_sequences[i] for i in train_idx]
        fold_train_labels = train_labels[train_idx]
        fold_val_sequences = [train_sequences[i] for i in val_idx]
        fold_val_labels = train_labels[val_idx]
        
        train_dataset = VesselDataset(fold_train_sequences, fold_train_labels, augment=True)
        val_dataset = VesselDataset(fold_val_sequences, fold_val_labels, augment=False)
        
        # 배치 사이즈 동적 설정
        batch_size = min(32, len(fold_train_sequences) // 4)
        batch_size = max(batch_size, 1)
        
        # GPU 최적화된 DataLoader
        train_loader = DataLoader(
            train_dataset, 
            batch_size=batch_size, 
            shuffle=True,
            num_workers=4,  # 멀티 워커 사용
            pin_memory=True,  # GPU 전송 속도 향상
            persistent_workers=True  # 워커 재사용
        )
        val_loader = DataLoader(
            val_dataset, 
            batch_size=batch_size, 
            shuffle=False,
            num_workers=4,
            pin_memory=True,
            persistent_workers=True
        )
        
        # 모델 초기화 및 학습 - input_dim 변경
        model = SimpleLSTM(input_dim=10)
        trained_model, best_threshold = train_model_with_strategy(
            model, train_loader, val_loader, epochs=50, device=device
        )
        
        # 폴드별 모델과 임계값 저장
        torch.save({
            'model_state_dict': trained_model.state_dict(),
            'threshold': best_threshold
        }, f'vessel_model_fold{fold+1}.pth')
        
        best_models.append(trained_model)
        fold_scores.append({'threshold': best_threshold})
    
    print("\n" + "="*50)
    print("Training completed! Evaluating on held-out test set...")
    print("="*50)
    
    # Test set 평가 (앙상블) - GPU 최적화
    test_dataset = VesselDataset(test_sequences, test_labels, augment=False)
    test_loader = DataLoader(
        test_dataset, 
        batch_size=32, 
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )
    
    # 앙상블 예측
    all_test_preds = []
    optimal_thresholds = []
    
    for i, (model, fold_info) in enumerate(zip(best_models, fold_scores)):
        model.eval()
        fold_preds = []
        
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x = batch_x.to(device)
                
                if AMP_AVAILABLE:
                    # Mixed Precision for inference
                    with autocast():
                        logits = model(batch_x)
                else:
                    logits = model(batch_x)
                
                fold_preds.extend(torch.sigmoid(logits).cpu().numpy())
        
        all_test_preds.append(fold_preds)
        optimal_thresholds.append(fold_info['threshold'])
    
    # 평균 앙상블
    ensemble_preds = np.mean(all_test_preds, axis=0)
    
    # 평균 최적 임계값 사용
    avg_threshold = np.mean(optimal_thresholds)
    print(f"\nAverage optimal threshold from folds: {avg_threshold:.3f}")
    
    # 클래스 분포를 고려한 보정
    # 예측 확률이 너무 높으면 조정 (FALSE가 많아야 함)
    adjusted_preds = ensemble_preds * 0.9  # 살짝 낮춤
    
    # 최종 예측
    ensemble_preds_binary = (adjusted_preds > avg_threshold).astype(int).flatten()
    test_labels_binary = test_labels.astype(int)
    
    # 예측 분포 확인
    print(f"\nPrediction distribution:")
    print(f"TRUE predictions: {ensemble_preds_binary.sum()} ({ensemble_preds_binary.sum()/len(ensemble_preds_binary)*100:.1f}%)")
    print(f"FALSE predictions: {len(ensemble_preds_binary) - ensemble_preds_binary.sum()}")
    
    # 예상 점수 계산
    if ensemble_preds_binary.sum() > 0:
        expected_precision = 0.19  # 테스트셋 TRUE 비율
        pred_true_ratio = ensemble_preds_binary.sum() / len(ensemble_preds_binary)
        adjusted_precision = expected_precision / pred_true_ratio
        expected_f1 = 2 * adjusted_precision / (1 + adjusted_precision)
        print(f"\nIf all TRUE predictions are correct:")
        print(f"Expected precision: {adjusted_precision:.3f}")
        print(f"Expected F1: {expected_f1:.3f}")
    
    # 최종 성능 평가
    print("\nTest Set Performance:")
    print(classification_report(test_labels_binary, ensemble_preds_binary, 
                              target_names=['Non-NK', 'NK vessel']))
    
    print("\nConfusion Matrix:")
    print(confusion_matrix(test_labels_binary, ensemble_preds_binary))
    
    # 예측 결과 저장
    test_results = pd.DataFrame({
        'mmsi': test_mmsi,
        'true_label': test_labels_binary,
        'predicted_label': ensemble_preds_binary,
        'prediction_prob': adjusted_preds.flatten(),
        'threshold_used': avg_threshold
    })
    test_results.to_csv('test_predictions.csv', index=False)
    print("\nTest predictions saved to 'test_predictions.csv'")
    
    print("\nTraining and evaluation completed!")
    
    # 최종 제출용 함수 호출 가능
    print("\n" + "="*50)
    print("For competition submission, use the trained models with:")
    print(f"Optimal threshold: {avg_threshold:.3f}")
    print("Remember to adjust predictions to achieve ~19% TRUE ratio")
    print("="*50)


if __name__ == "__main__":
    main()

VESSEL DETECTION MODEL TRAINING

Select mode:
1. Quick test (basic functionality check)
2. Full pipeline test (complete test with small data)
3. Full training (complete training with all data)

Enter mode (1/2/3): 3

Starting full training...


Cache file found: preprocessed_data_cache.pkl
Use cached preprocessed data? (y/n) [default: y]: n
Checking data availability...
Total MMSI in meta_data: 753
Available CSV files: 1299
Valid MMSI (file + label): 640

Class distribution in valid data:
TRUE (NK vessels): 129 (20.2%)
FALSE (non-NK vessels): 511 (79.8%)

Use parallel processing? (y/n) [default: y]: 

Using 1 parallel workers for preprocessing...


Processing vessels:   0%|          | 1/640 [00:12<2:13:54, 12.57s/it]

Large file detected: 286094249 (141.5 MB), using chunk reading...


Processing vessels:   0%|          | 2/640 [00:32<2:58:29, 16.79s/it]

Too many points: 286094249 (2678456 points), sampling to 1000000...


Process ForkPoolWorker-290:
Traceback (most recent call last):
Processing vessels:   0%|          | 2/640 [00:43<3:49:20, 21.57s/it]  File "/usr/local/lib/python3.10/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
  File "/usr/local/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/multiprocessing/pool.py", line 131, in worker
    put((job, i, result))
  File "/usr/local/lib/python3.10/multiprocessing/queues.py", line 377, in put
    self._writer.send_bytes(obj)
  File "/usr/local/lib/python3.10/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/local/lib/python3.10/multiprocessing/connection.py", line 405, in _send_bytes
    self._send(buf)



KeyboardInterrupt: 

In [None]:
def predict_test_data(test_meta_path, data_path, model_paths, threshold, device='cuda'):
    """학습된 모델로 테스트 데이터 예측 - 30초 샘플링, 앙상블"""
    
    print("\n" + "="*70)
    print("PREDICTION ON TEST DATA (Ensemble)")
    print("="*70)
    
    # 1. 테스트 메타데이터 로드
    test_meta = pd.read_csv(test_meta_path)
    print(f"Loaded test metadata with {len(test_meta)} vessels")
    
    # 2. 데이터 분석기 초기화 - 30초 샘플링
    class FastVesselAnalyzer(VesselBehaviorAnalyzer):
        def load_and_preprocess(self, mmsi):
            """30초 샘플링 전처리"""
            df = pd.read_csv(os.path.join(self.data_path, f"{mmsi}.csv"))
            
            df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
            df = df.sort_values('timestamp')
            df = df.drop_duplicates(subset=['timestamp'], keep='first')
            df.set_index('timestamp', inplace=True)
            
            time_diff = df.index.to_series().diff().dt.total_seconds()
            median_diff = time_diff.median()
            
            if median_diff > 60:  # 중간값이 60초 초과면 리샘플링
                df = df.resample('15S').mean()  # 30초 간격
                df = df.interpolate(method='linear', limit=3)
            
            df = df.dropna()
            return df
            
        def _find_stationary_periods_vectorized(self, sog):
            """정박 기간 찾기 - 30초 간격 수정"""
            stationary = sog < 0.5
            
            if len(stationary) == 0:
                return {
                    'num_stops': 0,
                    'avg_stop_duration': 0,
                    'longest_stop': 0
                }
            
            padded = np.concatenate([[False], stationary, [False]])
            diff = np.diff(padded.astype(int))
            starts = np.where(diff == 1)[0]
            ends = np.where(diff == -1)[0]
            
            if len(starts) == 0 or len(ends) == 0:
                return {
                    'num_stops': 0,
                    'avg_stop_duration': 0,
                    'longest_stop': 0
                }
            
            if ends[0] < starts[0]:
                ends = ends[1:]
            if len(starts) > len(ends):
                starts = starts[:len(ends)]
            
            durations = (ends - starts) * 30  # 30초 간격으로 변경
            
            return {
                'num_stops': int(len(starts)),
                'avg_stop_duration': float(np.mean(durations)) if len(durations) > 0 else 0,
                'longest_stop': float(np.max(durations)) if len(durations) > 0 else 0
            }
    
    analyzer = FastVesselAnalyzer(data_path, test_meta_path)
    seq_extractor = SequenceFeatureExtractor()
    
    # 3. 사용 가능한 파일 확인
    available_files = set()
    for f in os.listdir(data_path):
        if f.endswith('.csv') and f != 'meta_data.csv':
            available_files.add(f.replace('.csv', ''))
    
    # MMSI 추출
    if 'MMSI' in test_meta.columns:
        test_mmsi_list = test_meta['MMSI'].astype(str).tolist()
    else:
        test_mmsi_list = test_meta['mmsi'].astype(str).tolist()
    
    # 4. 테스트 데이터 전처리
    print("\nProcessing test vessels...")
    test_sequences = []
    test_mmsi_valid = []
    
    for mmsi in tqdm(test_mmsi_list):
        if mmsi not in available_files:
            continue
            
        try:
            vessel_data = analyzer.load_and_preprocess(mmsi)
            
            if len(vessel_data) < 50:  # 학습 때와 동일한 임계값
                continue
            
            sequence_features = seq_extractor.extract_sequence_features(vessel_data)
            test_sequences.append(sequence_features)
            test_mmsi_valid.append(mmsi)
            
        except Exception as e:
            continue
    
    print(f"\nSuccessfully processed {len(test_sequences)} vessels out of {len(test_mmsi_list)}")
    
    # 5. 데이터셋 생성
    if len(test_sequences) == 0:
        print("No valid vessels to predict!")
        results_df = pd.DataFrame({
            'MMSI': test_mmsi_list,
            'prediction': 0,
            'probability': 0.0
        })
        results_df.to_csv('test_predictions_final.csv', index=False)
        return results_df
    
    dummy_labels = np.zeros(len(test_sequences))
    test_dataset = VesselDataset(test_sequences, dummy_labels, augment=False)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
    
    # 6. 모델 로드 및 앙상블 예측
    print("\nLoading models and making predictions...")
    all_predictions = []
    
    for i, model_path in enumerate(model_paths):
        if not os.path.exists(model_path):
            print(f"Warning: {model_path} not found, skipping...")
            continue
            
        print(f"Loading model {i+1}/{len(model_paths)}: {model_path}")
        
        # 모델 초기화 및 가중치 로드
        model = SimpleLSTM(input_dim=10).to(device)
        checkpoint = torch.load(model_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()
        
        # 예측
        fold_preds = []
        with torch.no_grad():
            for batch_x, _ in test_loader:
                batch_x = batch_x.to(device)
                logits = model(batch_x)
                probs = torch.sigmoid(logits).cpu().numpy()
                fold_preds.extend(probs)
        
        all_predictions.append(fold_preds)
    
    # 7. 앙상블 평균
    if len(all_predictions) > 0:
        ensemble_probs = np.mean(all_predictions, axis=0).flatten()
    else:
        print("No models loaded successfully!")
        ensemble_probs = np.zeros(len(test_sequences))
    
    # 8. 임계값 적용하여 최종 예측
    predictions = (ensemble_probs > threshold).astype(int)
    
    # 9. 전체 MMSI에 대한 결과 생성
    final_results = []
    for mmsi in test_mmsi_list:
        if mmsi in test_mmsi_valid:
            idx = test_mmsi_valid.index(mmsi)
            final_results.append({
                'MMSI': mmsi,
                'prediction': predictions[idx],
                'probability': ensemble_probs[idx]
            })
        else:
            # 처리되지 않은 선박은 FALSE(0)로 예측
            final_results.append({
                'MMSI': mmsi,
                'prediction': 0,
                'probability': 0.0
            })
    
    results_df = pd.DataFrame(final_results)
    
    # 예측 분포 확인
    print(f"\nPrediction distribution:")
    print(f"TRUE predictions: {(results_df['prediction'] == 1).sum()} ({(results_df['prediction'] == 1).sum()/len(results_df)*100:.1f}%)")
    print(f"FALSE predictions: {(results_df['prediction'] == 0).sum()}")
    
    # 결과 저장
    output_file = 'test_predictions_final.csv'
    results_df.to_csv(output_file, index=False)
    print(f"\nPredictions saved to {output_file}")
    
    return results_df

In [None]:
# 1. 모델 경로 설정 (5개 모두 사용)
model_paths = [f'vessel_model_fold{i}.pth' for i in range(1, 6)]

# 2. 최적 임계값 가져오기
checkpoint = torch.load('vessel_model_fold1.pth')
threshold = checkpoint.get('threshold', 0.5)
print(f"Using threshold: {threshold}")

# 3. 앙상블 예측 실행
predictions = predict_test_data(
    test_meta_path='./meta_test_data.csv',
    data_path='./tracks',
    model_paths=model_paths,
    threshold=threshold,
    device='cuda'
)

# 4. 0,1을 TRUE/FALSE로 변환
df = pd.read_csv('test_predictions_final.csv')
df['prediction'] = df['prediction'].map({0: 'FALSE', 1: 'TRUE'})
df.to_csv('test_predictions_final.csv', index=False)

print("\nPrediction completed!")
print(f"TRUE predictions: {(df['prediction'] == 'TRUE').sum()}")
print(f"FALSE predictions: {(df['prediction'] == 'FALSE').sum()}")

In [48]:
# 예측 시 사용하는 코드
model_paths = './vessel_model_fold4.pth'

In [49]:
checkpoint = torch.load('vessel_model_fold4.pth')
threshold = checkpoint.get('threshold', 0.5)
print(f"Using threshold: {threshold}")

Using threshold: 0.4000000000000001


In [52]:
def predict_test_data_single(test_meta_path, data_path, model_path, threshold, device='cuda'):
    """단일 모델로 테스트 데이터 예측 - 60초 샘플링"""
    
    print("\n" + "="*70)
    print("PREDICTION ON TEST DATA (Single Model)")
    print("="*70)
    
    # 1. 테스트 메타데이터 로드
    test_meta = pd.read_csv(test_meta_path)
    print(f"Loaded test metadata with {len(test_meta)} vessels")
    
    # 2. 데이터 분석기 초기화 - 60초 샘플링
    class FastVesselAnalyzer(VesselBehaviorAnalyzer):
        def load_and_preprocess(self, mmsi):
            """60초 샘플링 전처리"""
            df = pd.read_csv(os.path.join(self.data_path, f"{mmsi}.csv"))
            
            df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
            df = df.sort_values('timestamp')
            df = df.drop_duplicates(subset=['timestamp'], keep='first')
            df.set_index('timestamp', inplace=True)
            
            time_diff = df.index.to_series().diff().dt.total_seconds()
            median_diff = time_diff.median()
            
            if median_diff > 120:
                df = df.resample('60S').mean()
                df = df.interpolate(method='linear', limit=3)
            
            df = df.dropna()
            return df
    
    analyzer = FastVesselAnalyzer(data_path, test_meta_path)
    seq_extractor = SequenceFeatureExtractor()
    
    # 3. 사용 가능한 파일 확인
    available_files = set()
    for f in os.listdir(data_path):
        if f.endswith('.csv') and f != 'meta_data.csv':
            available_files.add(f.replace('.csv', ''))
    
    # MMSI 추출
    if 'MMSI' in test_meta.columns:
        test_mmsi_list = test_meta['MMSI'].astype(str).tolist()
    else:
        test_mmsi_list = test_meta['mmsi'].astype(str).tolist()
    
    # 4. 테스트 데이터 전처리
    print("\nProcessing test vessels...")
    test_sequences = []
    test_mmsi_valid = []
    
    for mmsi in tqdm(test_mmsi_list):
        if mmsi not in available_files:
            continue
            
        try:
            vessel_data = analyzer.load_and_preprocess(mmsi)
            
            if len(vessel_data) < 25:
                continue
            
            sequence_features = seq_extractor.extract_sequence_features(vessel_data)
            test_sequences.append(sequence_features)
            test_mmsi_valid.append(mmsi)
            
        except Exception as e:
            continue
    
    print(f"\nSuccessfully processed {len(test_sequences)} vessels out of {len(test_mmsi_list)}")
    
    # 5. 예측
    if len(test_sequences) == 0:
        print("No valid vessels to predict!")
        results_df = pd.DataFrame({
            'MMSI': test_mmsi_list,
            'prediction': 0,
            'probability': 0.0
        })
        results_df.to_csv('test_predictions_final.csv', index=False)
        return results_df
    
    # 데이터셋 생성
    dummy_labels = np.zeros(len(test_sequences))
    test_dataset = VesselDataset(test_sequences, dummy_labels, augment=False)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
    
    # 6. 모델 로드 및 예측
    print(f"\nLoading model: {model_path}")
    model = SimpleLSTM(input_dim=10).to(device)
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    
    # 예측
    all_probs = []
    with torch.no_grad():
        for batch_x, _ in tqdm(test_loader, desc="Predicting"):
            batch_x = batch_x.to(device)
            logits = model(batch_x)
            probs = torch.sigmoid(logits).cpu().numpy()
            all_probs.extend(probs)
    
    # 7. 결과 정리
    all_probs = np.array(all_probs).flatten()
    predictions = (all_probs > threshold).astype(int)
    
    # 8. 전체 MMSI에 대한 결과 생성
    final_results = []
    for mmsi in test_mmsi_list:
        if mmsi in test_mmsi_valid:
            idx = test_mmsi_valid.index(mmsi)
            final_results.append({
                'MMSI': mmsi,
                'prediction': predictions[idx],
                'probability': all_probs[idx]
            })
        else:
            final_results.append({
                'MMSI': mmsi,
                'prediction': 0,
                'probability': 0.0
            })
    
    results_df = pd.DataFrame(final_results)
    
    # 예측 분포 확인
    print(f"\nPrediction distribution:")
    print(f"TRUE predictions: {(results_df['prediction'] == 1).sum()} ({(results_df['prediction'] == 1).sum()/len(results_df)*100:.1f}%)")
    print(f"FALSE predictions: {(results_df['prediction'] == 0).sum()}")
    
    # 결과 저장
    output_file = 'test_predictions_final.csv'
    results_df.to_csv(output_file, index=False)
    print(f"\nPredictions saved to {output_file}")
    
    return results_df

# 사용 방법
model_path = 'vessel_model_fold4.pth'  # 4번째 fold 모델
checkpoint = torch.load(model_path)
threshold = checkpoint.get('threshold', 0.5)

predictions = predict_test_data_single(
    test_meta_path='./meta_test_data.csv',
    data_path='./tracks',
    model_path=model_path,
    threshold=threshold,
    device='cuda'
)


PREDICTION ON TEST DATA (Single Model)
Loaded test metadata with 300 vessels

Processing test vessels...


100%|██████████| 300/300 [02:47<00:00,  1.79it/s]



Successfully processed 258 vessels out of 300

Loading model: vessel_model_fold4.pth


Predicting: 100%|██████████| 5/5 [00:00<00:00, 100.65it/s]


Prediction distribution:
TRUE predictions: 58 (19.3%)
FALSE predictions: 242

Predictions saved to test_predictions_final.csv





In [53]:
# 한 줄로 처리
pd.read_csv('test_predictions_final.csv').assign(prediction=lambda x: x['prediction'].map({0: 'FALSE', 1: 'TRUE'})).to_csv('test_predictions_final.csv', index=False)

In [37]:
import pandas as pd
import os

# 경로 설정
data_path = './tracks'  # 실제 경로로 변경
meta_path = './meta_data.csv'  # 실제 경로로 변경

# 메타데이터 로드
meta_data = pd.read_csv(meta_path)

print("\n=== Comprehensive TRUE vessel analysis ===")

# 1. 메타데이터의 TRUE 분포
true_vessels = meta_data[meta_data['label'] == 'TRUE']
false_vessels = meta_data[meta_data['label'] == 'FALSE']
print(f"Total vessels in metadata: {len(meta_data)}")
print(f"TRUE vessels: {len(true_vessels)} ({len(true_vessels)/len(meta_data)*100:.1f}%)")
print(f"FALSE vessels: {len(false_vessels)} ({len(false_vessels)/len(meta_data)*100:.1f}%)")

# 2. available files 확인
available_files = set()
for f in os.listdir(data_path):
    if f.endswith('.csv') and f != 'meta_data.csv':
        available_files.add(f.replace('.csv', ''))

print(f"\nTotal CSV files available: {len(available_files)}")

# 3. MMSI 추출 및 valid_mmsi 계산
if 'MMSI' in meta_data.columns:
    meta_mmsi = set(meta_data['MMSI'].astype(str))
else:
    meta_mmsi = set(meta_data['mmsi'].astype(str))

valid_mmsi = available_files & meta_mmsi
print(f"Valid MMSI (file + label): {len(valid_mmsi)}")

# 4. TRUE 선박 파일 존재 여부
true_mmsi_list = true_vessels['MMSI'].astype(str).tolist()
true_with_files = [mmsi for mmsi in true_mmsi_list if os.path.exists(os.path.join(data_path, f"{mmsi}.csv"))]
print(f"\nTRUE vessels with files: {len(true_with_files)}/{len(true_mmsi_list)}")

# 5. valid_mmsi에 포함된 TRUE 선박
true_in_valid = [mmsi for mmsi in true_mmsi_list if mmsi in valid_mmsi]
print(f"TRUE vessels in valid_mmsi: {len(true_in_valid)}")

# 6. 데이터 포인트 확인 (샘플)
if true_with_files:
    print("\nChecking data points for TRUE vessels (first 5):")
    for mmsi in true_with_files[:5]:
        try:
            df = pd.read_csv(os.path.join(data_path, f"{mmsi}.csv"))
            print(f"  MMSI {mmsi}: {len(df)} points")
            
            # 전처리 후 길이도 확인
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.sort_values('timestamp')
            df = df.drop_duplicates(subset=['timestamp'], keep='first')
            df = df.set_index('timestamp')
            df = df.resample('30s').mean()
            df = df.dropna()
            print(f"    After preprocessing: {len(df)} points")
            
        except Exception as e:
            print(f"  MMSI {mmsi}: Error - {e}")

# 7. valid_meta_data에서 TRUE 분포
valid_meta_data = meta_data[meta_data['MMSI'].astype(str).isin(valid_mmsi)]
true_in_valid_meta = (valid_meta_data['label'] == 'TRUE').sum()
print(f"\nIn valid_meta_data:")
print(f"Total: {len(valid_meta_data)}")
print(f"TRUE: {true_in_valid_meta} ({true_in_valid_meta/len(valid_meta_data)*100:.1f}%)")
print(f"FALSE: {len(valid_meta_data) - true_in_valid_meta}")


=== Comprehensive TRUE vessel analysis ===
Total vessels in metadata: 122
TRUE vessels: 0 (0.0%)
FALSE vessels: 0 (0.0%)

Total CSV files available: 1299
Valid MMSI (file + label): 105

TRUE vessels with files: 0/0
TRUE vessels in valid_mmsi: 0

In valid_meta_data:
Total: 105
TRUE: 0 (0.0%)
FALSE: 105


In [38]:
import pandas as pd
import os

# 메타데이터 로드
meta_path = './meta_data.csv'
meta_data = pd.read_csv(meta_path)

print("=== Label Column Analysis ===")

# 1. 컬럼명 확인
print(f"\nColumn names in meta_data:")
print(meta_data.columns.tolist())

# 2. label 컬럼의 고유값 확인
if 'label' in meta_data.columns:
    print(f"\nUnique values in 'label' column:")
    print(meta_data['label'].value_counts())
    print(f"\nData type of 'label': {meta_data['label'].dtype}")
    
    # 샘플 값 확인
    print(f"\nFirst 10 label values (with repr):")
    for i, val in enumerate(meta_data['label'].head(10)):
        print(f"  {i}: {repr(val)} (type: {type(val).__name__})")
        
elif 'result' in meta_data.columns:
    print(f"\nUnique values in 'result' column:")
    print(meta_data['result'].value_counts())
    print(f"\nData type of 'result': {meta_data['result'].dtype}")
    
    print(f"\nFirst 10 result values (with repr):")
    for i, val in enumerate(meta_data['result'].head(10)):
        print(f"  {i}: {repr(val)} (type: {type(val).__name__})")

# 3. 다른 가능한 레이블 컬럼 찾기
print(f"\nSearching for possible label columns:")
for col in meta_data.columns:
    unique_values = meta_data[col].unique()
    if len(unique_values) < 10:  # 카테고리형 컬럼만
        print(f"\n{col}: {unique_values}")

=== Label Column Analysis ===

Column names in meta_data:
['MMSI', 'label']

Unique values in 'label' column:
label
True     68
False    54
Name: count, dtype: int64

Data type of 'label': bool

First 10 label values (with repr):
  0: True (type: bool)
  1: True (type: bool)
  2: True (type: bool)
  3: True (type: bool)
  4: True (type: bool)
  5: True (type: bool)
  6: True (type: bool)
  7: True (type: bool)
  8: True (type: bool)
  9: True (type: bool)

Searching for possible label columns:

label: [ True False]


In [39]:
import pandas as pd
import os

# 경로 설정
data_path = './tracks'
meta_path = './meta_data.csv'

# 메타데이터 로드
meta_data = pd.read_csv(meta_path)

print("\n=== Corrected TRUE vessel analysis ===")

# 1. 메타데이터의 TRUE 분포 (boolean으로)
true_vessels = meta_data[meta_data['label'] == True]  # boolean True
false_vessels = meta_data[meta_data['label'] == False]  # boolean False
print(f"Total vessels in metadata: {len(meta_data)}")
print(f"TRUE vessels: {len(true_vessels)} ({len(true_vessels)/len(meta_data)*100:.1f}%)")
print(f"FALSE vessels: {len(false_vessels)} ({len(false_vessels)/len(meta_data)*100:.1f}%)")

# 2. available files 확인
available_files = set()
for f in os.listdir(data_path):
    if f.endswith('.csv') and f != 'meta_data.csv':
        available_files.add(f.replace('.csv', ''))

print(f"\nTotal CSV files available: {len(available_files)}")

# 3. valid_mmsi 계산
meta_mmsi = set(meta_data['MMSI'].astype(str))
valid_mmsi = available_files & meta_mmsi
print(f"Valid MMSI (file + label): {len(valid_mmsi)}")

# 4. TRUE 선박 파일 존재 여부
true_mmsi_list = true_vessels['MMSI'].astype(str).tolist()
true_with_files = [mmsi for mmsi in true_mmsi_list if mmsi in available_files]
print(f"\nTRUE vessels with files: {len(true_with_files)}/{len(true_mmsi_list)}")

# 5. valid_mmsi에 포함된 TRUE 선박
true_in_valid = [mmsi for mmsi in true_mmsi_list if mmsi in valid_mmsi]
print(f"TRUE vessels in valid_mmsi: {len(true_in_valid)}")

# 6. valid_meta_data에서 TRUE 분포
valid_meta_data = meta_data[meta_data['MMSI'].astype(str).isin(valid_mmsi)]
true_in_valid_meta = (valid_meta_data['label'] == True).sum()  # boolean True
print(f"\nIn valid_meta_data:")
print(f"Total: {len(valid_meta_data)}")
print(f"TRUE: {true_in_valid_meta} ({true_in_valid_meta/len(valid_meta_data)*100:.1f}%)")
print(f"FALSE: {len(valid_meta_data) - true_in_valid_meta}")


=== Corrected TRUE vessel analysis ===
Total vessels in metadata: 122
TRUE vessels: 68 (55.7%)
FALSE vessels: 54 (44.3%)

Total CSV files available: 1299
Valid MMSI (file + label): 105

TRUE vessels with files: 54/68
TRUE vessels in valid_mmsi: 54

In valid_meta_data:
Total: 105
TRUE: 54 (51.4%)
FALSE: 51
