In [1]:
# ===============================
# 0-1. 필수 라이브러리 설치
# ===============================
!pip install lightgbm catboost shap optuna geopandas shapely pyproj fiona pyarrow --quiet
!pip install haversine --quiet

# ===============================
# 0-2. 라이브러리 임포트
# ===============================
import os, re, glob, warnings, joblib, numpy as np, pandas as pd, geopandas as gpd
from datetime import datetime, timedelta
import optuna, lightgbm as lgb, xgboost as xgb
from catboost import CatBoostClassifier
from shapely.geometry import Polygon
from shapely import wkt
from haversine import haversine as hs
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_recall_curve
from sklearn.ensemble import IsolationForest

# 랜덤 시드 고정
SEED = 42
np.random.seed(SEED)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m112.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# ===============================
# 1. Google Drive 마운트
# ===============================
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# ===============================
# 2. 데이터 디렉토리 및 파일 분할
# ===============================
DATA_DIR = '/content/drive/MyDrive/25년 해군 AI 경진대회/dataset/의심선박 훈련용 데이터 셋'
ZONE_DIR = '/content/drive/MyDrive/25년 해군 AI 경진대회/dataset/구역 데이터'

if not os.path.exists(DATA_DIR):
    raise FileNotFoundError(f"데이터 디렉토리를 찾을 수 없습니다: {DATA_DIR}")
if not os.path.exists(ZONE_DIR):
    raise FileNotFoundError(f"구역 디렉토리를 찾을 수 없습니다: {ZONE_DIR}")

csv_files = sorted(glob.glob(os.path.join(DATA_DIR, '*.csv')))
ymd = lambda f: datetime.strptime(re.search(r'(\d{8})', os.path.basename(f)).group(1), '%Y%m%d')
file_dates = [(f, ymd(f)) for f in csv_files]
file_dates.sort(key=lambda x: x[1])

last_date   = file_dates[-1][1]
train_start = last_date - timedelta(days=92)
train_files = [f for f,d in file_dates if d >= train_start]
val_files   = [f for f,d in file_dates if d <  train_start]

print(f"Train files: {len(train_files)} [{os.path.basename(train_files[0])} ~ {os.path.basename(train_files[-1])}]")
print(f"Val   files: {len(val_files)} [{os.path.basename(val_files[0])} ~ {os.path.basename(val_files[-1])}]")

Train files: 93 [20240229.csv ~ 20240531.csv]
Val   files: 273 [20230601.csv ~ 20240228.csv]


In [8]:
# ===============================
# 3. 구역(Zone) 데이터 로드 (수정본)
# ===============================
import glob, warnings, os, re
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon

# 문자열 정규화 함수
def _normalize(s: str) -> str:
    return re.sub(r'[\s_\-]', '', s.lower())

# 컬럼 이름 매핑 헬퍼
def find_col(df: pd.DataFrame, patterns, required: bool=True):
    pats = [_normalize(p) for p in patterns]
    for col in df.columns:
        cn = _normalize(col)
        if any(cn == p or cn.startswith(p) or p in cn for p in pats):
            return col
    if required:
        raise KeyError(f"❌ 필수 컬럼 패턴 {patterns} 에 해당하는 열을 찾지 못했습니다.\n실제 열 목록: {df.columns.tolist()}")
    return None

def load_zones(zone_dir: str) -> gpd.GeoDataFrame:
    """
    zone_dir/*.csv 각 파일에서 LAT/LON 컬럼을 찾아
    하나의 Polygon 으로 만들고, zone_type에는 파일명(확장자 제외) 설정
    """
    zone_files = glob.glob(os.path.join(zone_dir, '*.csv'))
    gdfs = []
    for fp in zone_files:
        df = pd.read_csv(fp)
        try:
            lat_col = find_col(df, ['latitude','lat'])
            lon_col = find_col(df, ['longitude','lon'])
        except KeyError as e:
            warnings.warn(f"⚠️ {os.path.basename(fp)}: 좌표(LAT/LON) 컬럼을 찾지 못해 건너뜁니다. ({e})")
            continue

        coords = list(zip(df[lon_col], df[lat_col]))
        if len(coords) < 3:
            warnings.warn(f"⚠️ {os.path.basename(fp)}: 좌표가 3개 미만입니다. 건너뜁니다.")
            continue

        poly = Polygon(coords)
        zone_type = os.path.splitext(os.path.basename(fp))[0]
        gdf = gpd.GeoDataFrame(
            {'zone_type':[zone_type], 'geometry':[poly]},
            crs='EPSG:4326'
        )
        gdfs.append(gdf)
        print(f"✅ Loaded zone: {zone_type}")

    if not gdfs:
        raise RuntimeError("❌ 유효한 구역(zone) 데이터를 하나도 불러오지 못했습니다.")
    return pd.concat(gdfs, ignore_index=True)

# 호출
zones_gdf = load_zones(ZONE_DIR)
print("Zone types:", zones_gdf['zone_type'].tolist())

✅ Loaded zone: Area_Navy_train
✅ Loaded zone: Area_Near_Sea
✅ Loaded zone: Area_Restrict_zone
✅ Loaded zone: Area_Sea_cable_lv1_poly
✅ Loaded zone: Area_Special_restrict_zone_poly
✅ Loaded zone: Area_Target_Area
Zone types: ['Area_Navy_train', 'Area_Near_Sea', 'Area_Restrict_zone', 'Area_Sea_cable_lv1_poly', 'Area_Special_restrict_zone_poly', 'Area_Target_Area']


In [9]:
# 구역 상수
ANCHOR_ZONE      = 'Area_Near_Sea'
SPECIAL_RESTRICT = 'Area_Special_restrict_zone_poly'
NAVY_TRAIN       = 'Area_Navy_train'
CABLE_ZONE       = 'Area_Sea_cable_lv1_poly'
NO_ENTRY_ZONE    = 'Area_Restrict_zone'

In [10]:
# 2.5. train/validation 파일 리스트 재확인
import glob, re
from datetime import datetime, timedelta

csv_files = sorted(glob.glob(os.path.join(DATA_DIR, '*.csv')))
ymd = lambda f: datetime.strptime(re.search(r'(\d{8})', os.path.basename(f)).group(1), '%Y%m%d')
file_dates = [(f, ymd(f)) for f in csv_files]
file_dates.sort(key=lambda x: x[1])
last_date   = file_dates[-1][1]
train_start = last_date - timedelta(days=92)
train_files = [f for f, d in file_dates if d >= train_start]
val_files   = [f for f, d in file_dates if d <  train_start]
print(f"Train files: {len(train_files)}, Val files: {len(val_files)}")

Train files: 93, Val files: 273


In [11]:
# 4. 원본 통계 피처 집계 함수 (ver3 그대로)
def aggregate_ais(df):
    df = df.copy()
    df.rename(columns={
        find_col(df, ['basedatetime','timestamp','datetime','time']): 'Time',
        find_col(df, ['latitude','lat']): 'Latitude',
        find_col(df, ['longitude','lon']): 'Longitude',
        find_col(df, ['sog','speed']): 'SOG',
        find_col(df, ['cog','course']): 'COG',
        find_col(df, ['mmsi']): 'MMSI'
    }, inplace=True)
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
    df.dropna(subset=['Time','Latitude','Longitude'], inplace=True)
    df.sort_values(['MMSI','Time'], inplace=True)
    pts = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude, df.Latitude), crs='EPSG:4326')
    pts = gpd.sjoin(pts, zones_gdf[['zone_type','geometry']], how='left', predicate='within')
    for z in zones_gdf['zone_type'].unique():
        pts[f"{z}_flag"] = (pts['zone_type']==z).astype(int)
    pts['time_diff'] = pts.groupby('MMSI')['Time'].diff().dt.total_seconds().fillna(0)

    def _agg(g):
        dur = g['time_diff']
        out = {
            'lowspd_out_anchor_sec': ((g[f'{ANCHOR_ZONE}_flag']==1)&(g['SOG']<=5)).dot(dur),
            'nomove_out_anchor_sec': ((g[f'{ANCHOR_ZONE}_flag']==1)&(g['SOG']==0)).dot(dur),
            'ais_off_cnt_out_anchor': int(((g[f'{ANCHOR_ZONE}_flag']==1)&(g['SOG'].isna())).sum()),
            'restrict_train_sec': ((g[f'{NAVY_TRAIN}_flag']==1)).dot(dur),
            'sharp_turn_cnt': int((g['COG'].diff().abs()>30).sum()),
            'cable_lowspd_sec': ((g[f'{CABLE_ZONE}_flag']==1)&(g['SOG']<=5)).dot(dur),
            'no_entry_flag': int((g[f'{NO_ENTRY_ZONE}_flag']==1).any()),
        }
        # zone flag 그대로 가져오기
        for z in zones_gdf['zone_type'].unique():
            out[f'{z}_flag'] = g[f'{z}_flag'].max()
        return pd.Series(out)

    agg = pts.groupby('MMSI', group_keys=False).apply(_agg).reset_index()
    if 'result' in df.columns:
        agg = agg.merge(df[['MMSI','result']].drop_duplicates('MMSI'), on='MMSI', how='left')
    return agg

def rule_score(r):
    score = 0
    if r['lowspd_out_anchor_sec']>=3600 or r['nomove_out_anchor_sec']>=3600: score+=.20
    if r['ais_off_cnt_out_anchor']>=3:      score+=.15
    if r['restrict_train_sec']>=3600:       score+=.20
    if r['sharp_turn_cnt']>=1:              score+=.15
    if r['cable_lowspd_sec']>=600:          score+=.15
    if r['no_entry_flag']==1:               score+=.15
    return score*100

def build_dataset(file_list):
    raw = pd.concat([pd.read_csv(f) for f in file_list], ignore_index=True)
    df  = aggregate_ais(raw)
    df['rule_score'] = df.apply(rule_score, axis=1)
    df['target'] = df['result'].map({'TRUE':1,'True':1,True:1,1:1,'FALSE':0,'False':0,False:0,0:0})
    num_cols = df.select_dtypes(include='number').columns
    df[num_cols] = df[num_cols].replace([np.inf,-np.inf], np.nan).fillna(0)
    return df

In [12]:
# 5. Train/Val 데이터 집계
train_df = build_dataset(train_files)
val_df   = build_dataset(val_files)
print("Train:", train_df.shape, "Val:", val_df.shape)

  agg = pts.groupby('MMSI', group_keys=False).apply(_agg).reset_index()
  agg = pts.groupby('MMSI', group_keys=False).apply(_agg).reset_index()


Train: (30329, 17) Val: (44155, 17)


In [13]:
# 6. 추가 파생변수 함수 정의
def rename_raw(df):
    df = df.copy()
    df.rename(columns={
        find_col(df,['basedatetime','timestamp','datetime','time']): 'Time',
        find_col(df,['latitude','lat']): 'Latitude',
        find_col(df,['longitude','lon']): 'Longitude',
        find_col(df,['sog','speed']): 'SOG',
        find_col(df,['cog','course']): 'COG',
        find_col(df,['mmsi']): 'MMSI'
    }, inplace=True)
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
    return df[['MMSI','Time','Latitude','Longitude','SOG','COG']]

def add_datetime_features(df):
    return df.assign(
        hour=df['Time'].dt.hour,
        is_weekend=(df['Time'].dt.weekday>=5).astype(int)
    )[['MMSI','hour','is_weekend']]

def add_distance_features(df):
    df = df.sort_values(['MMSI','Time'])
    df[['lon_prev','lat_prev']] = df.groupby('MMSI')[['Longitude','Latitude']].shift()
    df['dist'] = df.apply(
        lambda x: hs((x['lat_prev'],x['lon_prev']), (x['Latitude'],x['Longitude'])) if pd.notnull(x['lon_prev']) else 0,
        axis=1
    )
    return df.groupby('MMSI')['dist'].agg(
        total_dist='sum', mean_dist='mean', max_dist='max', std_dist='std'
    ).reset_index()

def add_speed_change_features(df):
    df = df.sort_values(['MMSI','Time'])
    df['sog_prev'] = df.groupby('MMSI')['SOG'].shift()
    df['delta_sog'] = (df['SOG'] - df['sog_prev']).abs().fillna(0)
    return df.groupby('MMSI')['delta_sog'].agg(
        mean_delta_sog='mean', max_delta_sog='max', std_delta_sog='std'
    ).reset_index()

def add_stop_event_features(df, stop_thresh=0.5):
    df['is_stop'] = (df['SOG']<=stop_thresh).astype(int)
    df['time_diff'] = df.groupby('MMSI')['Time'].diff().dt.total_seconds().fillna(0)
    df['stop_sec']  = df['is_stop'] * df['time_diff']
    return df.groupby('MMSI')['stop_sec'].agg(total_stop_sec='sum').reset_index()

def add_anomaly_score_features(df):
    num_cols = df.select_dtypes(include='number').columns.drop(['MMSI','target'], errors='ignore')
    iso = IsolationForest(n_estimators=50, random_state=SEED)
    df['anomaly'] = iso.fit_predict(df[num_cols].fillna(0))
    return df.groupby('MMSI')['anomaly'].agg(mean_anomaly='mean', std_anomaly='std').reset_index()

In [14]:
# 7. Raw 데이터 파생 피처 집계
raw_train = pd.concat([pd.read_csv(f) for f in train_files], ignore_index=True)
raw_val   = pd.concat([pd.read_csv(f) for f in val_files], ignore_index=True)
raw_train = rename_raw(raw_train)
raw_val   = rename_raw(raw_val)

dt_tr   = add_datetime_features(raw_train)
dist_tr = add_distance_features(raw_train)
sog_tr  = add_speed_change_features(raw_train)
stop_tr = add_stop_event_features(raw_train)
ano_tr  = add_anomaly_score_features(train_df)

dt_val   = add_datetime_features(raw_val)
dist_val = add_distance_features(raw_val)
sog_val  = add_speed_change_features(raw_val)
stop_val = add_stop_event_features(raw_val)
ano_val  = add_anomaly_score_features(val_df)

In [15]:
# 8. 피처 통합 및 전처리
train = (train_df
         .merge(dt_tr,    on='MMSI', how='left')
         .merge(dist_tr,  on='MMSI', how='left')
         .merge(sog_tr,   on='MMSI', how='left')
         .merge(stop_tr,  on='MMSI', how='left')
         .merge(ano_tr,   on='MMSI', how='left'))
val   = (val_df
         .merge(dt_val,   on='MMSI', how='left')
         .merge(dist_val, on='MMSI', how='left')
         .merge(sog_val,  on='MMSI', how='left')
         .merge(stop_val, on='MMSI', how='left')
         .merge(ano_val,  on='MMSI', how='left'))

train.fillna(0, inplace=True)
val.fillna(0, inplace=True)

X_train = train.drop(columns=['MMSI','result','target'])
y_train = train['target']
X_val   = val.drop(columns=['MMSI','result','target'])
y_val   = val['target']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val   = scaler.transform(X_val)
vt     = VarianceThreshold(threshold=0.01)
X_train = vt.fit_transform(X_train)
X_val   = vt.transform(X_val)

In [19]:
# ===============================
# 9. Optuna를 이용한 LightGBM 하이퍼파라미터 튜닝 (최종 수정본)
# ===============================
import lightgbm as lgb
import optuna

# Pruner 설정 (중간 성능이 나쁘면 빨리 중단)
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5, interval_steps=1)

def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'seed': SEED,
        'num_leaves': trial.suggest_int('num_leaves', 16, 64),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
    }
    dtrain = lgb.Dataset(X_train, label=y_train)

    cv_res = lgb.cv(
        param,
        dtrain,
        num_boost_round=200,    # 부스팅 라운드 축소
        nfold=3,                # CV fold 축소
        stratified=True,
        seed=SEED,
        callbacks=[
            lgb.early_stopping(stopping_rounds=30),  # 조기종료
            lgb.log_evaluation(period=0)
        ]
    )

    # metric key 자동 탐색
    # 예: 'auc-mean' 혹은 'auc-mean' 등
    metric_key = [k for k in cv_res.keys() if 'mean' in k and 'auc' in k.lower()]
    if not metric_key:
        raise ValueError(f"No AUC metric in CV result: {cv_res.keys()}")
    return max(cv_res[metric_key[0]])

study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=pruner
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

best_params = study.best_params
print("Best LightGBM params:", best_params)

[I 2025-05-28 14:19:30,638] A new study created in memory with name: no-name-07f79df8-e73c-4f18-b325-c5d7542d7b06


  0%|          | 0/50 [00:00<?, ?it/s]

Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[200]	cv_agg's valid auc: 0.965789 + 0.000649236
[I 2025-05-28 14:19:47,540] Trial 0 finished with value: 0.965789098540395 and parameters: {'num_leaves': 34, 'learning_rate': 0.07969454818643935, 'feature_fraction': 0.892797576724562, 'bagging_fraction': 0.8394633936788146, 'bagging_freq': 1}. Best is trial 0 with value: 0.965789098540395.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[200]	cv_agg's valid auc: 0.838713 + 0.000296032
[I 2025-05-28 14:20:03,154] Trial 1 finished with value: 0.8387134530309693 and parameters: {'num_leaves': 23, 'learning_rate': 0.0013066739238053278, 'feature_fraction': 0.9464704583099741, 'bagging_fraction': 0.8404460046972835, 'bagging_freq': 4}. Best is trial 0 with value: 0.965789098540395.
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Be

In [24]:
from xgboost import XGBClassifier, callback as xgb_callback

# 10. 5-Fold CV 스태킹 앙상블 (수정본)
NFOLDS = 5
skf    = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
oof    = np.zeros(len(X_train))
val_oof= np.zeros(len(X_val))
base_models = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_train, y_train)):
    X_tr, y_tr = X_train[tr_idx], y_train.iloc[tr_idx]
    X_va, y_va = X_train[va_idx], y_train.iloc[va_idx]

    # LightGBM
    lgbm = lgb.LGBMClassifier(**best_params, n_estimators=1000)
    lgbm.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=0)
        ]
    )
    oof[va_idx] += lgbm.predict_proba(X_va)[:,1]
    val_oof    += lgbm.predict_proba(X_val)[:,1] / NFOLDS
    base_models.append(('lgbm_fold%d' % fold, lgbm))

    # XGBoost (unchanged)
    xgbm = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='auc',
    seed=SEED,
    n_estimators=200
)
# eval_set, early_stopping 제거
    xgbm.fit(X_tr, y_tr)

    oof[va_idx] += xgbm.predict_proba(X_va)[:,1]
    val_oof    += xgbm.predict_proba(X_val)[:,1] / NFOLDS
    base_models.append((f'xgbm_fold{fold}', xgbm))

    # CatBoost (unchanged)
    cb = CatBoostClassifier(verbose=0, random_seed=SEED)
    cb.fit(
        X_tr, y_tr,
        eval_set=(X_va, y_va),
        early_stopping_rounds=50
    )
    oof[va_idx] += cb.predict_proba(X_va)[:,1]
    val_oof    += cb.predict_proba(X_val)[:,1] / NFOLDS
    base_models.append(('catb_fold%d' % fold, cb))

oof    /= 3
val_oof/= 3

stack_X     = oof.reshape(-1,1)
stack_val_X = val_oof.reshape(-1,1)
meta = LogisticRegression()
meta.fit(stack_X, y_train)
final_val = meta.predict_proba(stack_val_X)[:,1]



Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.999599	valid_0's binary_logloss: 0.0396735


Parameters: { "use_label_encoder" } are not used.



Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.999525	valid_0's binary_logloss: 0.0392639


Parameters: { "use_label_encoder" } are not used.



Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.999494	valid_0's binary_logloss: 0.0401214


Parameters: { "use_label_encoder" } are not used.



Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.999467	valid_0's binary_logloss: 0.0397791


Parameters: { "use_label_encoder" } are not used.



Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.999537	valid_0's binary_logloss: 0.0395972


Parameters: { "use_label_encoder" } are not used.



In [25]:
# 11. Threshold 최적화 (F1 기준)
prec, rec, thr = precision_recall_curve(y_val, final_val)
f1s = 2*prec*rec/(prec+rec+1e-9)
best_idx = np.argmax(f1s)
best_th  = thr[best_idx]
print(f"Optimal threshold (F1): {best_th:.4f}")

Optimal threshold (F1): 0.0502


In [26]:
# 12. 파이프라인 저장
pipeline = {
    'scaler':     scaler,
    'var_thresh': vt,
    'base_models': base_models,
    'meta_model':  meta,
    'threshold':   best_th
}
joblib.dump(pipeline, '/content/drive/MyDrive/25년 해군 AI 경진대회/model_result/ver4_pipeline.pkl')
print("Saved pipeline to ver4_pipeline.pkl")

Saved pipeline to ver4_pipeline.pkl


In [33]:
import os
import joblib
import pandas as pd
import numpy as np

# 1. 경로 설정
DATA_DIR        = '/content/drive/MyDrive/25년 해군 AI 경진대회/dataset/의심선박 훈련용 데이터 셋'
PIPELINE_PATH   = '/content/drive/MyDrive/25년 해군 AI 경진대회/model_result/ver4_pipeline.pkl'
NEW_FILE        = '/content/20240701(1번문제).csv'
SUBMISSION_FILE = '/content/drive/MyDrive/25년 해군 AI 경진대회/model_result/submission_ver4model.csv'

# 2. 파이프라인 로드
pipeline     = joblib.load(PIPELINE_PATH)
scaler       = pipeline['scaler']
var_thresh   = pipeline['var_thresh']
base_models  = pipeline['base_models']
meta_model   = pipeline['meta_model']
threshold    = pipeline['threshold']

# 3. 새 AIS 데이터 읽기
raw_new = pd.read_csv(NEW_FILE)

# 4. 피처 엔지니어링 (이미 정의된 함수 재사용)
agg       = aggregate_ais(raw_new)
agg['rule_score'] = agg.apply(rule_score, axis=1)

raw_ren   = rename_raw(raw_new)
dt_feat   = add_datetime_features(raw_ren)
dist_feat = add_distance_features(raw_ren)
sog_feat  = add_speed_change_features(raw_ren)
stop_feat = add_stop_event_features(raw_ren)
ano_feat  = add_anomaly_score_features(agg)

# 5. 피처 병합 — 반드시 aggregate_ais()가 뽑아낸 원본 피처(플래그 포함)와 rule_score를 모두 포함
features = (
    agg
    .merge(dt_feat,    on='MMSI', how='left')
    .merge(dist_feat,  on='MMSI', how='left')
    .merge(sog_feat,   on='MMSI', how='left')
    .merge(stop_feat,  on='MMSI', how='left')
    .merge(ano_feat,   on='MMSI', how='left')
)
# agg already contains lowspd_out_anchor_sec, nomove_out_anchor_sec,
# ais_off_cnt_out_anchor, restrict_train_sec, sharp_turn_cnt,
# cable_lowspd_sec, no_entry_flag, plus all "{zone}_flag" cols, and rule_score.

# 6. 모델 입력 배열 준비 — DataFrame이 아니라 numpy array로 변환하여
#    scaler.transform 시 feature‐name checking을 우회합니다.
X = features.drop(columns=['MMSI']).fillna(0).to_numpy()

# 7. 스케일링 & 분산 임계치 제거
X_scaled = scaler.transform(X)
X_sel    = var_thresh.transform(X_scaled)

# 8. 예측 (Base 모델 앙상블 → Meta 모델 → Threshold)
pred_sum = np.zeros(X_sel.shape[0])
for _, model in base_models:
    pred_sum += model.predict_proba(X_sel)[:,1]
pred_avg     = pred_sum / len(base_models)
meta_input   = pred_avg.reshape(-1,1)
meta_pred    = meta_model.predict_proba(meta_input)[:,1]
results_bool = meta_pred >= threshold
results_str  = np.where(results_bool, 'TRUE', 'FALSE')

# 9. 제출용 CSV 생성
submission = pd.DataFrame({
    'MMSI':   features['MMSI'],
    'result': results_str
})
submission.to_csv(SUBMISSION_FILE, index=False)
print("✅ Submission saved to", SUBMISSION_FILE)

  agg = pts.groupby('MMSI', group_keys=False).apply(_agg).reset_index()


✅ Submission saved to /content/drive/MyDrive/25년 해군 AI 경진대회/model_result/submission_ver4model.csv
