In [1]:
# ===============================
# 0-1. 필수 라이브러리 설치
# ===============================
!pip install lightgbm catboost shap optuna geopandas shapely pyproj fiona pyarrow --quiet

# ===============================
# 0-2. Google Drive 마운트
# ===============================
from google.colab import drive
drive.mount('/content/drive')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m110.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDrive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# ===============================
# 1. 모듈 임포트
# ===============================
import os, re, glob, warnings, joblib
import numpy as np
import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta
from shapely.geometry import Polygon
from shapely import wkt

from sklearn.preprocessing    import RobustScaler
from sklearn.model_selection   import StratifiedKFold
from sklearn.metrics           import accuracy_score
from sklearn.ensemble          import StackingClassifier, IsolationForest
from sklearn.feature_selection import VarianceThreshold

import optuna
from lightgbm import LGBMClassifier, early_stopping as lgb_es, log_evaluation as lgb_log
from xgboost  import XGBClassifier
from catboost import CatBoostClassifier

warnings.filterwarnings('ignore')
SEED    = 42
NTRIAL  = 100
NSPLIT  = 5

In [3]:
# ===============================
# 2. 상수 정의
# ===============================
DATA_DIR = '/content/drive/MyDrive/25년 해군 AI 경진대회/dataset/의심선박 훈련용 데이터 셋'
ZONE_DIR = '/content/drive/MyDrive/25년 해군 AI 경진대회/dataset/구역 데이터'
OUT_DIR  = '/content/drive/MyDrive/25년 해군 AI 경진대회/model_result'
os.makedirs(OUT_DIR, exist_ok=True)

In [8]:
# ===============================
# 3. 구역 데이터 로드 함수
# ===============================
def detect_wkt_col(df):
    for c in df.columns:
        if df[c].dtype == 'object':
            sample = df[c].dropna().astype(str).str.upper()
            if sample.str.startswith(('POLYGON','MULTIPOLYGON','LINESTRING')).any():
                return c
    return None

def load_zones(zone_dir):
    files = glob.glob(os.path.join(zone_dir, '*'))
    gdfs  = []
    for fp in files:
        ext = os.path.splitext(fp)[1].lower()
        try:
            if ext in ['.shp','.geojson','.json','.gpkg','.kml']:
                g = gpd.read_file(fp).to_crs('EPSG:4326')
            elif ext=='.csv':
                df = pd.read_csv(fp)
                wcol = detect_wkt_col(df)
                if wcol:
                    df['geometry'] = df[wcol].apply(wkt.loads)
                    g = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')
                else:
                    lon = [c for c in df.columns if c.lower()=='lon'][0]
                    lat = [c for c in df.columns if c.lower()=='lat'][0]
                    gid = [c for c in df.columns if c not in [lon,lat]][0]
                    df['_gid_'] = df[gid]
                    gb = df.groupby('_gid_')
                    polys = (gb.apply(lambda g: Polygon(list(zip(g[lon], g[lat]))))
                            .reset_index(name='geometry'))
                    g = gpd.GeoDataFrame(polys, geometry='geometry', crs='EPSG:4326')
            else:
                continue
            g['zone_type'] = re.sub(r'\.\w+|area_','',os.path.basename(fp),flags=re.I)
            gdfs.append(g)
        except Exception as e:
            print(f"[Warn] {fp} : {e}")
    if not gdfs:
        raise RuntimeError("구역 데이터를 찾을 수 없습니다.")
    zones = pd.concat(gdfs, ignore_index=True)
    print('Zone types:', zones['zone_type'].unique())
    return zones

zones_gdf = load_zones(ZONE_DIR)

# 영역별 플래그 네이밍
ANCHOR_ZONE        = 'Near_Sea'
SPECIAL_ZONE       = 'Special_restrict_zone_poly'
NAVY_TRAIN_ZONE    = 'Navy_train'
CABLE_ZONE         = 'Sea_cable_lv1_poly'
NO_ENTRY_ZONE      = 'Restrict_zone'

Zone types: ['Navy_train' 'Near_Sea' 'Restrict_zone' 'Sea_cable_lv1_poly'
 'Special_restrict_zone_poly' 'Target_Area']


In [9]:
# ===============================
# 4. AIS 집계·피처 엔지니어링
# ===============================
def find_col(df, patterns, required=True):
    def norm(s): return re.sub(r'[\s_\-]','',s.lower())
    pats = [norm(p) for p in patterns]
    for c in df.columns:
        cn = norm(c)
        if any(cn==p or cn.startswith(p) or p in cn for p in pats):
            return c
    if required: raise KeyError(f"필수 컬럼 {patterns} 를 찾을 수 없습니다: {df.columns.tolist()}")
    return None

def aggregate_ais(df):
    df = df.copy()
    df.rename(columns={
        find_col(df,['basedatetime','timestamp','datetime','time']):'Time',
        find_col(df,['latitude','lat']):'Latitude',
        find_col(df,['longitude','lon']):'Longitude',
        find_col(df,['sog','speed']):'SOG',
        find_col(df,['cog','course']):'COG',
        find_col(df,['mmsi']):'MMSI'
    }, inplace=True)
    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
    df.dropna(subset=['Time','Latitude','Longitude'], inplace=True)
    df.sort_values(['MMSI','Time'], inplace=True)
    gdf = gpd.GeoDataFrame(df,
            geometry=gpd.points_from_xy(df.Longitude, df.Latitude),
            crs='EPSG:4326')
    pts = gpd.sjoin(gdf, zones_gdf[['zone_type','geometry']],
                   how='left', predicate='within')
    # zone flag 추가
    for zt in zones_gdf['zone_type'].unique():
        pts[f'{zt}_flag'] = (pts['zone_type']==zt).astype(int)
    pts.drop(columns=['index_right','zone_type','geometry'], inplace=True)

    def _agg(g):
        times = g['Time'].values
        gaps = np.diff(times).astype('timedelta64[s]').astype(float)
        gaps = np.insert(gaps, 0, 1.0)  # 첫 포인트는 1초
        dur  = gaps.copy()
        dist = []
        # 거리 계산
        lat = np.radians(g['Latitude']); lon = np.radians(g['Longitude'])
        for i in range(1,len(g)):
            dlat = lat.iloc[i]-lat.iloc[i-1]
            dlon = lon.iloc[i]-lon.iloc[i-1]
            a = np.sin(dlat/2)**2 + np.cos(lat.iloc[i])*np.cos(lat.iloc[i-1])*np.sin(dlon/2)**2
            dist.append(2*6371*np.arcsin(np.sqrt(a)))
        dist = [0.0] + dist

        cog_diff = np.abs(np.mod(g['COG'].diff().fillna(0)+180,360)-180)
        acc = g['SOG'].diff().fillna(0) / gaps

        res = {
            'num_pts':len(g),
            'traj_dur': sum(gaps),
            'mean_sog': g['SOG'].mean(),
            'std_sog':  g['SOG'].std(),
            'max_sog':  g['SOG'].max(),
            'min_sog':  g['SOG'].min(),
            'median_sog': g['SOG'].median(),
            'sog_iqr': g['SOG'].quantile(0.75)-g['SOG'].quantile(0.25),
            'mean_cog_diff': cog_diff.mean(),
            'std_cog_diff':  cog_diff.std(),
            'total_dist': sum(dist),
            'mean_acc':   acc.mean(),
            'max_acc':    acc.max(),
            'lowspd_out_anchor_sec':
                ((g['SOG']<=5)&(g[f'{ANCHOR_ZONE}_flag']==0)).dot(dur),
            'nomove_out_anchor_sec':
                ((np.array(dist)<0.05)&(g[f'{ANCHOR_ZONE}_flag']==0)).dot(dur),
            'ais_off_cnt_out_anchor':
                ((gaps>1800)&(g[f'{ANCHOR_ZONE}_flag']==0)).sum(),
            'restrict_train_sec':
                ((g[[f'{SPECIAL_ZONE}_flag',f'{NAVY_TRAIN_ZONE}_flag']].max(axis=1)==1)).dot(dur),
            'sharp_turn_cnt': ((cog_diff>=90)&(g[f'{ANCHOR_ZONE}_flag']==0)).sum(),
            'cable_lowspd_sec':
                ((g[f'{CABLE_ZONE}_flag']==1)&(g['SOG']<=5)).dot(dur),
            'no_entry_flag': int((g[f'{NO_ENTRY_ZONE}_flag']==1).any())
        }
        # zone presence 플래그 추가
        for zt in zones_gdf['zone_type'].unique():
            res[f'{zt}_flag'] = g[f'{zt}_flag'].max()
        return pd.Series(res)

    agg = pts.groupby('MMSI', group_keys=False).apply(_agg).reset_index()
    # 레이블 병합
    if 'result' in df.columns:
        agg = agg.merge(df[['MMSI','result']].drop_duplicates('MMSI'),
                        on='MMSI', how='left')
    return agg

# 손기능 기반 rule_score
def rule_score(r):
    score=0
    if (r['lowspd_out_anchor_sec']>=3600)|(r['nomove_out_anchor_sec']>=3600): score+=.20
    if r['ais_off_cnt_out_anchor']>=3:      score+=.15
    if r['restrict_train_sec']>=3600:       score+=.20
    if r['sharp_turn_cnt']>=1:              score+=.15
    if r['cable_lowspd_sec']>=600:          score+=.15
    if r['no_entry_flag']==1:               score+=.15
    return score*100

# 데이터셋 빌드
def build_dataset(file_list):
    raw = pd.concat([pd.read_csv(f) for f in file_list], ignore_index=True)
    df  = aggregate_ais(raw)
    df['rule_score'] = df.apply(rule_score, axis=1)
    df['target']     = df['result'].map({'TRUE':1,'True':1,True:1,1:1,
                                        'FALSE':0,'False':0,False:0,0:0})
    # 무한대 제거
    num = df.select_dtypes('number').columns
    df[num] = df[num].replace([np.inf,-np.inf], np.nan)
    return df

In [10]:
# ===============================
# 5. 학습/검증용 데이터 준비
# ===============================
# 날짜별 파일 분할(마지막 92일 학습, 이전 검증)
csv_files   = sorted(glob.glob(os.path.join(DATA_DIR,'*.csv')))
get_date    = lambda f: datetime.strptime(re.search(r'(\d{8})',os.path.basename(f)).group(1),'%Y%m%d')
file_dates  = sorted([(f,get_date(f)) for f in csv_files], key=lambda x:x[1])
last_date   = file_dates[-1][1]
train_files = [f for f,d in file_dates if d>= last_date-timedelta(days=92)]
val_files   = [f for f,d in file_dates if d<  last_date-timedelta(days=92)]

print('Train:', len(train_files), train_files[0][-12:-4],'~',train_files[-1][-12:-4])
print('Val  :', len(val_files),   val_files[0][-12:-4],'~',val_files[-1][-12:-4])

print('⏳ Build Train'); train_df = build_dataset(train_files)
print('⏳ Build Val  '); val_df   = build_dataset(val_files)
print('Shapes:', train_df.shape, val_df.shape)

# 피처·레이블 분리
feat_cols = [c for c in train_df.columns if c not in ('MMSI','result','target')]
X_tr = train_df[feat_cols].values; y_tr = train_df['target'].astype(int).values
X_va = val_df[feat_cols].values; y_va = val_df['target'].astype(int).values

Train: 93 20240229 ~ 20240531
Val  : 273 20230601 ~ 20240228
⏳ Build Train
⏳ Build Val  
Shapes: (30329, 30) (44155, 30)


In [11]:
# ===============================
# 6. 이상치 제거 & 특징 선택
# ===============================
# 6-1) 이상치 제거
iso = IsolationForest(contamination=0.01, random_state=SEED)
mask = iso.fit_predict(X_tr) == 1
X_tr, y_tr = X_tr[mask], y_tr[mask]

# 6-2) 스케일링 → 분산영 특성 제거
scaler = RobustScaler()
X_tr = scaler.fit_transform(X_tr)
X_va = scaler.transform(X_va)

vt = VarianceThreshold(threshold=0.0)
X_tr = vt.fit_transform(X_tr)
X_va = vt.transform(X_va)

# 선택된 피처 이름 리스트
selected_feats = [feat_cols[i] for i, keep in enumerate(vt.get_support()) if keep]
print(f"Selected {len(selected_feats)}/{len(feat_cols)} features")

Selected 23/27 features


In [13]:
# ===============================
# 7. 하이퍼파라미터 튜닝
# ===============================
skf = StratifiedKFold(n_splits=NSPLIT, shuffle=True, random_state=SEED)
pos_weight = (y_tr==0).sum() / (y_tr==1).sum()

def tune_lgb(trial):
    params = {
        'objective':'binary','metric':'binary_logloss','seed':SEED,'is_unbalance':True,
        'learning_rate':trial.suggest_float('lr',0.01,0.3,log=True),
        'num_leaves':trial.suggest_int('num_leaves',16,128,log=True),
        'max_depth':trial.suggest_int('max_depth',3,12),
        'feature_fraction':trial.suggest_float('feature_fraction',0.5,1.0),
        'bagging_fraction':trial.suggest_float('bagging_fraction',0.5,1.0),
        'bagging_freq':trial.suggest_int('bagging_freq',1,10),
        'lambda_l1':trial.suggest_float('lambda_l1',0.0,5.0),
        'lambda_l2':trial.suggest_float('lambda_l2',0.0,5.0),
    }
    cv = []
    for tr_i, va_i in skf.split(X_tr,y_tr):
        m = LGBMClassifier(**params, n_estimators=400, random_state=SEED)
        m.fit(
            X_tr[tr_i], y_tr[tr_i],
            eval_set=[(X_tr[va_i],y_tr[va_i])],
            callbacks=[lgb_es(stopping_rounds=30), lgb_log(period=0)]
        )
        preds = (m.predict_proba(X_tr[va_i])[:,1]>0.5).astype(int)
        cv.append( accuracy_score(y_tr[va_i], preds) )
    return 1 - np.mean(cv)

In [18]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

def tune_xgb(trial):
    params = {
        'objective':         'binary:logistic',
        'learning_rate':     trial.suggest_float('lr', 0.01, 0.3, log=True),
        'max_depth':         trial.suggest_int('max_depth', 3, 12),
        'subsample':         trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma':             trial.suggest_float('gamma', 0.0, 5.0),
        'min_child_weight':  trial.suggest_int('min_child_weight', 1, 20),
        'reg_lambda':        trial.suggest_float('reg_lambda', 0.0, 5.0),
        'scale_pos_weight':  pos_weight,
        'n_estimators':      400,
        'tree_method':       'hist',
        'verbosity':         0,
        'random_state':      SEED
    }
    cv_scores = []
    for tr_idx, va_idx in skf.split(X_tr, y_tr):
        m = XGBClassifier(**params)
        # 단순 학습만
        m.fit(X_tr[tr_idx], y_tr[tr_idx])
        preds = (m.predict_proba(X_tr[va_idx])[:, 1] > 0.5).astype(int)
        cv_scores.append( accuracy_score(y_tr[va_idx], preds) )
    return 1.0 - np.mean(cv_scores)

In [15]:
def tune_cat(trial):
    params = {
        'depth':trial.suggest_int('depth',4,10),
        'learning_rate':trial.suggest_float('lr',0.01,0.3,log=True),
        'l2_leaf_reg':trial.suggest_float('l2_leaf_reg',1.0,10.0,log=True),
        'bagging_temperature':trial.suggest_float('bagging_temperature',0.0,1.0),
        'border_count':trial.suggest_int('border_count',32,255),
        'iterations':400,'loss_function':'Logloss','eval_metric':'Accuracy',
        'random_seed':SEED,'verbose':False,'auto_class_weights':'Balanced'
    }
    cv=[]
    for tr_i, va_i in skf.split(X_tr,y_tr):
        m = CatBoostClassifier(**params)
        m.fit(
            X_tr[tr_i], y_tr[tr_i],
            eval_set=(X_tr[va_i], y_tr[va_i]),
            early_stopping_rounds=30, verbose=False
        )
        preds = (m.predict_proba(X_tr[va_i])[:,1]>0.5).astype(int)
        cv.append( accuracy_score(y_tr[va_i], preds) )
    return 1 - np.mean(cv)

# Optuna 실행
study_lgb = optuna.create_study(direction='minimize'); study_lgb.optimize(tune_lgb, n_trials=NTRIAL)
study_xgb = optuna.create_study(direction='minimize'); study_xgb.optimize(tune_xgb, n_trials=NTRIAL)
study_cat = optuna.create_study(direction='minimize'); study_cat.optimize(tune_cat, n_trials=NTRIAL)

print("LGB Best Acc:", 1-study_lgb.best_value, study_lgb.best_params)
print("XGB Best Acc:", 1-study_xgb.best_value, study_xgb.best_params)
print("CAT Best Acc:", 1-study_cat.best_value, study_cat.best_params)

Output hidden; open in https://colab.research.google.com to view.

In [19]:
# Optuna 실행
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(tune_xgb, n_trials=NTRIAL)

[I 2025-05-28 17:05:58,325] A new study created in memory with name: no-name-dc98e9f5-328d-4110-8fc6-c20da26ead34
[I 2025-05-28 17:06:02,571] Trial 0 finished with value: 0.10471273938384673 and parameters: {'lr': 0.030222960491204386, 'max_depth': 11, 'subsample': 0.7266186373756075, 'colsample_bytree': 0.5425325224265484, 'gamma': 1.0787473798847458, 'min_child_weight': 1, 'reg_lambda': 0.3517433027609246}. Best is trial 0 with value: 0.10471273938384673.
[I 2025-05-28 17:06:04,101] Trial 1 finished with value: 0.11883430474604495 and parameters: {'lr': 0.05018759393359655, 'max_depth': 4, 'subsample': 0.5415293268417073, 'colsample_bytree': 0.6820287428489263, 'gamma': 3.790631501647777, 'min_child_weight': 8, 'reg_lambda': 2.480135171407362}. Best is trial 0 with value: 0.10471273938384673.
[I 2025-05-28 17:06:06,589] Trial 2 finished with value: 0.11856786011656961 and parameters: {'lr': 0.0170029601308787, 'max_depth': 6, 'subsample': 0.8588119408522628, 'colsample_bytree': 0.964

In [20]:
study_cat = optuna.create_study(direction='minimize'); study_cat.optimize(tune_cat, n_trials=NTRIAL)

[I 2025-05-28 17:15:26,156] A new study created in memory with name: no-name-70a925eb-e3b8-47c2-a757-2fe12d6a1242
[I 2025-05-28 17:15:30,420] Trial 0 finished with value: 0.11413821815154035 and parameters: {'depth': 5, 'lr': 0.2502154092218885, 'l2_leaf_reg': 6.350636474947447, 'bagging_temperature': 0.6236894272229467, 'border_count': 179}. Best is trial 0 with value: 0.11413821815154035.
[I 2025-05-28 17:15:48,303] Trial 1 finished with value: 0.11130724396336389 and parameters: {'depth': 10, 'lr': 0.020215029778073004, 'l2_leaf_reg': 8.326626190301194, 'bagging_temperature': 0.502655779427113, 'border_count': 100}. Best is trial 1 with value: 0.11130724396336389.
[I 2025-05-28 17:15:50,607] Trial 2 finished with value: 0.12819317235636984 and parameters: {'depth': 4, 'lr': 0.015987275458910405, 'l2_leaf_reg': 1.0714348009144852, 'bagging_temperature': 0.05450997182017803, 'border_count': 227}. Best is trial 1 with value: 0.11130724396336389.
[I 2025-05-28 17:15:55,007] Trial 3 fini

In [21]:
print("LGB Best Acc:", 1-study_lgb.best_value, study_lgb.best_params)
print("XGB Best Acc:", 1-study_xgb.best_value, study_xgb.best_params)
print("CAT Best Acc:", 1-study_cat.best_value, study_cat.best_params)

LGB Best Acc: 0.8984845961698584 {'lr': 0.045396498700576694, 'num_leaves': 125, 'max_depth': 12, 'feature_fraction': 0.5184870762616136, 'bagging_fraction': 0.7293550235168924, 'bagging_freq': 1, 'lambda_l1': 0.43956960476577256, 'lambda_l2': 0.9219797780232823}
XGB Best Acc: 0.9003164029975022 {'lr': 0.032532111147427586, 'max_depth': 11, 'subsample': 0.8970969017618952, 'colsample_bytree': 0.5179945005269921, 'gamma': 0.016508932515318583, 'min_child_weight': 1, 'reg_lambda': 0.2715947195109578}
CAT Best Acc: 0.8945545378850956 {'depth': 10, 'lr': 0.0684726764578326, 'l2_leaf_reg': 2.18456293400133, 'bagging_temperature': 0.9837122454050633, 'border_count': 150}


In [22]:
# ===============================
# 8. 스태킹 앙상블 학습
# ===============================
lgb_best = LGBMClassifier(**study_lgb.best_params, n_estimators=400, random_state=SEED)
xgb_best = XGBClassifier(**study_xgb.best_params, n_estimators=400, random_state=SEED, tree_method='hist', verbosity=0)
cat_p    = study_cat.best_params.copy()
cat_p['learning_rate'] = cat_p.pop('lr')
cat_best = CatBoostClassifier(**cat_p, iterations=400, random_seed=SEED, verbose=False)

meta = LGBMClassifier(objective='binary', n_estimators=200, learning_rate=0.05,
                      num_leaves=31, random_state=SEED)

stack = StackingClassifier(
    estimators=[('lgb',lgb_best),('xgb',xgb_best),('cat',cat_best)],
    final_estimator=meta, cv=skf, n_jobs=-1
)
stack.fit(X_tr, y_tr)

[LightGBM] [Info] Number of positive: 23704, number of negative: 6321
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 764
[LightGBM] [Info] Number of data points in the train set: 30025, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.789475 -> initscore=1.321766
[LightGBM] [Info] Start training from score 1.321766


In [23]:
# ===============================
# 9. 임계값 최적화
# ===============================
proba_va = stack.predict_proba(X_va)[:,1]
ths = np.linspace(0.1,0.9,81)
accs = [accuracy_score(y_va, (proba_va>t).astype(int)) for t in ths]
best_idx = np.argmax(accs)
best_thr = ths[best_idx]
print(f"Best val Acc {accs[best_idx]:.4f} at threshold {best_thr}")

Best val Acc 0.9039 at threshold 0.64


In [24]:
# ===============================
# 10. 모델·전처리기 저장
# ===============================
joblib.dump(iso,            f'{OUT_DIR}/iso_model.pkl')
joblib.dump(vt,             f'{OUT_DIR}/vt_model.pkl')
joblib.dump(scaler,         f'{OUT_DIR}/scaler.pkl')
joblib.dump(stack,          f'{OUT_DIR}/stack_model.pkl')
joblib.dump(selected_feats, f'{OUT_DIR}/selected_feats.pkl')
joblib.dump(best_thr,       f'{OUT_DIR}/best_threshold.pkl')
print("✅ All models saved to", OUT_DIR)

✅ All models saved to /content/drive/MyDrive/25년 해군 AI 경진대회/model_result


In [25]:
# ===============================
# 11. 새 데이터 예측 및 제출 파일 생성
# ===============================
# 11-1) 전처리기·모델 로드
iso_loaded    = joblib.load(f'{OUT_DIR}/iso_model.pkl')       # (학습때만 사용)
vt_loaded     = joblib.load(f'{OUT_DIR}/vt_model.pkl')
scaler_loaded = joblib.load(f'{OUT_DIR}/scaler.pkl')
stack_loaded  = joblib.load(f'{OUT_DIR}/stack_model.pkl')
feats_loaded  = joblib.load(f'{OUT_DIR}/selected_feats.pkl')
thr_loaded    = joblib.load(f'{OUT_DIR}/best_threshold.pkl')

In [26]:
# 11-2) 신규 AIS 데이터
NEW_FILE = '/content/20240701(1번문제).csv'
raw_new  = pd.read_csv(NEW_FILE)

# 11-3) 피처 엔지니어링
feat_new = aggregate_ais(raw_new)
feat_new['rule_score'] = feat_new.apply(rule_score, axis=1)

# 11-4) 예측용 행렬 생성
Xn = feat_new[feat_cols].values
Xn = scaler_loaded.transform(Xn)
Xn = vt_loaded.transform(Xn)

# 11-5) 예측 & 임계값 적용
proba = stack_loaded.predict_proba(Xn)[:,1]
feat_new['result'] = np.where(proba>thr_loaded, 'TRUE','FALSE')

# 11-6) 제출 파일 저장
submission = feat_new[['MMSI','result']]
submission.to_csv(f'{OUT_DIR}/submission_ver5.csv', index=False)
print("✅ submission.csv saved to", OUT_DIR)

✅ submission.csv saved to /content/drive/MyDrive/25년 해군 AI 경진대회/model_result
