In [1]:
# ===============================
# 0-1. 필수 라이브러리
# ===============================
!pip install lightgbm catboost shap optuna geopandas shapely pyproj fiona pyarrow --quiet

# ===============================
# 0-2. Google Drive 마운트
# ===============================
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os, re, glob, warnings, joblib, numpy as np, pandas as pd, geopandas as gpd
from datetime import datetime, timedelta

DATA_DIR = '/content/drive/MyDrive/25년 해군 AI 경진대회/dataset/의심선박 훈련용 데이터 셋'
ZONE_DIR = '/content/drive/MyDrive/25년 해군 AI 경진대회/dataset/구역 데이터'

csv_files = sorted(glob.glob(os.path.join(DATA_DIR, '*.csv')))
ymd = lambda f: datetime.strptime(re.search(r'(\d{8})', os.path.basename(f)).group(1), '%Y%m%d')
file_dates = [(f, ymd(f)) for f in csv_files]
file_dates.sort(key=lambda x: x[1])

last_date   = file_dates[-1][1]
train_start = last_date - timedelta(days=92)
train_files = [f for f,d in file_dates if d >= train_start]
val_files   = [f for f,d in file_dates if d <  train_start]

print(f'Train {len(train_files)}개 : {train_files[0][-12:-4]} ~ {train_files[-1][-12:-4]}')
print(f'Val   {len(val_files)}개 : {val_files[0][-12:-4]} ~ {val_files[-1][-12:-4]}')


Train 93개 : 20240229 ~ 20240531
Val   274개 : 20230601 ~ 20240228


In [4]:
from shapely.geometry import Polygon
from shapely import wkt

def detect_wkt_col(df):
    for c in df.columns:
        if df[c].dtype=='object':
            sample = df[c].dropna().astype(str).str.upper()
            if sample.str.startswith(('POLYGON','MULTIPOLYGON','LINESTRING')).any():
                return c
    return None

def load_zones(zone_dir):
    files = glob.glob(os.path.join(zone_dir, '*'))
    gdfs  = []

    for fp in files:
        ext = os.path.splitext(fp)[1].lower()

        # 1) 벡터 포맷
        if ext in ['.shp','.geojson','.json','.gpkg','.kml']:
            try:
                g = gpd.read_file(fp).to_crs('EPSG:4326')
                if 'zone_type' not in g.columns:
                    g['zone_type'] = re.sub(r'\.shp|\.geojson|\.json|area_','',
                                            os.path.basename(fp), flags=re.I)
                gdfs.append(g)
                print(f"Loaded vector : {os.path.basename(fp)} ({len(g)})")
            except Exception as e:
                print(f"[Warn] {fp} read fail: {e}")

        # 2) CSV
        elif ext=='.csv':
            df = pd.read_csv(fp)
            wcol = detect_wkt_col(df)

            if wcol:
                df['geometry'] = df[wcol].apply(wkt.loads)
                g = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')
            elif {'LON','LAT'} <= {c.upper() for c in df.columns}:
                lon = [c for c in df.columns if c.upper()=='LON'][0]
                lat = [c for c in df.columns if c.upper()=='LAT'][0]

                group_col = [c for c in df.columns if c not in [lon, lat]][0]
                df['_gid_'] = df[group_col]

                gb = df.groupby('_gid_')

                # pandas 2.1 이상은 include_groups=False 지원
                try:
                    polys = (gb.apply(
                                lambda g: Polygon(list(zip(g[lon], g[lat]))),
                                include_groups=False)
                               .reset_index(name='geometry'))
                except TypeError:          # ← pandas<2.1 fallback
                    polys = (gb.apply(
                                lambda g: Polygon(list(zip(g[lon], g[lat]))))
                               .reset_index(name='geometry'))

                gdf = gpd.GeoDataFrame(polys, geometry='geometry', crs='EPSG:4326')
                inferred = re.sub(r'\.csv$', '', os.path.basename(fp), flags=re.I)
                gdf['zone_type'] = inferred
                gdfs.append(gdf)
                print(f"Loaded CSV pts: {os.path.basename(fp)} ({len(gdf)}) [group: {group_col}]")

    if not gdfs:
        raise RuntimeError("❌ 구역 데이터를 찾을 수 없습니다.")

    return pd.concat(gdfs, ignore_index=True)

zones_gdf = load_zones(ZONE_DIR)
print('zone types:', zones_gdf['zone_type'].unique())

Loaded CSV pts: Area_Navy_train.csv (19) [group: OBJNUM]
Loaded CSV pts: Area_Near_Sea.csv (46) [group: id]
Loaded CSV pts: Area_Restrict_zone.csv (135) [group: OBJNUM]
Loaded CSV pts: Area_Sea_cable_lv1_poly.csv (23) [group: OBJNUM]
Loaded CSV pts: Area_Special_restrict_zone_poly.csv (2) [group: OBJNUM]
Loaded CSV pts: Area_Target_Area.csv (64) [group: OBJNUM]
zone types: ['Area_Navy_train' 'Area_Near_Sea' 'Area_Restrict_zone'
 'Area_Sea_cable_lv1_poly' 'Area_Special_restrict_zone_poly'
 'Area_Target_Area']


In [9]:
# ----- 구역 이름 매핑 -----
ANCHOR_ZONE        = 'Area_Near_Sea'
SPECIAL_RESTRICT   = 'Area_Special_restrict_zone_poly'
NAVY_TRAIN         = 'Area_Navy_train'
CABLE_ZONE         = 'Area_Sea_cable_lv1_poly'
NO_ENTRY_ZONE      = 'Area_Restrict_zone'

# ----- 유틸: 컬럼 자동 매핑 -----
import re

def _normalize(s:str) -> str:
    """공백·_·- 제거, 소문자화"""
    return re.sub(r'[\s_\-]', '', s.lower())

def find_col(df: pd.DataFrame, patterns, required: bool=True):
    """
    df 안에서 patterns(리스트)의 어느 것이라도 '포함'하는 첫 열 반환.
    - 대소문자 무시
    - 공백, 밑줄, 하이픈 모두 무시
    예) 'Base Date Time', 'base_datetime_utc', 'TimeStamp' → 매칭
    """
    pats_norm = [_normalize(p) for p in patterns]

    for col in df.columns:
        col_norm = _normalize(col)
        # 완전 일치 또는 startswith / substring 매칭
        if any(col_norm == p or col_norm.startswith(p) or p in col_norm for p in pats_norm):
            return col

    if required:
        raise KeyError(f"❌ 필수 컬럼 패턴 {patterns} 에 해당하는 열을 찾지 못했습니다.\n"
                       f" → 실제 열 목록: {df.columns.tolist()}")
    return None


In [10]:
def aggregate_ais(df):
    # 실제 컬럼명 탐색 & 리네임
    df = df.copy()
    df.rename(columns={
        find_col(df,['basedatetime','timestamp','datetime','time']):'Time',
        find_col(df,['latitude','lat']):'Latitude',
        find_col(df,['longitude','lon']):'Longitude',
        find_col(df,['sog','speed']):'SOG',
        find_col(df,['cog','course']):'COG',
        find_col(df,['mmsi']):'MMSI'
    }, inplace=True)

    df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
    df = df.dropna(subset=['Time','Latitude','Longitude'])
    df = df.sort_values(['MMSI','Time'])

    gdf = gpd.GeoDataFrame(df,
            geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']),
            crs='EPSG:4326')

    pts = gpd.sjoin(gdf, zones_gdf[['zone_type','geometry']],
                    how='left', predicate='within')

    for zt in zones_gdf['zone_type'].unique():
        pts[f'{zt}_flag'] = (pts['zone_type']==zt).astype(int)
    pts = pts.drop(columns=['index_right','zone_type','geometry'])

    def _agg(g):
        gaps = np.diff(g['Time'].values).astype('timedelta64[s]').astype(float)
        gaps = np.concatenate(([0], gaps)); dur = gaps.copy(); dur[dur==0] = 1

        dlat = np.radians(g['Latitude'].diff().fillna(0))
        dlon = np.radians(g['Longitude'].diff().fillna(0))
        a = np.sin(dlat/2)**2 + np.cos(np.radians(g['Latitude']))*\
            np.cos(np.radians(g['Latitude'].shift().fillna(g['Latitude']))) * np.sin(dlon/2)**2
        g['dist_km'] = 2*6371*np.arcsin(np.sqrt(a)).fillna(0)

        cog_diff = np.abs(np.mod(g['COG'].diff()+180,360)-180)
        sharp_turn = ((cog_diff>=90) & (g[f'{ANCHOR_ZONE}_flag']==0)).sum()

        res = {
            'num_pts':len(g),
            'traj_dur':(g['Time'].iloc[-1]-g['Time'].iloc[0]).total_seconds(),
            'mean_sog':g['SOG'].mean(),
            'std_sog': g['SOG'].std(),
            'lowspd_out_anchor_sec':
                ((g['SOG']<=5)&(g[f'{ANCHOR_ZONE}_flag']==0)).dot(dur),
            'nomove_out_anchor_sec':
                ((g['dist_km']<0.05)&(g[f'{ANCHOR_ZONE}_flag']==0)).dot(dur),
            'ais_off_cnt_out_anchor':
                ((gaps>1800)&(g[f'{ANCHOR_ZONE}_flag']==0)).sum(),
            'restrict_train_sec':
                ((g[[f'{SPECIAL_RESTRICT}_flag',f'{NAVY_TRAIN}_flag']].max(axis=1)==1)).dot(dur),
            'sharp_turn_cnt': sharp_turn,
            'cable_lowspd_sec':
                ((g[f'{CABLE_ZONE}_flag']==1)&(g['SOG']<=5)).dot(dur),
            'no_entry_flag': int((g[f'{NO_ENTRY_ZONE}_flag']==1).any())
        }
        for zt in zones_gdf['zone_type'].unique():
            res[f'{zt}_flag'] = g[f'{zt}_flag'].max()
        return pd.Series(res)

    agg = pts.groupby('MMSI', group_keys=False).apply(_agg).reset_index()

    if 'result' in df.columns:
        agg = agg.merge(df[['MMSI','result']].drop_duplicates('MMSI'),
                        on='MMSI', how='left')
    return agg


In [11]:
def rule_score(r):
    score=0
    if (r['lowspd_out_anchor_sec']>=3600) or (r['nomove_out_anchor_sec']>=3600): score+=.20
    if r['ais_off_cnt_out_anchor']>=3:      score+=.15
    if r['restrict_train_sec']>=3600:       score+=.20
    if r['sharp_turn_cnt']>=1:              score+=.15
    if r['cable_lowspd_sec']>=600:          score+=.15   # 10분
    if r['no_entry_flag']==1:               score+=.15
    return score*100

In [12]:
def build_dataset(file_list):
    raw = pd.concat([pd.read_csv(f) for f in file_list], ignore_index=True)
    df  = aggregate_ais(raw)
    df['rule_score'] = df.apply(rule_score, axis=1)
    df['target'] = df['result'].map({'TRUE':1,'True':1,True:1,1:1,
                                     'FALSE':0,'False':0,False:0,0:0})
    num = df.select_dtypes('number').columns
    df[num] = df[num].replace([np.inf,-np.inf], np.nan)
    return df

print('⏳ 집계 → Train'); train_df = build_dataset(train_files)
print('⏳ 집계 → Val');   val_df   = build_dataset(val_files)
print(train_df.shape, val_df.shape)


⏳ 집계 → Train


  agg = pts.groupby('MMSI', group_keys=False).apply(_agg).reset_index()


⏳ 집계 → Val


  agg = pts.groupby('MMSI', group_keys=False).apply(_agg).reset_index()


(30329, 21) (51883, 21)


In [14]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
import optuna, warnings
warnings.filterwarnings('ignore')

SEED, NTRIAL, NSPLIT = 42, 40, 5
feat_cols = [c for c in train_df.columns if c not in ('MMSI','result','target')]

scaler = RobustScaler()
X_tr = scaler.fit_transform(train_df[feat_cols]); y_tr = train_df['target'].astype(int)
X_va = scaler.transform(val_df[feat_cols]);       y_va = val_df['target'].astype(int)
skf = StratifiedKFold(n_splits=NSPLIT, shuffle=True, random_state=SEED)

In [16]:
def tune_lgb(trial):
    p = {'objective':'binary','metric':'binary_logloss','seed':SEED,
         'learning_rate':trial.suggest_float('lr',0.01,0.3,log=True),
         'num_leaves':trial.suggest_int('leaves',16,128,log=True),
         'max_depth':trial.suggest_int('depth',3,12),
         'feature_fraction':trial.suggest_float('feat',0.5,1),
         'bagging_fraction':trial.suggest_float('bag',0.5,1),
         'bagging_freq':trial.suggest_int('bagf',1,10),
         'lambda_l1':trial.suggest_float('l1',0,5),
         'lambda_l2':trial.suggest_float('l2',0,5)}

In [23]:
import optuna
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# 5-1. LightGBM 하이퍼파라미터 튜닝
def tune_lgb(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'seed': SEED,
        'learning_rate': trial.suggest_float('lr', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 128, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
    }
    cv_scores = []
    for tr_idx, va_idx in skf.split(X_tr, y_tr):
        model = LGBMClassifier(**params, n_estimators=400, random_state=SEED)
        model.fit(
            X_tr[tr_idx], y_tr[tr_idx],
            eval_set=[(X_tr[va_idx], y_tr[va_idx])],
            callbacks=[
                lgb.early_stopping(stopping_rounds=30),
                lgb.log_evaluation(period=0)   # 출력 끄기
            ]
        )
        preds = (model.predict_proba(X_tr[va_idx])[:,1] > 0.5).astype(int)
        cv_scores.append(accuracy_score(y_tr[va_idx], preds))
    return 1.0 - np.mean(cv_scores)

study_lgb = optuna.create_study(direction='minimize', study_name='optuna_lgb')
study_lgb.optimize(tune_lgb, n_trials=NTRIAL, show_progress_bar=True)
print("✅ LightGBM Best Accuracy:", 1 - study_lgb.best_value)
print("   Best Params:", study_lgb.best_params)

[I 2025-05-28 12:37:44,487] A new study created in memory with name: optuna_lgb


  0%|          | 0/40 [00:00<?, ?it/s]

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Did not meet early stopping. Best iteration is:
[399]	valid_0's binary_logloss: 0.265188
[LightGBM] [Info] Number of positive: 19169, number of negative: 5094
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001113 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1348
[LightGBM] [Info] Number of data points in the train set: 24263, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.790051 -> initscore=1.325231
[LightGBM] [Info] Start training from score 1.325231
Training until validation scores don't improve for 30 rounds
Did not meet early stopping. Best iteration is:
[400]	valid_0's binary_logloss: 0.271612
[LightGBM] [Info] Number of positive: 19169, number of negative: 5094
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 

In [25]:
# 5-2. XGBoost 하이퍼파라미터 튜닝
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

def tune_xgb(trial):
    params = {
        'objective':          'binary:logistic',
        'learning_rate':      trial.suggest_float('lr', 0.01, 0.3, log=True),
        'max_depth':          trial.suggest_int('max_depth', 3, 12),
        'subsample':          trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree':   trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma':              trial.suggest_float('gamma', 0.0, 5.0),
        'min_child_weight':   trial.suggest_int('min_child_weight', 1, 20),
        'reg_lambda':         trial.suggest_float('reg_lambda', 0.0, 5.0),
        'n_estimators':       400,
        'tree_method':        'hist',
        'verbosity':          0,
        'random_state':       SEED,
        'use_label_encoder':  False
    }
    cv_scores = []
    for tr_idx, va_idx in skf.split(X_tr, y_tr):
        model = XGBClassifier(**params)
        # eval_set / early stopping 제거
        model.fit(X_tr[tr_idx], y_tr[tr_idx])
        preds = (model.predict_proba(X_tr[va_idx])[:,1] > 0.5).astype(int)
        cv_scores.append(accuracy_score(y_tr[va_idx], preds))
    return 1.0 - np.mean(cv_scores)

# 다시 Optuna 실행
study_xgb = optuna.create_study(direction='minimize', study_name='optuna_xgb')
study_xgb.optimize(tune_xgb, n_trials=NTRIAL, show_progress_bar=True)
print("✅ XGBoost Best Accuracy:", 1 - study_xgb.best_value)
print("   Best Params:", study_xgb.best_params)

[I 2025-05-28 12:41:50,593] A new study created in memory with name: optuna_xgb


  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-05-28 12:41:55,470] Trial 0 finished with value: 0.10577353426678615 and parameters: {'lr': 0.11072529938654423, 'max_depth': 5, 'subsample': 0.6285424479906293, 'colsample_bytree': 0.6860410373579109, 'gamma': 3.4575403045966056, 'min_child_weight': 20, 'reg_lambda': 4.410726219048838}. Best is trial 0 with value: 0.10577353426678615.
[I 2025-05-28 12:41:56,910] Trial 1 finished with value: 0.106861516992663 and parameters: {'lr': 0.26208419737079136, 'max_depth': 3, 'subsample': 0.6737908489097673, 'colsample_bytree': 0.7826637262765175, 'gamma': 2.487951637140475, 'min_child_weight': 13, 'reg_lambda': 4.611320693939415}. Best is trial 0 with value: 0.10577353426678615.
[I 2025-05-28 12:41:58,678] Trial 2 finished with value: 0.10478431673139854 and parameters: {'lr': 0.033514559136530446, 'max_depth': 8, 'subsample': 0.7750593726011732, 'colsample_bytree': 0.7109762702079143, 'gamma': 4.249173997564651, 'min_child_weight': 3, 'reg_lambda': 0.6402265275209018}. Best is trial 

In [26]:
# 5-3. CatBoost 하이퍼파라미터 튜닝
def tune_cat(trial):
    params = {
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('lr', 0.01, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'iterations': 400,
        'loss_function': 'Logloss',
        'eval_metric': 'Accuracy',
        'random_seed': SEED,
        'verbose': False,
        'auto_class_weights': 'Balanced'
    }
    cv_scores = []
    for tr_idx, va_idx in skf.split(X_tr, y_tr):
        m = CatBoostClassifier(**params)
        m.fit(
            X_tr[tr_idx], y_tr[tr_idx],
            eval_set=(X_tr[va_idx], y_tr[va_idx]),
            early_stopping_rounds=30,
            verbose=False
        )
        preds = (m.predict_proba(X_tr[va_idx])[:,1] > 0.5).astype(int)
        cv_scores.append(accuracy_score(y_tr[va_idx], preds))
    return 1.0 - np.mean(cv_scores)

study_cat = optuna.create_study(direction='minimize', study_name='optuna_cat')
study_cat.optimize(tune_cat, n_trials=NTRIAL, show_progress_bar=True)
print("✅ CatBoost Best Accuracy:", 1 - study_cat.best_value)
print("   Best Params:", study_cat.best_params)

[I 2025-05-28 12:44:44,567] A new study created in memory with name: optuna_cat


  0%|          | 0/40 [00:00<?, ?it/s]

[I 2025-05-28 12:44:54,870] Trial 0 finished with value: 0.12905145352211134 and parameters: {'depth': 10, 'lr': 0.012195054353588327, 'l2_leaf_reg': 9.448878232916494, 'bagging_temperature': 0.04533926871145211, 'border_count': 206}. Best is trial 0 with value: 0.12905145352211134.
[I 2025-05-28 12:45:00,188] Trial 1 finished with value: 0.13053516022841904 and parameters: {'depth': 5, 'lr': 0.014841440758967723, 'l2_leaf_reg': 9.689892280331229, 'bagging_temperature': 0.38691226878470597, 'border_count': 89}. Best is trial 0 with value: 0.12905145352211134.
[I 2025-05-28 12:45:03,819] Trial 2 finished with value: 0.1253256715290909 and parameters: {'depth': 5, 'lr': 0.27879228306879417, 'l2_leaf_reg': 8.43270002727075, 'bagging_temperature': 0.18650456170909957, 'border_count': 171}. Best is trial 2 with value: 0.1253256715290909.
[I 2025-05-28 12:45:08,389] Trial 3 finished with value: 0.12829311212279104 and parameters: {'depth': 4, 'lr': 0.060172064007693815, 'l2_leaf_reg': 2.7391

In [28]:
# 5-4. 스태킹 앙상블 학습 & 평가 (CatBoost lr → learning_rate 매핑)
from copy import deepcopy

lgb_best = LGBMClassifier(**study_lgb.best_params,
                          n_estimators=400,
                          random_state=SEED)

xgb_best = XGBClassifier(**study_xgb.best_params,
                         n_estimators=400,
                         random_state=SEED,
                         tree_method='hist',
                         verbosity=0)

# CatBoost 파라미터 수정: 'lr' → 'learning_rate'
cat_params = deepcopy(study_cat.best_params)
cat_params['learning_rate'] = cat_params.pop('lr')
cat_best = CatBoostClassifier(**cat_params,
                              iterations=400,
                              random_seed=SEED,
                              verbose=False)

meta_learner = LGBMClassifier(objective='binary',
                              n_estimators=200,
                              learning_rate=0.05,
                              num_leaves=31,
                              random_state=SEED)

stack = StackingClassifier(
    estimators=[('lgb', lgb_best), ('xgb', xgb_best), ('cat', cat_best)],
    final_estimator=meta_learner,
    cv=skf,
    n_jobs=-1
)

print('🚀 스태킹 모델 학습 중…')
stack.fit(X_tr, y_tr)

val_preds = (stack.predict_proba(X_va)[:,1] > 0.5).astype(int)
val_acc = accuracy_score(y_va, val_preds)
print(f'✅ 검증 정확도: {val_acc:.4f}')

🚀 스태킹 모델 학습 중…
[LightGBM] [Info] Number of positive: 23961, number of negative: 6368
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 30329, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.790036 -> initscore=1.325142
[LightGBM] [Info] Start training from score 1.325142
✅ 검증 정확도: 0.8970


In [29]:
import os
import joblib
import pandas as pd
import numpy as np

# 1) 저장 디렉토리 설정
OUT_DIR = '/content/drive/MyDrive/25년 해군 AI 경진대회/model_result'
os.makedirs(OUT_DIR, exist_ok=True)

# 2) 모델·스케일러 저장
joblib.dump(stack,  f'{OUT_DIR}/stack_model.pkl')
joblib.dump(scaler, f'{OUT_DIR}/scaler.pkl')
print("✅ Models saved to", OUT_DIR)

# 3) 저장한 모델·스케일러 로드
stack_loaded  = joblib.load(f'{OUT_DIR}/stack_model.pkl')
scaler_loaded = joblib.load(f'{OUT_DIR}/scaler.pkl')

✅ Models saved to /content/drive/MyDrive/25년 해군 AI 경진대회/model_result


In [30]:
# 4) 새로운 AIS 데이터 읽기
NEW_FILE = '/content/20240701(1번문제).csv'
raw_new  = pd.read_csv(NEW_FILE)

# 5) 피처 엔지니어링 적용
feat_new = aggregate_ais(raw_new)                  # aggregate_ais 함수 필요
feat_new['rule_score'] = feat_new.apply(rule_score, axis=1)  # rule_score 함수 필요

# 6) 예측에 사용할 feature 리스트
feat_cols = [c for c in feat_new.columns if c not in ('MMSI','result','target')]

# 7) 스케일링 및 예측
X_new = scaler_loaded.transform(feat_new[feat_cols])
proba = stack_loaded.predict_proba(X_new)[:,1]
feat_new['result'] = np.where(proba > 0.5, 'TRUE', 'FALSE')

# 8) 제출용 CSV 생성 (MMSI, result만)
submission = feat_new[['MMSI','result']]
submission.to_csv(f'{OUT_DIR}/submission.csv', index=False)
print("✅ submission.csv saved to", OUT_DIR)

✅ submission.csv saved to /content/drive/MyDrive/25년 해군 AI 경진대회/model_result
