In [1]:
# =======================================================
# =======================================================
!pip install --quiet xgboost==2.0.3 scikit-learn==1.4.2 pandas==2.2.2 \
                 geopandas shapely pyproj fiona

from google.colab import drive
drive.mount('/content/drive')

import os, datetime, json, joblib, pandas as pd, geopandas as gpd
from shapely.geometry import Polygon, box
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    roc_auc_score, confusion_matrix
)
from xgboost import XGBClassifier

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m116.5 MB/s[0m eta [36m0:00:00[0m
[?25hDrive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# -------------------------------------------------------
# -------------------------------------------------------
DATA_PATH = '/content/drive/MyDrive/25년 해군 AI 경진대회/combined_ais_20230601_20240531.csv'
ZONE_DIR  = '/content/drive/MyDrive/25년 해군 AI 경진대회/dataset/구역 데이터'
MODEL_DIR = '/content/drive/MyDrive/25년 해군 AI 경진대회/model_result'
os.makedirs(MODEL_DIR, exist_ok=True)

zone_files = [
    ('Area_Navy_train.csv'                   , 'navy_train'),
    ('Area_Near_Sea.csv'                     , 'near_sea'),
    ('Area_Restrict_zone.csv'                , 'restrict'),
    ('Area_Sea_cable_lv1_poly.csv'           , 'sea_cable'),
    ('Area_Special_restrict_zone_poly.csv'   , 'special_restrict'),
    ('Area_Target_Area.csv'                  , 'target_area')
]

In [10]:
def load_zone_csv(file_path: str, label: str) -> gpd.GeoDataFrame:
    """
    CSV → Polygon(s)
    - 필수: LON, LAT  (대·소문자 허용)
    - 그룹 컬럼: OBJNUM* 계열 → 여러 다각형, 없으면 파일 전체 하나로 처리
    - Polygon 생성 실패 시 convex_hull 대체, 그래도 불가하면 스킵
    """
    df = pd.read_csv(file_path)

    # ① 좌표 컬럼 찾기
    lon_col = next(c for c in df.columns if c.lower() == 'lon')
    lat_col = next(c for c in df.columns if c.lower() == 'lat')

    # ② 그룹 컬럼 찾기
    grp_candidates = [c for c in df.columns if c.lower().startswith('obj')]
    group_col = grp_candidates[0] if grp_candidates else None

    polygons = []
    groups = df.groupby(group_col) if group_col else [('single', df)]
    for gid, grp in groups:
        coords = list(zip(grp[lon_col].values, grp[lat_col].values))
        if len(coords) < 3:
            print(f'⚠️  {os.path.basename(file_path)} - {gid}: 꼭짓점<3 → 건너뜀')
            continue
        if coords[0] != coords[-1]:
            coords.append(coords[0])

        poly = None
        try:
            poly = Polygon(coords)
            if not poly.is_valid:
                poly = poly.buffer(0)         # 자가교차 등 교정
        except Exception:
            pass

        # fallback: convex hull
        if (poly is None) or poly.is_empty or (not poly.is_valid):
            poly = gpd.GeoSeries([Polygon(coords)]).unary_union.convex_hull

        if poly.is_valid and (not poly.is_empty) and poly.area > 0:
            polygons.append(poly)
        else:
            print(f'⚠️  {os.path.basename(file_path)} - {gid}: 유효 Polygon 생성 실패')

    if not polygons:
        print(f'❗ {os.path.basename(file_path)}: usable Polygon 없음 → 파일 건너뜀')
        return gpd.GeoDataFrame(columns=['zone_type', 'geometry'], crs='EPSG:4326')

    return gpd.GeoDataFrame(
        {'zone_type': [label] * len(polygons), 'geometry': polygons},
        crs='EPSG:4326'
    )

In [11]:
zone_gdfs = [
    load_zone_csv(os.path.join(ZONE_DIR, fname), lbl)
    for fname, lbl in zone_files
]

# 빈 GeoDataFrame 은 자동으로 걸러냄
zones_gdf = pd.concat([g for g in zone_gdfs if not g.empty], ignore_index=True)
print('Zone polygons loaded:', len(zones_gdf))
zones_sindex = zones_gdf.sindex

Zone polygons loaded: 244


In [12]:
# -------------------------------------------------------
# 3. AIS 집계 데이터 로드 & BBox GeoDataFrame ------------
# -------------------------------------------------------
ais_df = pd.read_csv(DATA_PATH)
print('AIS rows:', len(ais_df))

bbox_polys = [
    box(min(lon1, lon2), min(lat1, lat2), max(lon1, lon2), max(lat1, lat2))
    for lon1, lon2, lat1, lat2 in zip(
        ais_df['min_lon'], ais_df['max_lon'], ais_df['min_lat'], ais_df['max_lat'])
]
ais_gdf = gpd.GeoDataFrame(ais_df, geometry=bbox_polys, crs='EPSG:4326')

AIS rows: 2678129


In [15]:
# -------------------------------------------------------
# 4. 구역 교차 Feature (벡터화·수정) ---------------------
# -------------------------------------------------------

# 4-1) spatial join
join_df = gpd.sjoin(
    ais_gdf[['geometry']],                 # 왼쪽: AIS BBox
    zones_gdf[['zone_type', 'geometry']],  # 오른쪽: Zones
    how='left',
    predicate='intersects'
).reset_index().rename(columns={'index': 'ais_idx'})
# └─ 이제 'ais_idx' 가 AIS 원본 행번호

# 4-2) zone_type × count pivot
pivot = (
    join_df.dropna(subset=['zone_type'])
           .assign(hit=1)
           .pivot_table(index='ais_idx', columns='zone_type',
                        values='hit', aggfunc='sum', fill_value=0)
)

# 4-3) in_{zone}, num_zone_hits
ais_gdf['num_zone_hits'] = (
    pivot.sum(axis=1)
         .reindex(ais_gdf.index, fill_value=0)
         .astype(int)
)

for z in zones_gdf['zone_type'].unique():
    ais_gdf[f'in_{z}'] = (
        (pivot.get(z, 0) > 0)
        .reindex(ais_gdf.index, fill_value=0)
        .astype(int)
)

print('⚡ 공간 feature 생성 완료 (vectorized)')

⚡ 공간 feature 생성 완료 (vectorized)


In [22]:
# -------------------------------------------------------
# 5. X / y 전처리 ---------------------------------------
# -------------------------------------------------------
def prepare_xy(df, drop_cols=None):
    if drop_cols is None:
        drop_cols = []
    df = df.copy()

    # y 변환 (True/False → 1/0)
    if 'label' in df.columns and 'result' not in df.columns:
        df.rename(columns={'label': 'result'}, inplace=True)
    df['result'] = (
        df['result'].astype(str).str.lower()
          .map({'true':1, 'false':0, '1':1, '0':0})
          .astype(int)
    )

    # 날짜(object) 컬럼 → 숫자 파생
    for col in df.select_dtypes(include=['object']).columns:
        if col == 'result':
            continue
        try:
            dt = pd.to_datetime(df[col], errors='raise')
            df[f'{col}_doy'] = dt.dt.dayofyear
            df[f'{col}_dow'] = dt.dt.weekday
            df.drop(columns=[col], inplace=True)
        except (ValueError, TypeError):
            df.drop(columns=[col], inplace=True)  # 문자 컬럼이 남아있다면 삭제

    y = df['result']
    X = df.drop(columns=drop_cols + ['result'])
    return X, y

X, y = prepare_xy(ais_gdf.drop(columns=['geometry']), drop_cols=['MMSI'])
print('Feature matrix shape:', X.shape)

Feature matrix shape: (2678129, 34)


In [23]:
# -------------------------------------------------------
# 6. Stratified Split ----------------------------------
# -------------------------------------------------------
X_tmp, X_test, y_tmp, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42)
val_ratio = 0.15 / 0.85
X_train, X_val, y_train, y_val = train_test_split(
    X_tmp, y_tmp, test_size=val_ratio, stratify=y_tmp, random_state=42)

In [25]:
# -------------------------------------------------------
# 7. XGBoost 학습 + Early-Stopping -----------------------
# -------------------------------------------------------
base_params = dict(
    objective='binary:logistic',
    n_estimators=5000,           # 크게 두고 early-stop
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    tree_method='hist',          # GPU: 'gpu_hist'
    enable_categorical=True,
    eval_metric=['logloss','auc'],
    early_stopping_rounds=100
)

# 1) best_iter 찾기 (train → val)
tmp_model = XGBClassifier(**base_params)
tmp_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)
best_iter = tmp_model.best_iteration
print(f'◆ Early-stopped at round {best_iter}')

# 2) train+val 전체 데이터로 재학습 (best_iter 트리 수만)
final_params          = base_params.copy()
final_params.pop('n_estimators')
final_params.pop('early_stopping_rounds')

model = XGBClassifier(**final_params, n_estimators=best_iter)
model.fit(
    pd.concat([X_train, X_val]),
    pd.concat([y_train, y_val]),
    verbose=False
)

[0]	validation_0-logloss:0.62201	validation_0-auc:0.99814
[1]	validation_0-logloss:0.58222	validation_0-auc:0.99766
[2]	validation_0-logloss:0.54709	validation_0-auc:0.99870
[3]	validation_0-logloss:0.51237	validation_0-auc:0.99882
[4]	validation_0-logloss:0.48669	validation_0-auc:0.99843
[5]	validation_0-logloss:0.45705	validation_0-auc:0.99860
[6]	validation_0-logloss:0.43065	validation_0-auc:0.99862
[7]	validation_0-logloss:0.40555	validation_0-auc:0.99880
[8]	validation_0-logloss:0.38329	validation_0-auc:0.99879
[9]	validation_0-logloss:0.36368	validation_0-auc:0.99872
[10]	validation_0-logloss:0.34360	validation_0-auc:0.99880
[11]	validation_0-logloss:0.32617	validation_0-auc:0.99881
[12]	validation_0-logloss:0.30879	validation_0-auc:0.99886
[13]	validation_0-logloss:0.29253	validation_0-auc:0.99894
[14]	validation_0-logloss:0.27802	validation_0-auc:0.99893
[15]	validation_0-logloss:0.26427	validation_0-auc:0.99892
[16]	validation_0-logloss:0.25098	validation_0-auc:0.99901
[17]	va

In [26]:
# -------------------------------------------------------
# 8. 평가 ----------------------------------------------
# -------------------------------------------------------
def evaluate(tag, yt, yp, prob):
    acc = accuracy_score(yt, yp)
    pr, re, f1, _ = precision_recall_fscore_support(yt, yp, average='binary', zero_division=0)
    auc = roc_auc_score(yt, prob)
    cm  = confusion_matrix(yt, yp)
    print(f'\n[{tag}] Acc:{acc:.4f} | Prec:{pr:.4f} | Rec:{re:.4f} | F1:{f1:.4f} | AUC:{auc:.4f}')
    print('Confusion Matrix:\n', cm)

evaluate('Validation(OOS)', y_val, model.predict(X_val), model.predict_proba(X_val)[:,1])
evaluate('Test',            y_test, model.predict(X_test), model.predict_proba(X_test)[:,1])


[Validation(OOS)] Acc:0.9994 | Prec:0.9993 | Rec:0.9998 | F1:0.9995 | AUC:1.0000
Confusion Matrix:
 [[156559    178]
 [    53 244930]]

[Test] Acc:0.9968 | Prec:0.9965 | Rec:0.9983 | F1:0.9974 | AUC:1.0000
Confusion Matrix:
 [[155867    870]
 [   420 244563]]


In [27]:
# -------------------------------------------------------
# 9. 모델 & 메타데이터 저장 -----------------------------
# -------------------------------------------------------
today = datetime.date.today().isoformat()
model_path = os.path.join(MODEL_DIR, f'xgb_ais_zone_{today}.joblib')
joblib.dump(model, model_path)

with open(model_path.replace('.joblib', '.json'), 'w') as f:
    json.dump({
        'date'      : today,
        'train'     : len(X_train) + len(X_val),
        'test'      : len(X_test),
        'features'  : list(X.columns),
        'params'    : {k:v for k,v in base_params.items() if k!='early_stopping_rounds'},
        'best_iter' : int(best_iter)
    }, f, indent=2)

print(f'\n✅ 모델 저장 완료 → {model_path}')


✅ 모델 저장 완료 → /content/drive/MyDrive/25년 해군 AI 경진대회/model_result/xgb_ais_zone_2025-05-28.joblib


In [32]:
import time, os, numpy as np, pandas as pd, geopandas as gpd, joblib
from shapely.geometry import Polygon, box
from xgboost import XGBClassifier

t0 = time.time()
print('1) 모델·경로 설정…')
MODEL_PATH = '/content/drive/MyDrive/25년 해군 AI 경진대회/model_result/xgb_ais_zone_2025-05-28.joblib'   # 수정
CSV_PATH   = '/content/drive/MyDrive/25년 해군 AI 경진대회/1번문제/20240701(1번문제).csv'
ZONE_DIR   = '/content/drive/MyDrive/25년 해군 AI 경진대회/dataset/구역 데이터'
OUT_PATH   = CSV_PATH.replace('.csv', '_with_result.csv')

# ---------- 1. 구역 로드 ----------
def load_zone_csv(path, label):
    df = pd.read_csv(path)
    lon = next(c for c in df.columns if c.lower()=='lon')
    lat = next(c for c in df.columns if c.lower()=='lat')
    grp = next((c for c in df.columns if c.lower().startswith('obj')), None)
    polys=[]
    for _, g in (df.groupby(grp) if grp else [('s', df)]):
        pts=list(zip(g[lon], g[lat]));  pts.append(pts[0])
        p=Polygon(pts)
        if p.is_valid and not p.is_empty: polys.append(p)
    return gpd.GeoDataFrame({'zone_type':[label]*len(polys), 'geometry':polys},
                            crs='EPSG:4326')

zones=[]
for f,lbl in [('Area_Navy_train.csv','navy_train'), ('Area_Near_Sea.csv','near_sea'),
              ('Area_Restrict_zone.csv','restrict'), ('Area_Sea_cable_lv1_poly.csv','sea_cable'),
              ('Area_Special_restrict_zone_poly.csv','special_restrict'),
              ('Area_Target_Area.csv','target_area')]:
    path = os.path.join(ZONE_DIR, f)
    print(f'  • {f}')
    zones.append(load_zone_csv(path, lbl))
zones_gdf = pd.concat(zones, ignore_index=True)
print('   → zones loaded:', len(zones_gdf), 'polygons\n')

# ---------- 2. CSV 로드 ----------
print('2) 1번 문제 CSV 로드…')
gdf = pd.read_csv(CSV_PATH)
print('   → rows:', len(gdf))

# ---------- 2-1. BBox ----------
print('3) Bounding-box 생성…')
bbox = [box(min(a,b), min(c,d), max(a,b), max(c,d))
        for a,b,c,d in zip(gdf['min_lon'], gdf['max_lon'],
                           gdf['min_lat'], gdf['max_lat'])]
gdf = gpd.GeoDataFrame(gdf, geometry=bbox, crs='EPSG:4326')

# ---------- 2-2. sjoin ----------
print('4) 공간 join… (잠시 소요)')
join = gpd.sjoin(gdf[['geometry']].reset_index(),
                 zones_gdf[['zone_type','geometry']],
                 how='left', predicate='intersects').rename(columns={'index':'idx'})
print('   → join rows:', len(join))

pivot = (join.dropna(subset=['zone_type'])
              .assign(hit=1)
              .pivot_table(index='idx', columns='zone_type',
                           values='hit', aggfunc='sum', fill_value=0))

gdf['num_zone_hits'] = pivot.sum(axis=1).reindex(gdf.index, fill_value=0).astype(int)
for z in zones_gdf['zone_type'].unique():
    gdf[f'in_{z}'] = ((pivot.get(z,0)>0)
                      .reindex(gdf.index, fill_value=0).astype(int))

# ---------- 2-3. 날짜 파생 ----------
print('5) 날짜 → 숫자 파생…')
for col in gdf.select_dtypes(include=['object']).columns:
    if col in ('MMSI','result'): continue
    try:
        dt = pd.to_datetime(gdf[col], errors='raise')
        gdf[f'{col}_doy'] = dt.dt.dayofyear
        gdf[f'{col}_dow'] = dt.dt.weekday
        gdf.drop(columns=[col], inplace=True)
    except: gdf.drop(columns=[col], inplace=True)

# ---------- 3. 모델 & feature 정합 ----------
print('6) 모델 로드 & feature 맞추기…')
model:XGBClassifier = joblib.load(MODEL_PATH)
model_feats = model.get_booster().feature_names
for f in model_feats:
    if f not in gdf.columns:
        gdf[f] = 0
X_input = gdf[model_feats]

# ---------- 4. 예측 ----------
print('7) 예측 실행…')
prob = model.predict_proba(X_input)[:,1]
gdf['result'] = np.where(prob>=0.5, 'True', 'False')

# ---------- 5. 저장 ----------
print('8) CSV 저장…')
gdf.drop(columns=['geometry']).to_csv(OUT_PATH, index=False, encoding='utf-8-sig')
print(f"✅ 완료! {OUT_PATH}")
print('총 소요: %.1f 초' % (time.time()-t0))

1) 모델·경로 설정…
  • Area_Navy_train.csv
  • Area_Near_Sea.csv
  • Area_Restrict_zone.csv
  • Area_Sea_cable_lv1_poly.csv
  • Area_Special_restrict_zone_poly.csv
  • Area_Target_Area.csv
   → zones loaded: 242 polygons

2) 1번 문제 CSV 로드…
   → rows: 6779
3) Bounding-box 생성…
4) 공간 join… (잠시 소요)
   → join rows: 42094
5) 날짜 → 숫자 파생…
6) 모델 로드 & feature 맞추기…
7) 예측 실행…
8) CSV 저장…
✅ 완료! /content/drive/MyDrive/25년 해군 AI 경진대회/1번문제/20240701(1번문제)_with_result.csv
총 소요: 0.8 초
