In [1]:
import pandas as pd
from IPython.display import HTML
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split

In [None]:
# Load the preprocessed pedestrian-accident CSV and work on a copy so that
# df_type stays exactly as read from disk.
df_type = pd.read_csv('../../preprocessed_data/시도_시군구별_보행자_사고_사고유형_전처리ver.csv')
df_risk = df_type.copy()

# Severity-weighted casualty score: deaths count 1, serious injuries 0.1,
# minor injuries 0.01 and reported injuries 0.001.
fatal_term = df_risk['사망자수'] * 1
serious_term = df_risk['중상자수'] * 0.1
minor_term = df_risk['경상자수'] * 0.01
reported_term = df_risk['부상신고자수'] * 0.001
df_risk['피해가중점수'] = fatal_term + serious_term + minor_term + reported_term

# Group means of the severity score, used as lookup tables below.
severity = df_risk['피해가중점수']
avg_by_area = severity.groupby(df_risk['시군구']).mean().to_dict()   # mean score per area (dict, reused at prediction time)
type_weight = severity.groupby(df_risk['사고유형']).mean()            # mean score per accident type
road_weight = severity.groupby(df_risk['도로형태']).mean()            # mean score per road form

# Per-row weights looked up from the group means.
type_w = df_risk['사고유형'].map(type_weight)
road_w = df_risk['도로형태'].map(road_weight)

# Derived columns, added in the same order as before:
#   사고도로조합    - accident type and road form joined with '_'
#   시군구_평균사고  - mean severity score of the row's area
#   최종위험점수    - own severity score plus the mean of the two group weights
df_risk = df_risk.assign(**{
    '사고도로조합': df_risk['사고유형'] + '_' + df_risk['도로형태'],
    '시군구_평균사고': df_risk['시군구'].map(avg_by_area),
    '사고유형가중치': type_w,
    '도로형태가중치': road_w,
    '최종위험점수': severity + 0.5 * (type_w + road_w),
})

# Risk grade: quintile of the final score, stored as a string label
# (1 = very safe ... 5 = very dangerous).
df_risk['위험도'] = pd.qcut(df_risk['최종위험점수'], q=5, labels=[1, 2, 3, 4, 5]).astype(str)

preview = df_risk.head(5)
HTML(preview.to_html(escape=False))

Unnamed: 0,연도,시군구,법정동코드,사고내용,사망자수,중상자수,경상자수,부상신고자수,사고유형,도로형태,피해자 상해정도,피해가중점수,사고도로조합,시군구_평균사고,사고유형가중치,도로형태가중치,최종위험점수,위험도
0,2020,서울특별시 종로구 창성동,1111010500,중상사고,0,1,0,0,차대사람 - 차도통행중,교차로 - 교차로부근,중상,0.1,차대사람 - 차도통행중_교차로 - 교차로부근,0.1,0.131882,0.111938,0.22191,4
1,2020,서울특별시 종로구 창성동,1111010500,중상사고,0,1,0,0,차대사람 - 차도통행중,교차로 - 교차로안,중상,0.1,차대사람 - 차도통행중_교차로 - 교차로안,0.1,0.131882,0.117951,0.224917,4
2,2020,서울특별시 종로구 통인동,1111010800,경상사고,0,0,1,0,차대사람 - 기타,단일로 - 기타,경상,0.01,차대사람 - 기타_단일로 - 기타,0.01,0.092425,0.121204,0.116814,1
3,2020,서울특별시 종로구 누상동,1111010900,경상사고,0,0,1,0,차대사람 - 차도통행중,교차로 - 교차로안,경상,0.01,차대사람 - 차도통행중_교차로 - 교차로안,0.01,0.131882,0.117951,0.134917,2
4,2020,서울특별시 종로구 누상동,1111010900,경상사고,0,0,1,0,차대사람 - 기타,단일로 - 기타,경상,0.01,차대사람 - 기타_단일로 - 기타,0.01,0.092425,0.121204,0.116814,1


In [None]:
# Model inputs and the label to predict.
feature_cols = ['시군구', '사고유형', '도로형태', '시군구_평균사고', '사고도로조합']
target_col = '위험도'

# Keep only the modelling columns and drop rows with missing values.
df_model = df_risk[feature_cols + [target_col]].dropna()

# Split the cleaned frame. The earlier version built df_model but then split
# (and stratified on) the raw df_risk slice, so the dropna() had no effect.
train_data, test_data = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
    stratify=df_model[target_col],
)

# NOTE(review): '시군구_평균사고' is computed from the full table before this
# split, so held-out rows contribute to a training feature, and the label is
# itself derived from the same severity score. Consider computing the area
# means on train_data only before trusting the reported metrics.

# Train the AutoGluon ensemble (multiclass, 15-minute budget); models are
# written under HighRiskRank/.
predictor = TabularPredictor(label=target_col, path="HighRiskRank/", problem_type='multiclass').fit(
    train_data=train_data,
    presets='high_quality',
    num_cpus=10,
    time_limit=900,
    verbosity=2
)

# predictor.save()                    # save the model
predictor.evaluate(test_data)       # evaluate on the held-out split

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.4.0: Fri Apr 11 18:33:39 PDT 2025; root:xnu-11417.101.15~117/RELEASE_ARM64_T6020
CPU Count:          12
Memory Avail:       2.74 GB / 16.00 GB (17.1%)
Disk Space Avail:   11.84 GB / 460.43 GB (2.6%)
Presets specified: ['high_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model refit if memory is small relative to the data size.
	You can avoid this risk by setting `save_bag_folds=True`.
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input 

[36m(_ray_fit pid=19886)[0m [1000]	valid_set's multi_error: 0.3231


[33m(raylet)[0m [2025-06-05 16:22:30,331 E 19328 29509675] (raylet) file_system_monitor.cc:116: /tmp/ray/session_2025-06-05_16-18-57_964745_19122 is over 95% full, available space: 7.7342 GB; capacity: 460.432 GB. Object creation will fail if spilling is required.
[36m(_ray_fit pid=19815)[0m 	Ran out of time, early stopping on iteration 1415. Best iteration is:
[36m(_ray_fit pid=19815)[0m 	[1403]	valid_set's multi_error: 0.309819
[33m(raylet)[0m [2025-06-05 16:22:40,440 E 19328 29509675] (raylet) file_system_monitor.cc:116: /tmp/ray/session_2025-06-05_16-18-57_964745_19122 is over 95% full, available space: 7.73468 GB; capacity: 460.432 GB. Object creation will fail if spilling is required.
[36m(_ray_fit pid=19888)[0m 	Ran out of time, early stopping on iteration 1436. Best iteration is:[32m [repeated 7x across cluster][0m
[36m(_ray_fit pid=19888)[0m 	[1434]	valid_set's multi_error: 0.306838[32m [repeated 7x across cluster][0m
[33m(raylet)[0m [2025-06-05 16:22:50,495 

{'accuracy': 0.595355587808418,
 'balanced_accuracy': 0.5860691577804529,
 'mcc': 0.49793465678837673}

In [5]:
# Inspect performance: per-model leaderboard of the fitted predictor
# (validation score, eval metric, prediction time, ...), printed as plain text.
leaderboard = predictor.leaderboard(silent=True)
print(leaderboard)

                               model  score_val eval_metric  pred_time_val  \
0                    CatBoost_BAG_L1   0.598026    accuracy       0.289819   
1                WeightedEnsemble_L2   0.598026    accuracy       0.292404   
2               CatBoost_r177_BAG_L1   0.597711    accuracy       0.378128   
3                 CatBoost_r9_BAG_L1   0.591954    accuracy       0.219294   
4                     XGBoost_BAG_L1   0.584720    accuracy       0.788037   
5                  LightGBMXT_BAG_L1   0.583245    accuracy       0.813826   
6                LightGBM_r96_BAG_L1   0.582277    accuracy       2.806550   
7               LightGBM_r131_BAG_L1   0.580390    accuracy       1.115063   
8                    LightGBM_BAG_L1   0.579229    accuracy       0.256374   
9             NeuralNetFastAI_BAG_L1   0.576350    accuracy       0.351416   
10       NeuralNetFastAI_r191_BAG_L1   0.575576    accuracy       0.654792   
11              LightGBMLarge_BAG_L1   0.575358    accuracy     

In [6]:
from sklearn.metrics import classification_report

# Predict the risk grade for the held-out rows and keep the true labels
# alongside for comparison.
y_pred = predictor.predict(test_data)
y_true = test_data[target_col]

# Per-class precision / recall / F1, rounded to three decimals.
report = classification_report(y_true, y_pred, digits=3)
print(report)

              precision    recall  f1-score   support

           1      0.555     0.554     0.554      2127
           2      0.638     0.207     0.313      2008
           3      0.557     0.615     0.584      2257
           4      0.590     0.895     0.711      2427
           5      0.719     0.660     0.688      1516

    accuracy                          0.595     10335
   macro avg      0.612     0.586     0.570     10335
weighted avg      0.604     0.595     0.570     10335



In [33]:
# Build a single-row sample to predict.
# '시군구' must be the full area string used in the training data (the keys
# of avg_by_area, e.g. '서울특별시 종로구 창성동'). The short form '종로구' is not
# a key, so the area-mean lookup below silently produced NaN.
sample = pd.DataFrame([{
    '시군구': '서울특별시 종로구 창성동',
    '사고유형': '차대사람 - 차도통행중',
    '도로형태': '교차로 - 교차로부근'
}])

# Add the derived features the model was trained on.
sample['시군구_평균사고'] = sample['시군구'].map(avg_by_area)
sample['사고도로조합'] = sample['사고유형'] + '_' + sample['도로형태']

# Fail loudly instead of predicting with a missing area feature.
unknown_area = sample.loc[sample['시군구_평균사고'].isna(), '시군구']
if not unknown_area.empty:
    raise ValueError(
        f"No area average available for {unknown_area.tolist()}; "
        "use the full '시군구' string exactly as it appears in the training data."
    )

# Predict the risk grade.
pred = predictor.predict(sample)
pred

0    2
Name: 위험도, dtype: object

In [None]:
# Disabled (commented-out) variant of the feature-engineering cell: same
# scoring as the active cell plus an area deviation column and two
# area-based combination columns.
# df = pd.read_csv('../..//시도_시군구별_보행자_사고_사고유형_전처리ver.csv')

# # Severity-weighted casualty score
# df['피해가중점수'] = (
#     df['사망자수'] * 1 +
#     df['중상자수'] * 0.1 +
#     df['경상자수'] * 0.01 +
#     df['부상신고자수'] * 0.001
# )

# # Accident-type weight (mean score per accident type)
# type_weight = df.groupby('사고유형')['피해가중점수'].mean()
# df['사고유형가중치'] = df['사고유형'].map(type_weight)

# # Road-form weight (mean score per road form)
# road_weight = df.groupby('도로형태')['피해가중점수'].mean()
# df['도로형태가중치'] = df['도로형태'].map(road_weight)

# # Final risk score
# df['최종위험점수'] = df['피해가중점수'] + 0.5 * (df['사고유형가중치'] + df['도로형태가중치'])

# # Risk grade (1 = safe, 5 = dangerous)
# df['위험도'] = pd.qcut(df['최종위험점수'], q=5, labels=[1, 2, 3, 4, 5]).astype(str)

# # Mean severity score per area
# area_avg = df.groupby('시군구')['피해가중점수'].mean().to_dict()
# df['시군구_평균사고'] = df['시군구'].map(area_avg)

# # Deviation of the area mean from the overall mean
# overall_avg = df['피해가중점수'].mean()
# df['시군구_편차'] = df['시군구_평균사고'] - overall_avg

# # Combination (interaction) columns
# df['사고도로조합'] = df['사고유형'] + '_' + df['도로형태']
# df['시군구_사고유형'] = df['시군구'] + '_' + df['사고유형']
# df['시군구_도로형태'] = df['시군구'] + '_' + df['도로형태']
# HTML(df.head(5).to_html(index=False))

연도,시군구,법정동코드,사고내용,사망자수,중상자수,경상자수,부상신고자수,사고유형,도로형태,피해자 상해정도,피해가중점수,사고유형가중치,도로형태가중치,최종위험점수,위험도,시군구_평균사고,시군구_편차,사고도로조합,시군구_사고유형,시군구_도로형태
2020,서울특별시 종로구 창성동,1111010500,중상사고,0,1,0,0,차대사람 - 차도통행중,교차로 - 교차로부근,중상,0.1,0.131882,0.111938,0.22191,4,0.1,-0.015843,차대사람 - 차도통행중_교차로 - 교차로부근,서울특별시 종로구 창성동_차대사람 - 차도통행중,서울특별시 종로구 창성동_교차로 - 교차로부근
2020,서울특별시 종로구 창성동,1111010500,중상사고,0,1,0,0,차대사람 - 차도통행중,교차로 - 교차로안,중상,0.1,0.131882,0.117951,0.224917,4,0.1,-0.015843,차대사람 - 차도통행중_교차로 - 교차로안,서울특별시 종로구 창성동_차대사람 - 차도통행중,서울특별시 종로구 창성동_교차로 - 교차로안
2020,서울특별시 종로구 통인동,1111010800,경상사고,0,0,1,0,차대사람 - 기타,단일로 - 기타,경상,0.01,0.092425,0.121204,0.116814,1,0.01,-0.105843,차대사람 - 기타_단일로 - 기타,서울특별시 종로구 통인동_차대사람 - 기타,서울특별시 종로구 통인동_단일로 - 기타
2020,서울특별시 종로구 누상동,1111010900,경상사고,0,0,1,0,차대사람 - 차도통행중,교차로 - 교차로안,경상,0.01,0.131882,0.117951,0.134917,2,0.01,-0.105843,차대사람 - 차도통행중_교차로 - 교차로안,서울특별시 종로구 누상동_차대사람 - 차도통행중,서울특별시 종로구 누상동_교차로 - 교차로안
2020,서울특별시 종로구 누상동,1111010900,경상사고,0,0,1,0,차대사람 - 기타,단일로 - 기타,경상,0.01,0.092425,0.121204,0.116814,1,0.01,-0.105843,차대사람 - 기타_단일로 - 기타,서울특별시 종로구 누상동_차대사람 - 기타,서울특별시 종로구 누상동_단일로 - 기타


In [None]:
# Disabled (commented-out) training cell for the variant with the extra
# area-based features; it would write its models to 'MyModel_2'.
# # Feature list
# feature_cols = [
#     '시군구', '사고유형', '도로형태',
#     '시군구_평균사고', '시군구_편차',
#     '사고도로조합', '시군구_사고유형', '시군구_도로형태'
# ]
# target_col = '위험도'

# # Drop rows with missing values
# df_model = df[feature_cols + [target_col]].dropna()

# # Train/test split
# train_data, test_data = train_test_split(
#     df_model, test_size=0.2, random_state=42, stratify=df_model[target_col]
# )

# # AutoGluon training
# predictor = TabularPredictor(label=target_col, path='MyModel_2', problem_type='multiclass').fit(
#     train_data=train_data,
#     presets='high_quality',  # favour accuracy
#     time_limit=900,          # 15-minute limit
#     num_cpus=10,
#     verbosity=2
# )

# # Evaluation
# predictor.evaluate(test_data)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.4.0: Fri Apr 11 18:33:39 PDT 2025; root:xnu-11417.101.15~117/RELEASE_ARM64_T6020
CPU Count:          12
Memory Avail:       2.66 GB / 16.00 GB (16.6%)
Disk Space Avail:   6.31 GB / 460.43 GB (1.4%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Presets specified: ['high_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model refit if memory is small relative to the data size.
	You can avoid this risk by setting `save_bag_folds=True`.
DyS

{'accuracy': 0.5929366231253024,
 'balanced_accuracy': 0.5834618133963303,
 'mcc': 0.494717728607096}

In [None]:
# Disabled (commented-out) leaderboard printout for the second variant.
# leaderboard = predictor.leaderboard(silent=True)
# print(leaderboard)

                           model  score_val eval_metric  pred_time_val  \
0                CatBoost_BAG_L1   0.598389    accuracy       0.284616   
1            WeightedEnsemble_L2   0.598389    accuracy       0.287621   
2           CatBoost_r177_BAG_L1   0.596139    accuracy       0.239805   
3                 XGBoost_BAG_L1   0.585519    accuracy       0.467451   
4              LightGBMXT_BAG_L1   0.583027    accuracy       0.447050   
5                LightGBM_BAG_L1   0.577753    accuracy       0.204574   
6         NeuralNetFastAI_BAG_L1   0.576785    accuracy       0.340838   
7           LightGBMLarge_BAG_L1   0.576229    accuracy       0.226127   
8        RandomForestGini_BAG_L1   0.523176    accuracy       4.196429   
9        RandomForestEntr_BAG_L1   0.523055    accuracy       3.873621   
10         ExtraTreesGini_BAG_L1   0.515943    accuracy       3.264198   
11         ExtraTreesEntr_BAG_L1   0.514926    accuracy       3.352923   
12         KNeighborsUnif_BAG_L1   0.4

In [None]:
# Run 1 (risk-score based)
# Interpretability:    high (reflects quantified harm)
# Generalizability:    high (can be applied to new areas)
# Model complexity:    low (mostly numeric)
# Performance:         slightly better

# Run 2 (combination based)
# Interpretability:    low (weights of combined categories are hard to interpret)
# Generalizability:    low (needs the combinations seen in training)
# Model complexity:    high (many categorical features)
# Performance:         marginally worse

# =======================================================

# Variables used to build the risk score
# NOTE(review): the original heading read "features used", but these three
# are the components of the target score; the model's feature_cols are
# 시군구, 사고유형, 도로형태, 시군구_평균사고, 사고도로조합.

# 피해가중점수 (reflects 사망자수, 중상자수, 경상자수, 부상신고자수)
# 사고유형가중치
# 도로형태가중치
# =======================================================

# 최종위험점수 = 피해가중점수 + 0.5 × (사고유형가중치 + 도로형태가중치)

# =======================================================
# Prediction performance metrics

# Accuracy: 0.595
# Balanced Accuracy: 0.586
# MCC (Matthews Correlation Coefficient): 0.498

# =======================================================

# Model performance
# Top model: CatBoost_BAG_L1 + WeightedEnsemble_L2
# Best score: Accuracy 0.598