In [1]:
# 범주형 - 범주형 구조에 대한 데이터 분석
# 사고유형과 가해자상해정도
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.preprocessing import LabelEncoder


print('numpy version:', np.__version__)
print('pandas version:', pd.__version__)
print('seaborn version:', sns.__version__)
print(f"matplotlib: mpl {plt.matplotlib.__version__}")
print('sklearn version:', sklearn.__version__)

font_path = "c:/Windows/Fonts/malgun.ttf"
font_prop = mpl.font_manager.FontProperties(fname=font_path)
mpl.rcParams['font.family'] = font_prop.get_name()
mpl.rcParams['axes.unicode_minus'] = False

numpy version: 2.1.0
pandas version: 2.2.2
seaborn version: 0.13.2
matplotlib: mpl 3.9.2
sklearn version: 1.5.1


In [69]:
data = pd.read_csv("all_in_data.csv")
data.head()

Unnamed: 0,요일_월요일,요일_화요일,요일_수요일,요일_목요일,요일_금요일,요일_토요일,요일_일요일,년,월,일,...,기상상태_안개,기상상태_흐림,도로형태_교차로,도로형태_단일로,도로형태_주차장,차종_보행자,차종_승용,차종_이륜,차종_자전거,차종_화물
0,False,False,True,False,False,False,False,2014,1,1,...,False,False,False,True,False,False,True,False,False,False
1,False,False,True,False,False,False,False,2014,1,1,...,False,False,False,True,False,False,True,False,False,False
2,False,False,True,False,False,False,False,2014,1,1,...,False,False,False,True,False,False,True,False,False,False
3,False,False,True,False,False,False,False,2014,1,1,...,False,False,False,True,False,False,True,False,False,False
4,False,False,True,False,False,False,False,2014,1,1,...,False,False,False,True,False,False,True,False,False,False


In [70]:
data.columns

Index(['요일_월요일', '요일_화요일', '요일_수요일', '요일_목요일', '요일_금요일', '요일_토요일', '요일_일요일',
       '년', '월', '일', '시간', '대상', '상해정도', '연령', '사고유형_차대사람', '사고유형_차대차',
       '사고유형_차량단독', '노면상태_건조', '노면상태_결빙', '노면상태_기타', '노면상태_서리/결빙', '노면상태_습기',
       '노면상태_적설', '노면상태_젖음/습기', '기상상태_기타', '기상상태_눈', '기상상태_맑음', '기상상태_비',
       '기상상태_안개', '기상상태_흐림', '도로형태_교차로', '도로형태_단일로', '도로형태_주차장', '차종_보행자',
       '차종_승용', '차종_이륜', '차종_자전거', '차종_화물'],
      dtype='object')

In [71]:
features = ['대상', '사고유형_차대사람', '사고유형_차대차', '사고유형_차량단독', 
            '차종_보행자', '차종_승용', '차종_이륜', '차종_자전거', 
            '차종_화물', '연령', '상해정도']

data=data[features]

data_cleaned = data.dropna(subset=['상해정도', '연령'])
y = data_cleaned['상해정도']
X = data_cleaned.drop('상해정도', axis=1)

# X와 y를 확인
print(X.head())
print(y.head())

nan_count = y.isna().sum()
print(nan_count)

   대상  사고유형_차대사람  사고유형_차대차  사고유형_차량단독  차종_보행자  차종_승용  차종_이륜  차종_자전거  차종_화물  \
0   1      False      True      False   False   True  False   False  False   
1   0      False      True      False   False   True  False   False  False   
2   1      False      True      False   False   True  False   False  False   
3   0      False      True      False   False   True  False   False  False   
4   1      False      True      False   False   True  False   False  False   

     연령  
0  31.0  
1  65.0  
2  32.0  
3  54.0  
4  26.0  
0    0.0
1    1.0
2    0.0
3    3.0
4    0.0
Name: 상해정도, dtype: float64
0


In [72]:
# 마지막 평가를 위한 test 나누기
# 데이터를 학습용과 테스트용으로 나눔 (80% 학습, 20% 테스트)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [90]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost 모델 초기화
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# 파라미터 그리드 설정
param_grid = {
    'n_estimators': [150],
    'learning_rate': [0.01],
    'max_depth': [10],
    'min_child_weight': [5],
    'colsample_bynode': [1.0],
    'grow_policy': ['lossguide'],
    'reg_lambda': [1.0]
}

# GridSearchCV 설정
gs = GridSearchCV(estimator=xgb, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# 모델 학습
gs.fit(X_train_scaled, y_train)

# 최적의 파라미터와 성능 출력
print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

# 최적 모델로 예측 수행
best_xgb = gs.best_estimator_
y_test_pred = best_xgb.predict(X_test_scaled)

# 성능 평가
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_xgb, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

f1 = f1_score(y_test, y_test_pred, average='weighted') 
print(f"Test Set F1 Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'colsample_bynode': 1.0, 'grow_policy': 'lossguide', 'learning_rate': 0.01, 'max_depth': 10, 'min_child_weight': 5, 'n_estimators': 150, 'reg_lambda': 1.0}
Best Cross-Validation Accuracy: 0.7991
Test Set Accuracy: 0.8020
Cross-Validation Scores: [0.79949272 0.79949272 0.80376452 0.7958884  0.79695635]
Mean Cross-Validation Accuracy: 0.7991
Test Set F1 Score: 0.7624

Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.91      0.92      4618
         1.0       0.27      0.05      0.09       301
         2.0       0.70      0.94      0.80      3395
         3.0       0.45      0.08      0.14      1013
         4.0       0.00      0.00      0.00        37

    accuracy                           0.80      9364
   macro avg       0.47      0.40      0.39      9364
weighted avg       0.77      0.80      0.76      9364



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [109]:
#smote(오버샘플링)으로 추가 실험
from sklearn.model_selection import GridSearchCV, cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# 각 클래스별 샘플 수를 설정 (예: 클래스 0: 5000, 클래스 1: 1000, 클래스 2: 2000, 클래스 3: 1500)
sampling_strategy = {0: 20000, 1: 20000, 2: 20000, 3: 20000, 4: 2000 }

# SMOTE 적용
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# XGBoost 모델 정의
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# 파라미터 그리드 정의
param_grid = {
    'n_estimators': [150],
    'learning_rate': [0.01],
    'max_depth': [10],
    'min_child_weight': [5],
    'colsample_bynode': [1.0],
    'grow_policy': ['lossguide'],
    'reg_lambda': [1.0]
}

# GridSearchCV 설정
gs = GridSearchCV(estimator=xgb, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# 모델 학습
gs.fit(X_train_resampled, y_train_resampled)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_xgb = gs.best_estimator_
y_test_pred = best_xgb.predict(X_test_scaled)

# 테스트 데이터 평가
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# 교차 검증 점수 계산
cross_val_scores = cross_val_score(best_xgb, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

# F1 점수 계산
f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score: {f1:.4f}")

# 분류 리포트 출력
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# 결과를 CSV 파일로 저장
results_file = '실험일지.csv'

try:
    # 기존 데이터프레임 읽기 (파일이 존재하지 않을 경우 빈 데이터프레임 생성)
    results_df = pd.read_csv(results_file)
except FileNotFoundError:
    results_df = pd.DataFrame(columns=[
        'Experiment', 'Train Size', 'Test Size', 
        'Hyperparameters', 'Best Cross-Validation Accuracy', 
        'Test Set Accuracy', 'Mean Cross-Validation Accuracy', 
        'Test Set F1 Score'
    ])

# 데이터 수
train_size = len(X_train_scaled)
test_size = len(X_test_scaled)

# 하이퍼파라미터 설정을 문자열로 변환
hyperparameters = str(param_grid)

# 성능 지표 계산
best_cv_accuracy = gs.best_score_
test_accuracy = accuracy_score(y_test, y_test_pred)
mean_cv_accuracy = cross_val_scores.mean()
f1_score_test = f1_score(y_test, y_test_pred, average='weighted')

# 실험 결과를 데이터프레임으로 변환
new_results_df = pd.DataFrame([{
    'Experiment': 'XGBoost with GridSearchCV',
    'Train Size': train_size,
    'Test Size': test_size,
    'Hyperparameters': hyperparameters,
    'Best Cross-Validation Accuracy': best_cv_accuracy,
    'Test Set Accuracy': test_accuracy,
    'Mean Cross-Validation Accuracy': mean_cv_accuracy,
    'Test Set F1 Score': f1_score_test
}])

# 기존 데이터프레임과 새로운 데이터프레임을 결합
results_df = pd.concat([results_df, new_results_df], ignore_index=True)

# 결과 데이터프레임을 CSV 파일로 저장
results_df.to_csv(results_file, index=False)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'colsample_bynode': 1.0, 'grow_policy': 'lossguide', 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 5, 'n_estimators': 150, 'reg_lambda': 1.0}
Best Cross-Validation Accuracy: 0.6106
Test Set Accuracy: 0.7696
Cross-Validation Scores: [0.60676829 0.60621951 0.6102439  0.61121951 0.61878049]
Mean Cross-Validation Accuracy: 0.6106
Test Set F1 Score: 0.7737

Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.88      0.92      4618
         1.0       0.21      0.38      0.27       301
         2.0       0.73      0.81      0.77      3395
         3.0       0.35      0.28      0.31      1013
         4.0       0.00      0.00      0.00        37

    accuracy                           0.77      9364
   macro avg       0.45      0.47      0.45      9364
weighted avg       0.78      0.77      0.77      9364



In [111]:
# 각 클래스별 샘플 수를 설정 (예: 클래스 0: 5000, 클래스 1: 1000, 클래스 2: 2000, 클래스 3: 1500)
sampling_strategy = {0: 20000, 1: 20000, 2: 20000, 3: 20000, 4: 2000 }

# SMOTE 적용
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# XGBoost 모델 정의
xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# 파라미터 그리드 정의
param_grid = {
    'n_estimators': [200,100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3,5],
    'min_child_weight': [3],
    'colsample_bynode': [1.0],
    'grow_policy': ['lossguide'],
    'reg_lambda': [1.0]
}

# GridSearchCV 설정
gs = GridSearchCV(estimator=xgb, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# 모델 학습
gs.fit(X_train_resampled, y_train_resampled)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_xgb = gs.best_estimator_
y_test_pred = best_xgb.predict(X_test_scaled)

# 테스트 데이터 평가
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# 교차 검증 점수 계산
cross_val_scores = cross_val_score(best_xgb, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

# F1 점수 계산
f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score: {f1:.4f}")

# 분류 리포트 출력
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# 결과를 CSV 파일로 저장
results_file = '실험일지.csv'

try:
    # 기존 데이터프레임 읽기 (파일이 존재하지 않을 경우 빈 데이터프레임 생성)
    results_df = pd.read_csv(results_file)
except FileNotFoundError:
    results_df = pd.DataFrame(columns=[
        'Experiment', 'Train Size', 'Test Size', 
        'Hyperparameters', 'Best Cross-Validation Accuracy', 
        'Test Set Accuracy', 'Mean Cross-Validation Accuracy', 
        'Test Set F1 Score'
    ])

# 데이터 수
train_size = len(X_train_scaled)
test_size = len(X_test_scaled)

# 하이퍼파라미터 설정을 문자열로 변환
hyperparameters = str(param_grid)

# 성능 지표 계산
best_cv_accuracy = gs.best_score_
test_accuracy = accuracy_score(y_test, y_test_pred)
mean_cv_accuracy = cross_val_scores.mean()
f1_score_test = f1_score(y_test, y_test_pred, average='weighted')

# 실험 결과를 데이터프레임으로 변환
new_results_df = pd.DataFrame([{
    'Experiment': 'XGBoost with GridSearchCV',
    'Train Size': train_size,
    'Test Size': test_size,
    'Hyperparameters': hyperparameters,
    'Best Cross-Validation Accuracy': best_cv_accuracy,
    'Test Set Accuracy': test_accuracy,
    'Mean Cross-Validation Accuracy': mean_cv_accuracy,
    'Test Set F1 Score': f1_score_test
}])

# 기존 데이터프레임과 새로운 데이터프레임을 결합
results_df = pd.concat([results_df, new_results_df], ignore_index=True)

# 결과 데이터프레임을 CSV 파일로 저장
results_df.to_csv(results_file, index=False)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'colsample_bynode': 1.0, 'grow_policy': 'lossguide', 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 200, 'reg_lambda': 1.0}
Best Cross-Validation Accuracy: 0.5945
Test Set Accuracy: 0.7687
Cross-Validation Scores: [0.59463415 0.59280488 0.59384146 0.59432927 0.59682927]
Mean Cross-Validation Accuracy: 0.5945
Test Set F1 Score: 0.7746

Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.87      0.92      4618
         1.0       0.20      0.45      0.28       301
         2.0       0.73      0.82      0.77      3395
         3.0       0.38      0.28      0.32      1013
         4.0       0.00      0.00      0.00        37

    accuracy                           0.77      9364
   macro avg       0.45      0.48      0.46      9364
weighted avg       0.79      0.77      0.77      9364



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
