### 전처리 하나 안 된 데이터로 해보기

In [1]:
import warnings
from pathlib import Path

import numpy as np
import matplotlib as mpl
import matplotlib.font_manager
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn

# Record the library versions this notebook was run with.
print('numpy version:', np.__version__)
print('pandas version:', pd.__version__)
print('seaborn version:', sns.__version__)
print(f"matplotlib: mpl {mpl.__version__}")
print('sklearn version:', sklearn.__version__)

# Point matplotlib at malgun.ttf so that the Korean column names used in this
# dataset can be rendered in figures. The path only exists on Windows, so it is
# checked first: asking FontProperties for the name of a missing file raises,
# which would make this cell fail on any other machine.
font_path = "c:/Windows/Fonts/malgun.ttf"
if Path(font_path).is_file():
    font_prop = mpl.font_manager.FontProperties(fname=font_path)
    mpl.rcParams['font.family'] = font_prop.get_name()
else:
    # Keep `font_prop` defined so later cells that reference it still work,
    # and leave matplotlib's default font family in place.
    font_prop = mpl.font_manager.FontProperties()
    warnings.warn(
        f"Font file not found: {font_path}; keeping matplotlib's default font "
        "(Korean text may not render correctly)."
    )
# Draw minus signs as a plain hyphen instead of the Unicode minus glyph
# (presumably because the configured font lacks that glyph — TODO confirm).
mpl.rcParams['axes.unicode_minus'] = False

numpy version: 2.1.0
pandas version: 2.2.2
seaborn version: 0.13.2
matplotlib: mpl 3.9.2
sklearn version: 1.5.1


In [2]:
# Read the dataset from the working directory and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="label_data.csv")
data.head(n=5)

Unnamed: 0,가해운전자 연령,가해운전자 상해정도,피해운전자 연령,피해운전자 상해정도,요일_금요일,요일_목요일,요일_수요일,요일_월요일,요일_일요일,요일_토요일,...,도로형태_주차장,가해운전자 차종_승용,가해운전자 차종_이륜,가해운전자 차종_자전거,가해운전자 차종_화물,피해운전자 차종_보행자,피해운전자 차종_승용,피해운전자 차종_이륜,피해운전자 차종_자전거,피해운전자 차종_화물
0,31,0,65.0,1,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
1,32,0,54.0,3,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
2,26,0,26.0,1,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
3,29,0,25.0,2,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
4,42,0,37.0,2,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False


In [3]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24047 entries, 0 to 24046
Data columns (total 41 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   가해운전자 연령      24047 non-null  int64  
 1   가해운전자 상해정도    24047 non-null  int64  
 2   피해운전자 연령      24047 non-null  float64
 3   피해운전자 상해정도    24047 non-null  int64  
 4   요일_금요일        24047 non-null  bool   
 5   요일_목요일        24047 non-null  bool   
 6   요일_수요일        24047 non-null  bool   
 7   요일_월요일        24047 non-null  bool   
 8   요일_일요일        24047 non-null  bool   
 9   요일_토요일        24047 non-null  bool   
 10  요일_화요일        24047 non-null  bool   
 11  년             24047 non-null  int64  
 12  월             24047 non-null  int64  
 13  일             24047 non-null  int64  
 14  시간            24047 non-null  int64  
 15  사고유형_차대차      24047 non-null  bool   
 16  사고유형_차량단독     24047 non-null  bool   
 17  노면상태_건조       24047 non-null  bool   
 18  노면상태_결빙       24047 non-nu

In [4]:
# Split the frame into the feature matrix (`train`) and the label series
# (`target`); the label is the '피해운전자 상해정도' column.
train = data.drop(columns=['피해운전자 상해정도'])
target = data['피해운전자 상해정도']

In [5]:
# Preview the feature matrix to confirm the label column has been removed.
train.head()

Unnamed: 0,가해운전자 연령,가해운전자 상해정도,피해운전자 연령,요일_금요일,요일_목요일,요일_수요일,요일_월요일,요일_일요일,요일_토요일,요일_화요일,...,도로형태_주차장,가해운전자 차종_승용,가해운전자 차종_이륜,가해운전자 차종_자전거,가해운전자 차종_화물,피해운전자 차종_보행자,피해운전자 차종_승용,피해운전자 차종_이륜,피해운전자 차종_자전거,피해운전자 차종_화물
0,31,0,65.0,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
1,32,0,54.0,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
2,26,0,26.0,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
3,29,0,25.0,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
4,42,0,37.0,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False


In [6]:
# Hold out a test split for the final evaluation: 20% of the rows, stratified
# on the label so both splits keep the same class proportions.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [7]:
from sklearn.ensemble import RandomForestClassifier
# `accuracy_score` was previously imported twice; one grouped import is enough.
from sklearn.metrics import accuracy_score, classification_report, f1_score
# `cross_val_score` is not used in this cell; it stays imported here because
# the XGBoost cells further down call it.
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test statistics leak into training.
# `X_train_scaled` / `X_test_scaled` are reused by the later model cells.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter candidates: 2 * 3 * 4 * 2 = 48 combinations.
param_grid = {
    'n_estimators': [199, 200],
    'min_samples_split': [9, 10, 11],
    'min_samples_leaf': [3, 4, 5, 6],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive 5-fold search over the grid, selecting on accuracy and using all
# available cores.
gs = GridSearchCV(estimator=rf, param_grid=param_grid,
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train_scaled, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

# Score the best candidate found by the search on the held-out test split.
best_rf = gs.best_estimator_
y_test_pred = best_rf.predict(X_test_scaled)


test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

f1 = f1_score(y_test, y_test_pred, average='weighted')  # use 'weighted' for the multi-class case
print(f"Test Set F1 Score (Weighted): {f1:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 9, 'n_estimators': 199}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7314
Test Set F1 Score (Weighted): 0.6728

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.42      0.55       391
           1       1.00      0.35      0.52       196
           2       0.73      0.97      0.83      3257
           3       0.60      0.13      0.22       937
           4       1.00      0.07      0.13        29

    accuracy                           0.73      4810
   macro avg       0.82      0.39      0.45      4810
weighted avg       0.72      0.73      0.67      4810



In [8]:
from xgboost import XGBClassifier

# The label has five classes (0-4), so the matching evaluation metric is the
# multiclass log loss 'mlogloss'. 'logloss' is the binary metric; it only went
# unnoticed because no eval_set is passed to fit(), so the metric is never
# actually evaluated. The fitted model is unaffected by this change.
xgb = XGBClassifier(random_state=42, eval_metric='mlogloss')

# Hyper-parameter candidates: 3 * 3 * 3 * 3 = 81 combinations.
param_grid = {
    'n_estimators': [98, 100, 102],
    'learning_rate': [0.05,0.12, 0.13],
    'max_depth': [2, 3, 4],
    'min_child_weight': [3, 4, 5],
}

# Exhaustive 5-fold search over the grid, selecting on accuracy.
gs = GridSearchCV(estimator=xgb, param_grid=param_grid,
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train_scaled, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

# Score the best candidate found by the search on the held-out test split.
# NOTE(review): the test split is scored after every tuning round in this
# notebook, so these test numbers also influence the grid choices — consider
# reserving the test set for a single final evaluation.
best_xgb = gs.best_estimator_
y_test_pred = best_xgb.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Per-fold accuracies of the selected configuration on the training split.
cross_val_scores = cross_val_score(best_xgb, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score: {f1:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'learning_rate': 0.12, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 98}
Best Cross-Validation Accuracy: 0.7412
Test Set Accuracy: 0.7333
Cross-Validation Scores: [0.74350312 0.74818087 0.73485833 0.74187679 0.7377177 ]
Mean Cross-Validation Accuracy: 0.7412
Test Set F1 Score: 0.6755

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.42      0.56       391
           1       1.00      0.35      0.52       196
           2       0.73      0.97      0.83      3257
           3       0.61      0.13      0.21       937
           4       0.92      0.41      0.57        29

    accuracy                           0.73      4810
   macro avg       0.82      0.46      0.54      4810
weighted avg       0.73      0.73      0.68      4810



In [9]:
# Second XGBoost tuning round with a grid shifted around the earlier best values.
# The label has five classes (0-4), so the matching evaluation metric is the
# multiclass log loss 'mlogloss'; 'logloss' is the binary metric and only went
# unnoticed because no eval_set is passed to fit(). The fitted model is
# unaffected by this change.
xgb = XGBClassifier(random_state=42, eval_metric='mlogloss')

# Hyper-parameter candidates: 3 * 3 * 3 * 3 = 81 combinations.
param_grid = {
    'n_estimators': [99, 100, 101],
    'learning_rate': [0.001,0.11, 0.13],
    'max_depth': [2, 3, 4],
    'min_child_weight': [3, 4, 5],
}

# Exhaustive 5-fold search over the grid, selecting on accuracy.
gs = GridSearchCV(estimator=xgb, param_grid=param_grid,
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train_scaled, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

# Score the best candidate found by the search on the held-out test split.
# NOTE(review): the test split is scored after every tuning round in this
# notebook, so these test numbers also influence the grid choices.
best_xgb = gs.best_estimator_
y_test_pred = best_xgb.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Per-fold accuracies of the selected configuration on the training split.
cross_val_scores = cross_val_score(best_xgb, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score: {f1:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'learning_rate': 0.11, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 99}
Best Cross-Validation Accuracy: 0.7414
Test Set Accuracy: 0.7333
Cross-Validation Scores: [0.74376299 0.74870062 0.73615805 0.74005719 0.73823759]
Mean Cross-Validation Accuracy: 0.7414
Test Set F1 Score: 0.6746

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.42      0.56       391
           1       1.00      0.35      0.52       196
           2       0.73      0.97      0.83      3257
           3       0.63      0.12      0.21       937
           4       0.92      0.41      0.57        29

    accuracy                           0.73      4810
   macro avg       0.82      0.46      0.54      4810
weighted avg       0.73      0.73      0.67      4810



In [10]:
# Third XGBoost tuning round: a finer learning-rate grid starting at 0.11.
# The label has five classes (0-4), so the matching evaluation metric is the
# multiclass log loss 'mlogloss'; 'logloss' is the binary metric and only went
# unnoticed because no eval_set is passed to fit(). The fitted model is
# unaffected by this change.
xgb = XGBClassifier(random_state=42, eval_metric='mlogloss')

# Hyper-parameter candidates: 3 * 3 * 3 * 3 = 81 combinations.
param_grid = {
    'n_estimators': [98, 99, 100],
    'learning_rate': [0.11,0.111, 0.112],
    'max_depth': [2, 3, 4],
    'min_child_weight': [3, 4, 5],
}

# Exhaustive 5-fold search over the grid, selecting on accuracy.
gs = GridSearchCV(estimator=xgb, param_grid=param_grid,
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train_scaled, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

# Score the best candidate found by the search on the held-out test split.
# NOTE(review): the test split is scored after every tuning round in this
# notebook, so these test numbers also influence the grid choices.
best_xgb = gs.best_estimator_
y_test_pred = best_xgb.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Per-fold accuracies of the selected configuration on the training split.
cross_val_scores = cross_val_score(best_xgb, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score: {f1:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'learning_rate': 0.11, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 98}
Best Cross-Validation Accuracy: 0.7414
Test Set Accuracy: 0.7337
Cross-Validation Scores: [0.74402287 0.74870062 0.73615805 0.74031713 0.73797764]
Mean Cross-Validation Accuracy: 0.7414
Test Set F1 Score: 0.6749

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.42      0.56       391
           1       1.00      0.35      0.52       196
           2       0.73      0.97      0.83      3257
           3       0.63      0.12      0.21       937
           4       0.92      0.41      0.57        29

    accuracy                           0.73      4810
   macro avg       0.82      0.46      0.54      4810
weighted avg       0.73      0.73      0.67      4810



In [11]:
# Fourth XGBoost tuning round: wider max_depth / min_child_weight ranges.
# The label has five classes (0-4), so the matching evaluation metric is the
# multiclass log loss 'mlogloss'; 'logloss' is the binary metric and only went
# unnoticed because no eval_set is passed to fit(). The fitted model is
# unaffected by this change.
xgb = XGBClassifier(random_state=42, eval_metric='mlogloss')

# Hyper-parameter candidates: 3 * 3 * 4 * 4 = 144 combinations.
param_grid = {
    'n_estimators': [97, 98, 99],
    'learning_rate': [0.11, 0.111, 0.112],
    'max_depth': [2, 3, 4, 5],
    'min_child_weight': [3, 4, 5, 6],
}

# Exhaustive 5-fold search over the grid, selecting on accuracy.
gs = GridSearchCV(estimator=xgb, param_grid=param_grid,
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train_scaled, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

# Score the best candidate found by the search on the held-out test split.
# NOTE(review): the test split is scored after every tuning round in this
# notebook, so these test numbers also influence the grid choices.
best_xgb = gs.best_estimator_
y_test_pred = best_xgb.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Per-fold accuracies of the selected configuration on the training split.
cross_val_scores = cross_val_score(best_xgb, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score: {f1:.4f}")

# Per-class precision / recall / F1.
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Parameters: {'learning_rate': 0.11, 'max_depth': 3, 'min_child_weight': 6, 'n_estimators': 97}
Best Cross-Validation Accuracy: 0.7416
Test Set Accuracy: 0.7341
Cross-Validation Scores: [0.74350312 0.7489605  0.73693787 0.74031713 0.73849753]
Mean Cross-Validation Accuracy: 0.7416
Test Set F1 Score: 0.6758

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.43      0.56       391
           1       1.00      0.35      0.52       196
           2       0.73      0.97      0.83      3257
           3       0.64      0.13      0.21       937
           4       0.92      0.41      0.57        29

    accuracy                           0.73      4810
   macro avg       0.82      0.46      0.54      4810
weighted avg       0.73      0.73      0.68      4810

