In [1]:
# 범주형 - 수치형 구조에 대한 데이터 분석

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.preprocessing import LabelEncoder


print('numpy version:', np.__version__)
print('pandas version:', pd.__version__)
print('seaborn version:', sns.__version__)
print(f"matplotlib: mpl {plt.matplotlib.__version__}")
print('sklearn version:', sklearn.__version__)

font_path = "c:/Windows/Fonts/malgun.ttf"
font_prop = mpl.font_manager.FontProperties(fname=font_path)
mpl.rcParams['font.family'] = font_prop.get_name()
mpl.rcParams['axes.unicode_minus'] = False

numpy version: 2.1.0
pandas version: 2.2.2
seaborn version: 0.13.2
matplotlib: mpl 3.9.2
sklearn version: 1.5.1


In [2]:
accident = pd.read_csv('data.csv.csv')

In [3]:
accident.head()

Unnamed: 0,가해운전자 연령,가해운전자 상해정도,피해운전자 연령,피해운전자 상해정도,요일_금요일,요일_목요일,요일_수요일,요일_월요일,요일_일요일,요일_토요일,...,도로형태_주차장,가해운전자 차종_승용,가해운전자 차종_이륜,가해운전자 차종_자전거,가해운전자 차종_화물,피해운전자 차종_보행자,피해운전자 차종_승용,피해운전자 차종_이륜,피해운전자 차종_자전거,피해운전자 차종_화물
0,31,3,65.0,1,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
1,32,3,54.0,4,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
2,26,3,26.0,1,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
3,29,3,25.0,0,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
4,42,3,37.0,0,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False


In [13]:
accident.drop(['피해운전자 상해정도', '피해운전자 연령'], axis=1)

Unnamed: 0,가해운전자 연령,가해운전자 상해정도,요일_금요일,요일_목요일,요일_수요일,요일_월요일,요일_일요일,요일_토요일,요일_화요일,년,...,도로형태_주차장,가해운전자 차종_승용,가해운전자 차종_이륜,가해운전자 차종_자전거,가해운전자 차종_화물,피해운전자 차종_보행자,피해운전자 차종_승용,피해운전자 차종_이륜,피해운전자 차종_자전거,피해운전자 차종_화물
0,31,3,False,False,True,False,False,False,False,2014,...,False,True,False,False,False,False,True,False,False,False
1,32,3,False,False,True,False,False,False,False,2014,...,False,True,False,False,False,False,True,False,False,False
2,26,3,False,False,True,False,False,False,False,2014,...,False,True,False,False,False,False,True,False,False,False
3,29,3,False,False,True,False,False,False,False,2014,...,False,True,False,False,False,False,True,False,False,False
4,42,3,False,False,True,False,False,False,False,2014,...,False,True,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24042,31,3,False,False,False,False,False,True,False,2023,...,False,True,False,False,False,True,False,False,False,False
24043,63,3,False,False,False,False,False,True,False,2023,...,False,True,False,False,False,False,False,False,False,True
24044,27,3,False,False,False,False,False,True,False,2023,...,False,True,False,False,False,False,True,False,False,False
24045,48,3,False,False,False,False,True,False,False,2023,...,False,True,False,False,False,False,True,False,False,False


In [14]:
accident.rename(columns={'가해운전자 상해정도': '가해운전자_상해정도'}, inplace=True)

In [15]:
X = accident.drop('가해운전자_상해정도', axis=1)
y = accident['가해운전자_상해정도']

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 'pf'는 데이터프레임, 'target'은 예측하려는 다중분류 대상 열이라고 가정

# 데이터를 학습용과 테스트용으로 나눔 (80% 학습, 20% 테스트)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# GradientBoostingClassifier 모델 초기화
gbc = GradientBoostingClassifier() 

# 모델 학습
gbc.fit(X_train, y_train)

# 테스트 데이터에 대해 예측
y_pred = gbc.predict(X_test)

# 모델의 정확도 평가
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9385


In [17]:
from sklearn.model_selection import cross_val_score

# 5-폴드 교차 검증으로 모델 성능 평가
cv_scores = cross_val_score(gbc, X, y, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f}")


Cross-validation scores: [0.63014553 0.93035343 0.91016843 0.93532959 0.93553753]
Mean CV Accuracy: 0.8683


In [20]:
gbc = GradientBoostingClassifier()

from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [25, 50, 100 ,200],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 5, 7],
    'subsample' : [0.5, 0.7, 1],
    'min_samples_split' : [2, 4, 6, 8]
}

grid_search = GridSearchCV(gbc, param_grid=params, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_split': 8, 'n_estimators': 200, 'subsample': 0.7}


In [15]:
params = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 5],
    'subsample' : [0.5, 0.7, 0.8],
    'min_samples_split' : [2, 4, 6, 8]
}

In [21]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(gbc, param_grid=params, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")


KeyboardInterrupt: 

In [23]:
# Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_split': 8, 'n_estimators': 200, 'subsample': 0.7}


gbc = GradientBoostingClassifier()

from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [200],
    'learning_rate': [0.01],
    'max_depth': [5],
    'subsample' : [0.7],
    'min_samples_split' : [8]
}

grid_search = GridSearchCV(gbc, param_grid=params, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")


Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_split': 8, 'n_estimators': 200, 'subsample': 0.7}


In [24]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred)}")


Test Accuracy: 0.9395010395010395
