In [1]:
# 범주형 - 범주형 구조에 대한 데이터 분석
# 사고유형과 가해자상해정도
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.preprocessing import LabelEncoder


print('numpy version:', np.__version__)
print('pandas version:', pd.__version__)
print('seaborn version:', sns.__version__)
print(f"matplotlib: mpl {plt.matplotlib.__version__}")
print('sklearn version:', sklearn.__version__)

font_path = "c:/Windows/Fonts/malgun.ttf"
font_prop = mpl.font_manager.FontProperties(fname=font_path)
mpl.rcParams['font.family'] = font_prop.get_name()
mpl.rcParams['axes.unicode_minus'] = False

numpy version: 2.1.0
pandas version: 2.2.2
seaborn version: 0.13.2
matplotlib: mpl 3.9.2
sklearn version: 1.5.1


In [2]:
data = pd.read_csv("all_in_data.csv")
data.head()

Unnamed: 0,요일_월요일,요일_화요일,요일_수요일,요일_목요일,요일_금요일,요일_토요일,요일_일요일,년,월,일,...,기상상태_안개,기상상태_흐림,도로형태_교차로,도로형태_단일로,도로형태_주차장,차종_보행자,차종_승용,차종_이륜,차종_자전거,차종_화물
0,False,False,True,False,False,False,False,2014,1,1,...,False,False,False,True,False,False,True,False,False,False
1,False,False,True,False,False,False,False,2014,1,1,...,False,False,False,True,False,False,True,False,False,False
2,False,False,True,False,False,False,False,2014,1,1,...,False,False,False,True,False,False,True,False,False,False
3,False,False,True,False,False,False,False,2014,1,1,...,False,False,False,True,False,False,True,False,False,False
4,False,False,True,False,False,False,False,2014,1,1,...,False,False,False,True,False,False,True,False,False,False


In [3]:
features = ['대상', '사고유형_차대사람', '사고유형_차대차', '사고유형_차량단독', 
            '차종_보행자', '차종_승용', '차종_이륜', '차종_자전거', 
            '차종_화물', '연령', '상해정도']

data=data[features]

data_cleaned = data.dropna(subset=['상해정도', '연령'])
y = data_cleaned['상해정도']
X = data_cleaned.drop('상해정도', axis=1)

# X와 y를 확인
print(X.head())
print(y.head())

nan_count = y.isna().sum()
print(nan_count)

   대상  사고유형_차대사람  사고유형_차대차  사고유형_차량단독  차종_보행자  차종_승용  차종_이륜  차종_자전거  차종_화물  \
0   1      False      True      False   False   True  False   False  False   
1   0      False      True      False   False   True  False   False  False   
2   1      False      True      False   False   True  False   False  False   
3   0      False      True      False   False   True  False   False  False   
4   1      False      True      False   False   True  False   False  False   

     연령  
0  31.0  
1  65.0  
2  32.0  
3  54.0  
4  26.0  
0    0.0
1    1.0
2    0.0
3    3.0
4    0.0
Name: 상해정도, dtype: float64
0


In [4]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# 예시 데이터 로딩 및 전처리
# X, y는 각각 피쳐와 레이블을 포함하는 DataFrame입니다.
# X = ... (피쳐 데이터)
# y = ... (타겟 레이블)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# SMOTE를 사용하여 클래스 불균형 문제 해결
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 데이터 스케일링
scaler = StandardScaler()

# LightGBM 모델 설정
lgbm = lgb.LGBMClassifier(random_state=42)

# 하이퍼파라미터 그리드 설정
param_grid = {
    'num_leaves': [31, 50],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 150],
    'class_weight': [None, 'balanced']
}

# GridSearchCV를 사용한 하이퍼파라미터 튜닝
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, 
                           cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# 모델 학습
pipeline = Pipeline([
    ('scaler', scaler),
    ('classifier', grid_search)
])

pipeline.fit(X_train_resampled, y_train_resampled)

# 최적의 파라미터 및 성능 출력
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# 테스트 데이터로 예측 수행
y_test_pred = pipeline.predict(X_test)

# 성능 평가
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))


Fitting 5 folds for each of 16 candidates, totalling 80 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002586 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 282
[LightGBM] [Info] Number of data points in the train set: 80815, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
Best Parameters: {'class_weight': 'balanced', 'learning_rate': 0.1, 'n_estimators': 150, 'num_leaves': 50}
Best Cross-Validation Accuracy: 0.6796
Test Set Accuracy: 0.7709
Test Set F1 Score: 0.7741

Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.89      0

In [None]:
print(f"Resampled dataset shape {Counter(y_resampled)}")

# 2D 시각화를 위해 PCA 사용
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
X_resampled_pca = pca.transform(X_resampled)

# 시각화
plt.figure(figsize=(10, 6))

# 원본 데이터 시각화
plt.subplot(1, 2, 1)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', alpha=0.7)
plt.title("Original Dataset")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")

# 오버샘플링된 데이터 시각화
plt.subplot(1, 2, 2)
plt.scatter(X_resampled_pca[:, 0], X_resampled_pca[:, 1], c=y_resampled, cmap='viridis', alpha=0.7)
plt.title("SMOTE Oversampled Dataset")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")

plt.tight_layout()
plt.show()