In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn

print('numpy version:', np.__version__)
print('pandas version:', pd.__version__)
print('seaborn version:', sns.__version__)
print(f"matplotlib: mpl {plt.matplotlib.__version__}")
print('sklearn version:', sklearn.__version__)

font_path = "c:/Windows/Fonts/malgun.ttf"
font_prop = mpl.font_manager.FontProperties(fname=font_path)
mpl.rcParams['font.family'] = font_prop.get_name()
mpl.rcParams['axes.unicode_minus'] = False

numpy version: 2.0.1
pandas version: 2.2.2
seaborn version: 0.13.2
matplotlib: mpl 3.9.2
sklearn version: 1.5.1


In [2]:
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,가해운전자 연령,가해운전자 상해정도,피해운전자 연령,피해운전자 상해정도,요일_금요일,요일_목요일,요일_수요일,요일_월요일,요일_일요일,요일_토요일,...,도로형태_주차장,가해운전자 차종_승용,가해운전자 차종_이륜,가해운전자 차종_자전거,가해운전자 차종_화물,피해운전자 차종_보행자,피해운전자 차종_승용,피해운전자 차종_이륜,피해운전자 차종_자전거,피해운전자 차종_화물
0,31,3,65.0,1,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
1,32,3,54.0,4,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
2,26,3,26.0,1,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
3,29,3,25.0,0,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False
4,42,3,37.0,0,False,False,True,False,False,False,...,False,True,False,False,False,False,True,False,False,False


피해운전자 상해정도 예측

In [3]:
target = data['피해운전자 상해정도']
train = data.drop('피해운전자 상해정도', axis=1)

In [4]:
train.head()

Unnamed: 0,가해운전자 연령,가해운전자 상해정도,피해운전자 연령,요일_금요일,요일_목요일,요일_수요일,요일_월요일,요일_일요일,요일_토요일,요일_화요일,...,도로형태_주차장,가해운전자 차종_승용,가해운전자 차종_이륜,가해운전자 차종_자전거,가해운전자 차종_화물,피해운전자 차종_보행자,피해운전자 차종_승용,피해운전자 차종_이륜,피해운전자 차종_자전거,피해운전자 차종_화물
0,31,3,65.0,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
1,32,3,54.0,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
2,26,3,26.0,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
3,29,3,25.0,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
4,42,3,37.0,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False


In [5]:
target.head()

0    1
1    4
2    1
3    0
4    0
Name: 피해운전자 상해정도, dtype: int64

In [6]:
# 마지막 평가를 위한 test 나누기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.2, stratify=target, random_state=42)

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4], 
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")



Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7343


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200, 300, 500],
    'max_depth': [None, 30, 50],
    'min_samples_split': [10, 15],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74324324 0.73258836 0.74265662 0.74161684 0.72809982]
Mean Cross-Validation Accuracy: 0.7376


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200, 150, 300], 
    'max_depth': [None, 30, 50],
    'min_samples_split': [10, 20],
    'min_samples_leaf': [4, 6, 8],
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")


cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74324324 0.73258836 0.74265662 0.74161684 0.72809982]
Mean Cross-Validation Accuracy: 0.7376


In [19]:
param_grid = {
    'n_estimators': [190, 200, 210],
    'max_depth': [None, 20],
    'min_samples_split': [10, 11, 12],
    'min_samples_leaf': [4, 5], 
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74324324 0.73258836 0.74265662 0.74161684 0.72809982]
Mean Cross-Validation Accuracy: 0.7376


In [20]:
param_grid = {
    'n_estimators': [195, 200, 205],
    'min_samples_split': [8, 9, 10, 11],
    'min_samples_leaf': [2, 3, 4, 5], 
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 96 candidates, totalling 480 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74324324 0.73258836 0.74265662 0.74161684 0.72809982]
Mean Cross-Validation Accuracy: 0.7376


In [21]:
param_grid = {
    'n_estimators': [200, 205, 800],
    'min_samples_split': [6, 7, 10],
    'min_samples_leaf': [2, 3, 4], 
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74324324 0.73258836 0.74265662 0.74161684 0.72809982]
Mean Cross-Validation Accuracy: 0.7376


In [8]:
param_grid = {
    'n_estimators': [200, 205, 1000],
    'min_samples_split': [5, 10,12],
    'min_samples_leaf': [4, 5, 6], 
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74324324 0.73258836 0.74265662 0.74161684 0.72809982]
Mean Cross-Validation Accuracy: 0.7376


In [9]:
param_grid = {
    'n_estimators': [196, 200, 204],
    'min_samples_split': [5, 10, 12],
    'min_samples_leaf': [4, 5, 6], 
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74324324 0.73258836 0.74265662 0.74161684 0.72809982]
Mean Cross-Validation Accuracy: 0.7376


In [10]:
param_grid = {
    'n_estimators': [197, 200, 203],
    'min_samples_split': [5, 10, 12],
    'min_samples_leaf': [4, 5, 6], 
    'max_features': ['auto', 'sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits


135 fits failed out of a total of 405.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
68 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\miniconda3\envs\baseball_project\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\miniconda3\envs\baseball_project\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\user\miniconda3\envs\baseball_project\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\user\miniconda3\envs\baseball_project\Lib\site-packages\sklearn\utils\_p

Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74324324 0.73258836 0.74265662 0.74161684 0.72809982]
Mean Cross-Validation Accuracy: 0.7376


In [11]:
param_grid = {
    'n_estimators': [198, 200, 202],
    'min_samples_split': [5, 10, 12],
    'min_samples_leaf': [4, 5, 6], 
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7376
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74324324 0.73258836 0.74265662 0.74161684 0.72809982]
Mean Cross-Validation Accuracy: 0.7376


In [12]:
param_grid = {
    'n_estimators': [199, 200, 201],
    'min_samples_split': [6, 7, 10],
    'min_samples_leaf': [3, 4, 5, 6], 
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 199}
Best Cross-Validation Accuracy: 0.7377
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74350312 0.73284823 0.74265662 0.74109696 0.72835976]
Mean Cross-Validation Accuracy: 0.7377


In [13]:
param_grid = {
    'n_estimators': [199, 200],
    'min_samples_split': [9, 10, 11],
    'min_samples_leaf': [2, 3, 4, 5, 6], 
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_

y_test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 60 candidates, totalling 300 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 199}
Best Cross-Validation Accuracy: 0.7377
Test Set Accuracy: 0.7343
Cross-Validation Scores: [0.74350312 0.73284823 0.74265662 0.74109696 0.72835976]
Mean Cross-Validation Accuracy: 0.7377


In [15]:
# 무한대 값 확인 함수
def check_inf_values(df):
    inf_mask = np.isinf(df)  # 무한대 값이 있는 위치를 True로 표시
    inf_count_per_column = inf_mask.sum()  # 각 열의 무한대 값 개수 계산
    total_inf_count = inf_mask.sum().sum()  # 전체 무한대 값 개수 계산
    
    print("무한대 값이 있는 열별 개수:")
    print(inf_count_per_column[inf_count_per_column > 0])  # 무한대 값이 있는 열만 출력
    print(f"\n전체 무한대 값 개수: {total_inf_count}")

# 예시 사용법 (X_train과 X_test가 DataFrame인 경우)
check_inf_values(X_train)
check_inf_values(X_test)

무한대 값이 있는 열별 개수:
Series([], dtype: int64)

전체 무한대 값 개수: 0
무한대 값이 있는 열별 개수:
Series([], dtype: int64)

전체 무한대 값 개수: 0


### 스케일링

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [199, 200],
    'min_samples_split': [9, 10, 11],
    'min_samples_leaf': [3, 4, 5, 6], 
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train_scaled, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_
y_test_pred = best_rf.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

cross_val_scores = cross_val_score(best_rf, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 199}
Best Cross-Validation Accuracy: 0.7377
Test Set Accuracy: 0.7345
Cross-Validation Scores: [0.74324324 0.73284823 0.74265662 0.74109696 0.72887965]
Mean Cross-Validation Accuracy: 0.7377


In [17]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [199, 200],
    'min_samples_split': [9, 10, 11],
    'min_samples_leaf': [3, 4, 5, 6], 
    'max_features': ['sqrt', 'log2'],
}

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train_scaled, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_
y_test_pred = best_rf.predict(X_test_scaled)


test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# F1-스코어 계산 (평균 'macro' 또는 'weighted'로 선택)
f1 = f1_score(y_test, y_test_pred, average='weighted')  # 다중 클래스인 경우 'weighted'를 사용
print(f"Test Set F1 Score (Weighted): {f1:.4f}")

# 정밀도, 재현율, F1-스코어
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 199}
Best Cross-Validation Accuracy: 0.7377
Test Set Accuracy: 0.7345
Test Set F1 Score (Weighted): 0.6768

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.97      0.84      3257
           1       0.86      0.37      0.51       196
           2       1.00      0.07      0.13        29
           3       0.78      0.44      0.56       391
           4       0.63      0.13      0.22       937

    accuracy                           0.73      4810
   macro avg       0.80      0.40      0.45      4810
weighted avg       0.72      0.73      0.68      4810



### StratifiedKFold 사용
각 폴드에서 클래스 분포가 비슷하도록 하거나, 폴드 수를 늘려 더 세밀한 검증을 시도

In [18]:
from sklearn.model_selection import StratifiedKFold
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [199, 200],
    'min_samples_split': [9, 10, 11],
    'min_samples_leaf': [3, 4, 5, 6], 
    'max_features': ['sqrt', 'log2'],
}

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=skf, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train_scaled, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_
y_test_pred = best_rf.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score (Weighted): {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

cross_val_scores = cross_val_score(best_rf, X_train_scaled, y_train, cv=skf, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 10 folds for each of 48 candidates, totalling 480 fits
Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.7369
Test Set Accuracy: 0.7310
Test Set F1 Score (Weighted): 0.6745

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.96      0.83      3257
           1       0.84      0.37      0.51       196
           2       1.00      0.03      0.07        29
           3       0.77      0.44      0.56       391
           4       0.59      0.14      0.22       937

    accuracy                           0.73      4810
   macro avg       0.79      0.39      0.44      4810
weighted avg       0.71      0.73      0.67      4810

Cross-Validation Scores: [0.73180873 0.73076923 0.73544699 0.73492723 0.74376299 0.73804574
 0.73180873 0.7425897  0.73842954 0.74154966]
Mean Cross-Validation Accuracy: 0.7369


In [19]:
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [200, 201, 202],
    'min_samples_split': [9, 10, 11],
    'min_samples_leaf': [2, 3, 4, 5], 
    'max_features': ['sqrt', 'log2'],
}

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

gs = GridSearchCV(estimator=rf, param_grid=param_grid, 
                  cv=skf, scoring='accuracy', n_jobs=-1, verbose=2)

gs.fit(X_train_scaled, y_train)

print(f"Best Parameters: {gs.best_params_}")
print(f"Best Cross-Validation Accuracy: {gs.best_score_:.4f}")

best_rf = gs.best_estimator_
y_test_pred = best_rf.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test Set F1 Score (Weighted): {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

cross_val_scores = cross_val_score(best_rf, X_train_scaled, y_train, cv=skf, scoring='accuracy')
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {cross_val_scores.mean():.4f}")

Fitting 10 folds for each of 72 candidates, totalling 720 fits
Best Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 202}
Best Cross-Validation Accuracy: 0.7377
Test Set Accuracy: 0.7326
Test Set F1 Score (Weighted): 0.6770

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.97      0.83      3257
           1       0.86      0.37      0.51       196
           2       1.00      0.07      0.13        29
           3       0.77      0.45      0.57       391
           4       0.60      0.14      0.23       937

    accuracy                           0.73      4810
   macro avg       0.79      0.40      0.45      4810
weighted avg       0.72      0.73      0.68      4810

Cross-Validation Scores: [0.73076923 0.73232848 0.73596674 0.73700624 0.74376299 0.74116424
 0.73492723 0.74154966 0.7373895  0.7425897 ]
Mean Cross-Validation Accuracy: 0.7377
