### [2025_12_11]_독버섯 감지 & 유방암 감지
- 필수
   * 교차검증
   * 데이터 누수 안됨! 
   * 모델 : 앙상블 계열 => Voting, 배깅은 RandomForest

### 독버섯 감지기

[1] 모듈 로딩 및 데이터 준비 <hr>

In [2]:
## ==================================================
## [1-1] 모듈 로딩
## ==================================================
import pandas as pd
import numpy as np


## ML학습 관련
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

## ML 데이터셋 및 전처리 관련
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

## ML CV, Pipeline 관련
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
## ==================================================
## [1-2] 데이터 준비 및 확인
## ==================================================
## 데이터
DATA_FILE = '../Data/mushrooms.csv'

## 데이터 로딩
mrDF = pd.read_csv(DATA_FILE)

## 데이터 기본정보 확인
display (mrDF.head(3) )
mrDF.info()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

[2] 학습 준비 <hr>

In [4]:
## ==================================================
## [2-1] 피쳐/타겟 분리
## ==================================================
featureDF = mrDF[mrDF.columns[1:-1]]
targetSR  = mrDF[mrDF.columns[0]]

print(f'featureDF:{featureDF.shape},  targetSR:{targetSR.shape}')

featureDF:(8124, 21),  targetSR:(8124,)


In [5]:
## ==================================================
## [2-2] 학습용/테스트용 분리
## ==================================================
x_train, x_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=targetSR)

print(f'[TRAIN] x_train:{x_train.shape},  y_train:{y_train.shape}')
print(f'[TEST] x_test:{x_test.shape},  y_test:{y_test.shape}')

[TRAIN] x_train:(6499, 21),  y_train:(6499,)
[TEST] x_test:(1625, 21),  y_test:(1625,)


In [6]:
## ==================================================
## [2-3] 타켓 컬럼 인코딩 처리 
## ==================================================
## => 타겟은 클래스 구분만 하면 되므로 순서 의미 없는 레이블 인코딩
lbEncoder = LabelEncoder()

en_y_train = lbEncoder.fit_transform(y_train)
en_y_test  = lbEncoder.transform(y_test)

en_y_train  # 테스트 출력

array([1, 0, 0, ..., 0, 0, 0], shape=(6499,))

In [7]:
## ==================================================
## [2-4] 문자형 피쳐 컬럼 인코딩 처리
## ==================================================
## => 전부 명목형 범주 데이터로 크기, 순서 없이 종류만 있다
## => OneHotEncoding이 가장 적합해 보인다.
ohEncoder = OneHotEncoder(sparse_output=False)

oh_x_train = ohEncoder.fit_transform(x_train)
oh_x_test  = ohEncoder.transform(x_test)

oh_x_train  # 테스트 출력

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 1., 0.]], shape=(6499, 110))

In [8]:
## 보팅 학습에 사용될 모델/학습기들 생성
knn = KNeighborsClassifier()
svc = SVC(probability=True)
dt  = DecisionTreeClassifier(random_state=10)

In [9]:
## 보팅 인스턴스 생성
vtModel = VotingClassifier(estimators=[('knn',knn), ('svc', svc), ('dt', dt)])

## 보팅 동일 데이터셋으로 다른 학습 알고리즘으로 학습 진행
vtModel.fit(oh_x_train, en_y_train)

0,1,2
,estimators,"[('knn', ...), ('svc', ...), ...]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,10
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [10]:
## 학습 후 모델 파라미터 확인
print(f'classes_          : {vtModel.classes_}' )
print(f'estimators_       : {vtModel.estimators_}' )
print(f'named_estimators_ : {vtModel.named_estimators_}' )

classes_          : [0 1]
estimators_       : [KNeighborsClassifier(), SVC(probability=True), DecisionTreeClassifier(random_state=10)]
named_estimators_ : {'knn': KNeighborsClassifier(), 'svc': SVC(probability=True), 'dt': DecisionTreeClassifier(random_state=10)}


In [11]:
## 성능평가
vtModel.score(oh_x_test, en_y_test)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



1.0

In [12]:
## ===================================================================
## 교차검증을 통한 일반화 성능 체크 + 최적의 하이퍼파라미터 찾기
## => GrdiSearchCV : 분류/회귀 교차검증, 최적 파라미터값, 성능 
## => Pipeline     : 튜닝 시 데이터 누수에 대한 보완 
## ===================================================================
## [3-1] Pipeline 인스턴스 생성 
##       -> 문자형 피쳐 : 인코딩
##       -> 모델 인스턴스 
pipeline = Pipeline(steps=[  ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)), 
                             ('dt_cls', DecisionTreeClassifier(random_state=42)) ]) 

## [3-2] GridSearchCV 준비
##       -> 최적화할 하이퍼파라미터 값 dict 
##       -> 모델 인스턴스 
##            { "pipeline_모델변수명_ _하이퍼파라미터이름" : [학습알고리즘 클래스 확인]  }
param_dict_ = { "dt_cls__max_depth" : [3, 5, 7], 
                "dt_cls__criterion" : ['gini', 'entropy', 'log_loss']}

## GridSearchCV 인스턴스 생성
gridCV = GridSearchCV(pipeline,
                      param_grid=param_dict_,
                      cv=5,
                      scoring="accuracy",
                      return_train_score=True)


In [13]:
## [3-3] 교차검증 통한 최적의 하이퍼라마터 조사 및 성능 평가
gridCV.fit(x_train, en_y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'dt_cls__criterion': ['gini', 'entropy', ...], 'dt_cls__max_depth': [3, 5, ...]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [14]:
## 학습 후 모델 파라미터 통한 결과 확인 
print(f'best_score_  : {gridCV.best_score_}')
print(f'best_params_ : {gridCV.best_params_}')

best_score_  : 0.9996923076923077
best_params_ : {'dt_cls__criterion': 'gini', 'dt_cls__max_depth': 5}


In [15]:
## 전체 교차 검증별 학습점수, 검증점수 cv_results_
pd.options.mode.copy_on_write=True

resultDF  = pd.DataFrame(gridCV.cv_results_)
print( resultDF.columns )

resultDF  = resultDF[['rank_test_score', 'mean_test_score', 'mean_train_score', 'param_dt_cls__max_depth', 'param_dt_cls__criterion']]
resultDF['diff'] = abs(resultDF['mean_test_score'] - resultDF['mean_train_score'])

resultDF.rename(columns={'param_dt_cls__max_depth':'max_depth', 
                         'param_dt_cls__criterion':'criterion',
                         'rank_test_score':'rank_test'}, inplace=True)

resultDF.sort_values(by=['rank_test'])


Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_dt_cls__criterion', 'param_dt_cls__max_depth', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score', 'split0_train_score',
       'split1_train_score', 'split2_train_score', 'split3_train_score',
       'split4_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')


Unnamed: 0,rank_test,mean_test_score,mean_train_score,max_depth,criterion,diff
1,1,0.999692,0.999846,5,gini,0.000153824
2,1,0.999692,1.0,7,gini,0.0003076923
5,1,0.999692,1.0,7,entropy,0.0003076923
4,1,0.999692,0.999846,5,entropy,0.000153824
7,1,0.999692,0.999846,5,log_loss,0.000153824
8,1,0.999692,1.0,7,log_loss,0.0003076923
0,7,0.985998,0.985998,3,gini,9.007294e-08
3,8,0.962609,0.96261,3,entropy,1.587724e-07
6,8,0.962609,0.96261,3,log_loss,1.587724e-07


### - 독버섯 감지기 결론
- Voting 모델은 단일 평가에서는 정확도 1.0 => 교차검증 결과 정확도 0.99로 소폭 감소
- 이는 모델의 일반화 성능 추정이 더 보수적으로 나온 것으로, 일부 과적합 가능성도 시사할 수 있다.
    * (교차검증 점수가 낮게 나오는 건 모델이 나쁜 게 아니라, 평가가 정직해졌다는 뜻이다.)


### 유방암 감지기

In [16]:
## ==================================================
## [1-1] 모듈 로딩
## ==================================================
import pandas as pd
import numpy as np


## ML학습 관련
from sklearn.ensemble import RandomForestClassifier

## ML 데이터셋 및 전처리 관련
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, LabelEncoder

## ML CV, Pipeline 관련
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

## ML 성능지표 관련
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.metrics import classification_report

## 시각화 관련
import matplotlib.pyplot as plt
import graphviz

In [17]:
## ==================================================
## [1-2] 데이터 준비 및 확인
## ==================================================
## 데이터
DATA_FILE2 = '../Data/wdbc.csv'

## 데이터 로딩
wdDF = pd.read_csv(DATA_FILE2)

## 데이터 기본정보 확인
display (wdDF.head(3) )
wdDF.info()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave_points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [18]:
## 필요없는 id 컬럼 삭제
wdDF = wdDF.drop('id', axis=1)

In [19]:
## ==================================================
## [2-1] 피쳐/타겟 분리
## ==================================================
featureDF = wdDF[wdDF.columns[1:-1]]
targetSR  = wdDF[wdDF.columns[0]]

print(f'featureDF:{featureDF.shape},  targetSR:{targetSR.shape}')

featureDF:(569, 29),  targetSR:(569,)


In [20]:
## ==================================================
## [2-2] 학습용/테스트용 분리
## ==================================================
x_train, x_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=targetSR)

print(f'[TRAIN] x_train:{x_train.shape},  y_train:{y_train.shape}')
print(f'[TEST] x_test:{x_test.shape},  y_test:{y_test.shape}')

[TRAIN] x_train:(455, 29),  y_train:(455,)
[TEST] x_test:(114, 29),  y_test:(114,)


In [21]:
## ==================================================
## [2-3] 타켓 컬럼 인코딩 처리 
## ==================================================
## => 타겟은 클래스 구분만 하면 되므로 순서 의미 없는 레이블 인코딩
lbEncoder = LabelEncoder()

en_y_train = lbEncoder.fit_transform(y_train)
en_y_test  = lbEncoder.transform(y_test)

In [22]:
## ==================================================
## [2-4] 수치형 피쳐 컬럼 스케일링 처리
## ==================================================
## => 랜덤포레스트는 값의 크기가 아닌 분기의 기준을 보기 때문에
##    RandomForest 단독 모델에서 스케일링 생략 가능

In [23]:
## 모델 학습 및 예측 평가하기
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, en_y_train)
pred = rf.predict(x_test)

print("Test Accuracy:", accuracy_score(en_y_test, pred))
print("Test Precision:", precision_score(en_y_test, pred))
print("Test Recall:", recall_score(en_y_test, pred))
print("Test F1-Score:", f1_score(en_y_test, pred, average='macro'))

print(classification_report(en_y_test, pred, target_names=lbEncoder.classes_))

Test Accuracy: 0.9736842105263158
Test Precision: 1.0
Test Recall: 0.9285714285714286
Test F1-Score: 0.9712773998488284
              precision    recall  f1-score   support

           B       0.96      1.00      0.98        72
           M       1.00      0.93      0.96        42

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114



In [24]:
## cv로 일반화 성능 체크를 통한 교차검증
from sklearn.model_selection import StratifiedKFold, cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(
    rf, featureDF, lbEncoder.fit_transform(targetSR),
    cv=cv,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    return_train_score=True
)

print("CV valid accuracy mean:", scores['test_accuracy'].mean())
print("CV valid f1 mean:", scores['test_f1'].mean())
print("CV train accuracy mean:", scores['train_accuracy'].mean())


CV valid accuracy mean: 0.9543238627542306
CV valid f1 mean: 0.9370620842572063
CV train accuracy mean: 1.0


In [25]:
## 최적의 하이퍼파라미터 찾기 + 
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 400],
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=cv,
    scoring='f1',          
    n_jobs=-1,
    return_train_score=True
)

grid.fit(x_train, en_y_train)

print("best params:", grid.best_params_)
print("best cv score:", grid.best_score_)
best_rf = grid.best_estimator_

print("Test score:", best_rf.score(x_test, en_y_test))


best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
best cv score: 0.9521430281746349
Test score: 0.9736842105263158
