### [2025_12_22 : 이미지 분류 모델]

In [25]:
## ==================================================
## [1-1] 모듈 로딩
## ==================================================
import pandas as pd 
import numpy as np
import os
#import cv2

## ML학습 관련
from sklearn.tree import DecisionTreeClassifier

## ML 데이터셋 및 전처리 관련
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## ML CV, Pipeline 관련 => 모델 일반화/최적 하이퍼파라미터 조사 및 데이터 누수 해결
from sklearn.model_selection import GridSearchCV

## ML 성능지표 관련
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.metrics import classification_report 

In [26]:
## ==================================================
## [1-2] 데이터 준비 및 확인
## ==================================================
## 데이터
DATA_FILE  = '../Data/csv/fruits.csv'
DATA_FILE2 = '../Data/csv/dubai.csv'

## 데이터 로딩
df1 = pd.read_csv(DATA_FILE, header=None)
df2 = pd.read_csv(DATA_FILE2, header=None)
df = pd.concat([df1, df2], axis=0)

## 데이터 기본정보 확인
display( df.head(2) )
display( df.tail(2) )
df.info()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4891,4892,4893,4894,4895,4896,4897,4898,4899,4900
0,apple,98,101,103,95,94,92,97,103,106,...,129,134,123,122,120,103,107,102,109,113
1,apple,233,233,233,232,232,232,232,232,232,...,232,232,232,232,231,231,230,229,229,228


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4891,4892,4893,4894,4895,4896,4897,4898,4899,4900
24,Dubai,184,186,188,192,198,200,197,168,104,...,198,206,206,207,208,205,208,205,203,197
25,Dubai,98,102,101,95,93,93,96,105,106,...,130,139,129,125,117,93,101,99,107,110


<class 'pandas.core.frame.DataFrame'>
Index: 52 entries, 0 to 25
Columns: 4901 entries, 0 to 4900
dtypes: int64(4900), object(1)
memory usage: 1.9+ MB


In [32]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4891,4892,4893,4894,4895,4896,4897,4898,4899,4900
0,apple,98,101,103,95,94,92,97,103,106,...,129,134,123,122,120,103,107,102,109,113
1,apple,233,233,233,232,232,232,232,232,232,...,232,232,232,232,231,231,230,229,229,228
2,apple,200,200,213,213,218,217,219,220,219,...,65,65,60,55,51,49,51,49,57,62
3,apple,234,233,234,233,233,233,233,233,233,...,56,53,82,70,60,69,51,87,79,80
4,apple,210,210,210,210,210,210,210,210,210,...,187,182,187,188,189,190,190,190,191,192
5,apple,184,184,186,191,198,201,195,168,99,...,203,203,205,208,208,202,209,206,204,199
6,apple,167,168,171,181,188,185,185,182,185,...,173,168,151,153,165,166,152,145,150,153
7,apple,158,138,129,128,120,111,111,108,107,...,39,30,16,15,31,50,36,24,32,39
8,apple,234,232,228,226,223,220,220,225,227,...,228,226,222,221,220,219,216,218,221,224
9,apple,215,215,215,216,215,216,216,216,216,...,227,228,228,228,228,228,228,227,227,227


In [27]:
## ==================================================
## [2-1] 피쳐/타겟 분리
## ==================================================
featureDF = df[df.columns[1:]]
targetSR  = df[df.columns[0]]

print(f'featureDF:{featureDF.shape},  targetSR:{targetSR.shape}')

featureDF:(52, 4900),  targetSR:(52,)


In [28]:
## ==================================================
## [2-2] 학습용/테스트용 분리
## ==================================================
x_train, x_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    test_size=0.15,
                                                    random_state=42,
                                                    stratify=targetSR)

print(f'[TRAIN] x_train:{x_train.shape},  y_train:{y_train.shape}')
print(f'[TEST] x_test:{x_test.shape},  y_test:{y_test.shape}')

[TRAIN] x_train:(44, 4900),  y_train:(44,)
[TEST] x_test:(8, 4900),  y_test:(8,)


In [29]:
# 최적의 하이퍼파라미터 찾기
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'criterion': ['gini', 'entropy']
}

grid_rf = GridSearchCV(estimator=rf,
                       param_grid=param_grid,
                       cv=5,
                       scoring='accuracy',      # 정확도! 얼마나 맞췄는가가 중요 -> 과일 분류의 목적 = 정확한 분류
                       n_jobs=-1
)

grid_rf.fit(x_train, y_train)

# 결과 확인
print("최적 파라미터:", grid_rf.best_params_)
print("정확도:", grid_rf.best_score_)


최적 파라미터: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
정확도: 0.16111111111111112


In [30]:
# 모델 생성
rModel = RandomForestClassifier(n_estimators=100,
                                criterion='gini',
                                min_samples_split= 5,
                                random_state=42)

# 학습
rModel.fit(x_train, y_train)

# 예측
y_pred = rModel.predict(x_test)

In [31]:
# 성능 평가
train_score = rModel.score(x_train, y_train)
test_score  = rModel.score(x_test, y_test)

print(f'train_score : {train_score}   test_score : {test_score}')

train_score : 0.5909090909090909   test_score : 0.0
