### [2025_12_22 : 이미지 분류 모델]

In [None]:
## ==================================================
## [1-1] 모듈 로딩
## ==================================================
import pandas as pd 
import numpy as np
import os
import cv2

## ML학습 관련
from sklearn.ensemble import RandomForestClassifier

## ML 데이터셋 및 전처리 관련
from sklearn.model_selection import train_test_split

## ML CV 관련
from sklearn.model_selection import GridSearchCV

## ML 성능지표 관련
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 

In [44]:
## ==================================================
## [1-2] 데이터 준비 및 확인
## ==================================================
## 데이터
DATA_FILE  = '../Data/csv/apples.csv'
DATA_FILE2 = '../Data/csv/Dubai.csv'

## 데이터 로딩
df1 = pd.read_csv(DATA_FILE, header=None)
df2 = pd.read_csv(DATA_FILE2, header=None)
df = pd.concat([df1, df2], axis=0)

## 데이터 기본정보 확인
display( df.head(2) )
display( df.tail(2) )
df.info()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4891,4892,4893,4894,4895,4896,4897,4898,4899,4900
0,apple,117,122,128,129,112,116,119,118,125,...,184,182,181,180,183,185,184,182,180,180
1,apple,206,207,208,210,211,211,212,212,213,...,235,235,234,234,234,234,234,234,234,234


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4891,4892,4893,4894,4895,4896,4897,4898,4899,4900
25,dubai,98,102,101,95,93,93,96,105,106,...,130,139,129,125,117,93,101,99,107,110
26,dubai,100,100,99,99,94,94,98,98,101,...,125,125,117,117,118,118,112,112,117,117


<class 'pandas.core.frame.DataFrame'>
Index: 94 entries, 0 to 26
Columns: 4901 entries, 0 to 4900
dtypes: int64(4900), object(1)
memory usage: 3.5+ MB


In [45]:
## ==================================================
## [2-1] 피쳐/타겟 분리
## ==================================================
featureDF = df[df.columns[1:]]
targetSR  = df[df.columns[0]]

print(f'featureDF:{featureDF.shape},  targetSR:{targetSR.shape}')

featureDF:(94, 4900),  targetSR:(94,)


In [46]:
## ==================================================
## [2-2] 학습용/테스트용 분리
## ==================================================
x_train, x_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    test_size=0.15,
                                                    random_state=42,
                                                    stratify=targetSR)

print(f'[TRAIN] x_train:{x_train.shape},  y_train:{y_train.shape}')
print(f'[TEST] x_test:{x_test.shape},  y_test:{y_test.shape}')

[TRAIN] x_train:(79, 4900),  y_train:(79,)
[TEST] x_test:(15, 4900),  y_test:(15,)


In [47]:
# 최적의 하이퍼파라미터 찾기
rf = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'criterion': ['gini', 'entropy']
}

grid_rf = GridSearchCV(estimator=rf,
                       param_grid=param_grid,
                       cv=5,
                       scoring='accuracy',
                       n_jobs=-1
)

grid_rf.fit(x_train, y_train)

# 결과 확인
print("최적 파라미터:", grid_rf.best_params_)
print("정확도:", grid_rf.best_score_)


최적 파라미터: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
정확도: 0.8483333333333334


In [None]:
# 모델 생성
rModel = RandomForestClassifier(n_estimators=100,
                                criterion='gini',
                                min_samples_leaf= 1,
                                min_samples_split= 2,
                                random_state=42)

# 학습
rModel.fit(x_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [52]:
# 성능 평가
train_score = rModel.score(x_train, y_train)
test_score  = rModel.score(x_test, y_test)

print(f'train_score : {train_score}   test_score : {test_score}')

train_score : 1.0   test_score : 0.8


In [53]:
# 예측
y_pred = rModel.predict(x_test)

# 성능 평가 지표
print("정확도:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

정확도: 0.8
              precision    recall  f1-score   support

       apple       0.79      1.00      0.88        11
       dubai       1.00      0.25      0.40         4

    accuracy                           0.80        15
   macro avg       0.89      0.62      0.64        15
weighted avg       0.84      0.80      0.75        15



### 임의의 이미지로 예측해보기

In [54]:
pred_img1 = '../Data/papple.jpg'    # 예측할 사과 이미지
pred_img2 = '../Data/pdubai.jpg'    # 예측할 두쫀쿠 이미지

In [57]:
## 이미지 크기 및 벡터 변환
img1 = cv2.imread(pred_img1, cv2.IMREAD_GRAYSCALE)    
img1 = cv2.resize(img1, (0, 0), fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)
img1 = cv2.resize(img1, (70, 70), interpolation = cv2.INTER_AREA)
img1 = img1.reshape(1, -1)

img2 = cv2.imread(pred_img2, cv2.IMREAD_GRAYSCALE)    
img2 = cv2.resize(img2, (0, 0), fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)
img2 = cv2.resize(img2, (70, 70), interpolation = cv2.INTER_AREA)
img2 = img2.reshape(1, -1)

In [68]:
pred_label = rModel.predict(img1)
proba = rModel.predict_proba(img1)

print("사진 예측 결과 :", pred_label[0])     

print(f'사과로 예측할 확률 : {proba[0][0]*100}%')
print(f'두쫀쿠로 예측할 확률 : {proba[0][1]*100}%')

사진 예측 결과 : apple
사과로 예측할 확률 : 97.0%
두쫀쿠로 예측할 확률 : 3.0%


In [69]:
pred_label = rModel.predict(img2)
proba = rModel.predict_proba(img2)

print("사진 예측 결과 :", pred_label[0])     

print(f'사과로 예측할 확률 : {proba[0][0]*100}%')
print(f'두쫀쿠로 예측할 확률 : {proba[0][1]*100}%')

사진 예측 결과 : apple
사과로 예측할 확률 : 50.0%
두쫀쿠로 예측할 확률 : 50.0%
