# 多分类水果分类器

In [22]:
import numpy as np
import pandas as pd
import cv2

import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import glob
import os
import warnings
warnings.filterwarnings('ignore')

## 导入图片数据

In [2]:
# Load a subset of the training images together with their class labels.
# Each class lives in its own sub-folder of `path_head`, named after the fruit.
fruit_images = []
labels = []
fruit_names = ['Apple Braeburn', 'Apple Crimson Snow', 'Apple Red 2', 'Peach']
path_head = "Training200"
for fruit_name in fruit_names:
    # os.path.join keeps the path valid on every OS (the hard-coded "\\"
    # separator only worked on Windows).
    fruit_dir_path = os.path.join(path_head, fruit_name)
    # Sort the matches so the row order does not depend on the file system.
    for image_path in sorted(glob.glob(os.path.join(fruit_dir_path, "*.jpg"))):
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError("Could not read image: {0}".format(image_path))
        # OpenCV loads pixels as BGR; swap the channels to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        fruit_images.append(image)
        # The folder name is the class label.
        labels.append(fruit_name)
if not fruit_images:
    raise FileNotFoundError(
        "No '*.jpg' images found under '{0}' for classes {1}".format(path_head, fruit_names)
    )
fruit_images = np.array(fruit_images)
labels = np.array(labels)

In [3]:
# Give every distinct label an integer id (ids follow np.unique's sorted order)
# and build the reverse lookup from id back to label.
label_to_id_dict = {name: idx for idx, name in enumerate(np.unique(labels))}
id_to_label_dict = dict(zip(label_to_id_dict.values(), label_to_id_dict.keys()))
id_to_label_dict

{0: 'Apple Braeburn', 1: 'Apple Crimson Snow', 2: 'Apple Red 2', 3: 'Peach'}

In [4]:
# Encode each sample's label as its integer id.
label_ids = np.array(list(map(label_to_id_dict.__getitem__, labels)))

## PCA降维

In [5]:
# Turn every image into a flat feature vector and standardize each feature.
# The fitted scaler is reused later to transform the test images.
scaler = StandardScaler()
images_scaled = scaler.fit_transform([img.reshape(-1) for img in fruit_images])

In [6]:
# Reduce the standardized pixel vectors to 10 principal components.
# random_state is fixed because scikit-learn may pick the randomized SVD solver
# for a matrix of this size; without a seed the components (and every model
# trained on them) would differ from run to run.
pca = PCA(n_components=10, random_state=42)
pca_result = pca.fit_transform(images_scaled)

In [7]:
# One row per image: the PCA components plus the integer class id in 'label'.
data = pd.DataFrame(pca_result).assign(label=label_ids)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
0,-47.228477,-100.133847,45.148065,10.928912,14.380786,10.081558,20.413679,11.357985,-18.291389,7.678771,0
1,24.999174,-110.263826,31.297644,-23.99222,-3.127587,3.644606,-5.598442,-17.710118,1.731891,-6.591079,0
2,28.223779,-114.9282,24.32186,-36.092859,-1.928604,2.943227,-5.926898,-6.996024,2.967388,-6.046758,0
3,29.286895,-118.130954,22.334259,-35.262925,-0.415461,5.498085,-7.728139,-3.90387,5.198932,-2.798795,0
4,30.144339,-118.91712,22.583146,-33.91037,0.335816,6.315519,-8.50733,-3.554027,6.083558,-2.292688,0


In [8]:
# Split the frame into the feature matrix and the target column.
Y = data['label']
X = data.drop(columns='label')

## 模型训练

In [33]:
from sklearn.svm import SVC

# Baseline: support vector classifier with scikit-learn's default settings.
svm_default = SVC().fit(X, Y)
svm_default

SVC()

In [10]:
# Tuned SVM: RBF kernel with an explicit C and gamma, probability estimates enabled.
svm_clf = SVC(C=1, kernel='rbf', gamma=0.0001, probability=True).fit(X, Y)
svm_clf

SVC(C=1, gamma=0.0001, probability=True)

In [42]:
# K-nearest-neighbours classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor

# Classify each sample by a vote among its 4 nearest training samples.
knn = KNeighborsClassifier(n_neighbors=4).fit(X, Y)
knn

KNeighborsClassifier(n_neighbors=4)

In [14]:
# Decision tree classifier
# Import the class directly: the previous `from sklearn import tree` followed by
# `tree = tree.DecisionTreeClassifier(...)` rebound the module name to the model.
from sklearn.tree import DecisionTreeClassifier

# random_state makes the fitted tree (and its test accuracy) reproducible.
tree = DecisionTreeClassifier(criterion='gini', random_state=42)
tree.fit(X, Y)

DecisionTreeClassifier()

In [16]:
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic (bootstrap samples, random feature subsets);
# fixing random_state makes the fitted model and its reported accuracy reproducible.
RandomForest = RandomForestClassifier(random_state=42)
RandomForest.fit(X, Y)

RandomForestClassifier()

## 在测试数据集上测试

In [45]:
import os, random

# Root folder of the test images (one sub-folder per fruit class).
TEST_DIR = "Test"
# Kept for backward compatibility with code that reads `path_head`.
path_head = TEST_DIR


def TestWorkFlow(model, picknumber=50, data_dir=None):
    """Evaluate `model` on a random sample of the test images.

    For every class in `fruit_names`, up to `picknumber` '.jpg' files are drawn
    at random from `<data_dir>/<class name>/`, pushed through the already fitted
    `scaler` and `pca`, and classified by `model`.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    picknumber : int, default 50
        Images to sample per class. When a folder holds fewer images, all of
        them are used instead of raising.
    data_dir : str, optional
        Root folder of the test set. Defaults to ``TEST_DIR``, so the function
        no longer depends on the mutable global ``path_head`` that the
        training cell also assigns.

    Returns
    -------
    float
        Accuracy (fraction of correctly classified sampled images).

    Raises
    ------
    IOError
        If a sampled image cannot be decoded.
    ValueError
        If no test image is found at all.
    """
    root = TEST_DIR if data_dir is None else data_dir

    # 1. Collect the sampled test images and their true labels.
    validation_fruit_images = []
    validation_labels = []
    for fruit_name in fruit_names:
        fruit_dir_path = os.path.join(root, fruit_name)
        # Only consider JPEG files (the training cell loads '*.jpg'), so stray
        # files such as thumbnails or notes are never passed to OpenCV.
        candidates = sorted(
            name for name in os.listdir(fruit_dir_path)
            if name.lower().endswith(".jpg")
        )
        # random.sample raises if asked for more items than are available.
        sample_size = min(picknumber, len(candidates))
        for name in random.sample(candidates, sample_size):
            image_path = os.path.join(fruit_dir_path, name)
            image = cv2.imread(image_path, cv2.IMREAD_COLOR)
            if image is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise IOError("Could not read image: {0}".format(image_path))
            # OpenCV loads pixels as BGR; swap to RGB exactly as for training.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            validation_fruit_images.append(image)
            validation_labels.append(fruit_name)
    if not validation_fruit_images:
        raise ValueError("No test images found under '{0}'".format(root))

    # 2. Apply the same preprocessing that was fitted on the training data.
    validation_label_ids = np.array([label_to_id_dict[x] for x in validation_labels])
    validation_images_scaled = scaler.transform([i.flatten() for i in validation_fruit_images])
    validation_pca_result = pca.transform(validation_images_scaled)

    # 3. Predict and score; accuracy_score expects (y_true, y_pred).
    test_predictions = model.predict(validation_pca_result)
    return accuracy_score(validation_label_ids, test_predictions)
def Model_test(model, rounds=10, picknumber=50):
    """Return the mean accuracy of `model` over repeated random test samples.

    Each round calls ``TestWorkFlow``, which draws a fresh random sample of
    test images, so averaging several rounds smooths out sampling noise.
    No training happens here; the model must already be fitted.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    rounds : int, default 10
        Number of evaluation rounds to average.
    picknumber : int, default 50
        Images sampled per class in every round.

    Returns
    -------
    float
        Mean of the per-round accuracies.

    Raises
    ------
    ValueError
        If `rounds` is smaller than 1 (the mean of no rounds is undefined).
    """
    if rounds < 1:
        raise ValueError("rounds must be a positive integer")
    scores = [TestWorkFlow(model, picknumber) for _ in range(rounds)]
    return np.mean(scores)

In [46]:
# SVM with default parameters: mean accuracy over the repeated test rounds
Model_test(svm_default)

0.8870000000000001

In [47]:
# SVM with tuned parameters: mean accuracy over the repeated test rounds
Model_test(svm_clf)

0.9164999999999999

In [48]:
# K-nearest neighbours: mean accuracy over the repeated test rounds
Model_test(knn)

0.899

In [49]:
# Decision tree: mean accuracy over the repeated test rounds
Model_test(tree)

0.8085000000000001

In [50]:
# Random forest: mean accuracy over the repeated test rounds
Model_test(RandomForest)

0.858