In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import os
import math

import torch
import torch.nn as nn
from torch.functional import F

os.makedirs('result', exist_ok=True)
plt.rcParams['font.sans-serif'] = 'simhei'  # 设置字体
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# 读入鸢尾花数据
iris_df = pd.read_csv('iris.csv', encoding='gbk', header=None)
# 重新设置数据列名
iris_df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'category']
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,category
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
iris_x = iris_df.drop(columns=['category'])
iris_y = iris_df['category']

In [4]:
watermelon_df = pd.read_csv('watermelon.csv', index_col=0)
watermelon_df.head()

Unnamed: 0_level_0,色泽,根蒂,敲声,纹理,脐部,触感,好瓜
编号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,是
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,是
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,是
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,是
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,是


In [5]:
def category2idx(data, select_col='好瓜'):
    word2id = {ds:n for n,ds in enumerate(set(data[select_col]))}
    data[select_col] = data[select_col].map(lambda x : word2id[x])

for col in watermelon_df.columns:
    category2idx(watermelon_df, col)
watermelon_df.head()

Unnamed: 0_level_0,色泽,根蒂,敲声,纹理,脐部,触感,好瓜
编号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,2,2,1,2,1,1
2,2,2,0,1,2,1,1
3,2,2,2,1,2,1,1
4,1,2,0,1,2,1,1
5,0,2,2,1,2,1,1


In [6]:
watermelon_x = watermelon_df.drop(columns=['好瓜'])
watermelon_y = watermelon_df['好瓜']

In [7]:
class Classifier:

    def __init__(self, classifier_name, is_discrete=False):
        self.classifier_name = classifier_name
        self.is_discrete = is_discrete
    

    def classify(self, X_train, y_train, X_test, y_test):
        if self.classifier_name == 'KNN':
            from sklearn.neighbors import KNeighborsClassifier
            neigh = KNeighborsClassifier(n_neighbors=6)
            neigh.fit(X_train, y_train)
            return neigh.predict(X_test)
            
        elif self.classifier_name == 'Decision Tree':
            from sklearn.tree import DecisionTreeClassifier
            decision_tree = DecisionTreeClassifier(random_state=0, max_depth=3)
            decision_tree.fit(X_train, y_train)
            return decision_tree.predict(X_test)
        
        elif self.classifier_name == 'Naive Bayes':
            # Import Gaussian Naive Bayes model
            from sklearn.naive_bayes import GaussianNB, CategoricalNB
            # Create a Gaussian Classifier
            if self.is_discrete:
                gnb = CategoricalNB()
            else:
                gnb = GaussianNB()
            # Train the model using the training sets
            gnb.fit(X_train, y_train)
            # Predict the response for test dataset
            return gnb.predict(X_test)
        
        elif self.classifier_name == 'Bayes Belief Network':
            # Import Gaussian Naive Bayes model
            from sklearn.naive_bayes import GaussianNB, CategoricalNB
            # Create a Gaussian Classifier
            if self.is_discrete:
                gnb = CategoricalNB()
            else:
                gnb = GaussianNB()
            # Train the model using the training sets
            gnb.fit(X_train, y_train)
            # Predict the response for test dataset
            return gnb.predict(X_test)
        
        elif self.classifier_name == "Neural Network":
            # Tunning the hidden layer size parameter if Neural Network is chosen
            from sklearn.neural_network import MLPClassifier
            hidden_layer_size, NN_model = self.tune_NN_parameters(X_train, y_train, X_test, y_test)
            print("\n" + str(hidden_layer_size) + " is the chosen hidden layer size for the neural network")
            
            return NN_model.predict(X_test)
        
    def get_accuracy(self, y_test, y_predicted):
        # Import scikit-learn metrics module for accuracy calculation
        # from sklearn import metrics

        # return metrics.accuracy_score(y_test, y_predicted) * 100
    
        return ((np.sum(y_test == y_predicted) / y_test.shape[0])*100)
        
    def tune_NN_parameters(self, X_train, y_train, X_test, y_test):
            from sklearn.neural_network import MLPClassifier
            hidden_layer_sizes = [(600,30)]
            max_accuracy = 0
            index=0
            
            print("Tunning the hidden layer size parameter for the Neural Network...\n")
            
            for i in range(0,len(hidden_layer_sizes)):
                
                model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=hidden_layer_sizes[i]).fit(X_train, y_train)
                temp_accuracy = ((np.sum(y_test == model.predict(X_test)) / y_test.shape[0]) * 100)
                print(str(hidden_layer_sizes[i]) + " (hidden layer size) accuracy: " + str(temp_accuracy) + " %")
            
                if temp_accuracy > max_accuracy:
                    max_accuracy = temp_accuracy
                    NN_best_model = model
                    index = i
            
            return hidden_layer_sizes[index], NN_best_model

In [23]:
def classify(data_x, data_y, method="Naive Bayes", is_discrete=False):
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=12)
    
    classifier = Classifier(method)
    y_pred = classifier.classify(X_train, y_train, X_test, y_test)
    classifier_accuracy = classifier.get_accuracy(y_test, y_pred)
    print(f"{method} predict accuracy = {classifier_accuracy}")

In [24]:
classify(iris_x, iris_y, method="Naive Bayes")
classify(iris_x, iris_y, method="Bayes Belief Network")
classify(iris_x, iris_y, method="Decision Tree")
classify(iris_x, iris_y, method="Neural Network")

Naive Bayes predict accuracy = 96.66666666666667
Bayes Belief Network predict accuracy = 96.66666666666667
Decision Tree predict accuracy = 93.33333333333333
Tunning the hidden layer size parameter for the Neural Network...

(600, 30) (hidden layer size) accuracy: 96.66666666666667 %

(600, 30) is the chosen hidden layer size for the neural network
Neural Network predict accuracy = 96.66666666666667


In [25]:
classify(watermelon_x, watermelon_y, method="Naive Bayes", is_discrete=True)
classify(watermelon_x, watermelon_y, method="Bayes Belief Network", is_discrete=True)
classify(watermelon_x, watermelon_y, method="Decision Tree", is_discrete=True)
classify(watermelon_x, watermelon_y, method="Neural Network", is_discrete=True)

Naive Bayes predict accuracy = 75.0
Bayes Belief Network predict accuracy = 75.0
Decision Tree predict accuracy = 75.0
Tunning the hidden layer size parameter for the Neural Network...

(600, 30) (hidden layer size) accuracy: 25.0 %

(600, 30) is the chosen hidden layer size for the neural network
Neural Network predict accuracy = 25.0


In [11]:
# sklean中朴素贝叶斯预测
def skl_bayes(data_x, data_y, is_discrete=False):
    from sklearn.naive_bayes import GaussianNB, CategoricalNB
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    Xtrain, Xtest, ytrain, ytest = train_test_split(data_x, data_y, test_size=0.3, random_state=12)

    # GaussianNB 建模
    if not is_discrete:
        clf = GaussianNB()
    else:
        clf = CategoricalNB()
    
    clf.fit(Xtrain, ytrain)

    # 在测试集上执行预测，proba导出的是每个样本属于某类的概率
    clf.predict(Xtest) # 预测样本的分类
    clf.predict_proba(Xtest) # 预测样本对应类别的概率

    #测试准确率
    predict_accuracy=accuracy_score(ytest, clf.predict(Xtest))
    print(f"predict_accuracy = {predict_accuracy}")

In [12]:
skl_bayes(iris_x, iris_y)

predict_accuracy = 0.9777777777777777


In [13]:
skl_bayes(watermelon_x, watermelon_y, True)

predict_accuracy = 0.8333333333333334
