In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
x = iris.data
y = iris.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [2]:
### 使用模块内置的分类器
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

clf = KNeighborsClassifier()
clf.fit(x_train, y_train)
pred = clf.predict(x_test)

acc = accuracy_score(pred, y_test)

print('Accuracy of KNClassifier:', acc)

Accuracy of KNClassifier: 0.9555555555555556


### Python实现KNN分类器

In [3]:
##  自写分类器，被测样本点，和它距离最近的点为同一类
from scipy.spatial import distance


def euc(a, b):
    # 欧式距离
    return distance.euclidean(a, b)


class ScrappyKNN():

    # fit接口
    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train

    # predict接口
    def predict(self, x_test):
        self.x_test = x_test
        predictions = []
        for row in x_test:
            label = self.closest(row)
            predictions.append(label)
        return (predictions)

    def closest(self, row):
        # x_train中与row最近的点，与row同一类别
        best_dist = euc(row, self.x_train[0])
        best_index = 0
        for i in range(1, len(x_train)):
            dist = euc(row, self.x_train[i])
            if dist < best_dist:
                best_dist = dist
                best_index = i
        return (self.y_train[best_index])


clf = ScrappyKNN()
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
acc = accuracy_score(pred, y_test)
print('Accuracy of MyClassifier:', acc)

Accuracy of MyClassifier: 0.9777777777777777


### Numpy实现的KNN分类器

In [None]:
import numpy as np


def euclidean_distance(x1, x2):
    # distance = 0
    # for i in range(len(x1)):
    #     distance += pow(x1[i] - x2[i], 2)
    # return math.sqrt(distance)
    return np.sqrt(np.sum((x1 - x2)**2))


class KNN():
    def __init__(self, k):
        self.k = k

    def _vote(self, neighbor_labels):
        counts = np.bincount(neighbor_labels.astype('int'))
        return counts.argmax()

    def predict(self, X_test, X_train, y_train):
        y_pred = np.empty(X_test.shape[0])
        for i, test_sample in enumerate(X_test):
            idx = np.argsort(
                [euclidean_distance(test_sample, x) for x in X_train])[:self.k]
            k_nearest_heighbors = np.array([y_train[i] for i in idx])
            y_pred[i] = self._vote(k_nearest_heighbors)

        return y_pred

In [4]:
# 自写分类器，被测点与和它最近的k个点为同一类
import numpy


class KNearestNeighbours:
    def __init__(self, k=5):
        self.k = k

    # fit接口
    def fit(self, x_train, y_train):
        self.x = x_train
        self.y = y_train

    # predict接口
    def predict(self, x_test):
        def EuclidDist(testPoint, checkPint):
            distance = numpy.linalg.norm(testPoint - checkPint)
            return (distance)

        def closest(testPoint):
            distArray = numpy.array(
                [(EuclidDist(testPoint, self.x[i]), self.y[i])
                 for i in range(len(self.x))],
                dtype=[('dist', float), ('lab', int)])
            distArray.sort(order='dist')
            majority = {}
            for j in range(self.k):
                if majority.get(distArray[j][1]) == None:
                    majority[distArray[j][1]] = 0
                else:
                    majority[distArray[j][1]] += 1
            return (max(majority, key=majority.get))

        if x_test.ndim == 1:
            return (closest(x_test))
        else:
            predictions = numpy.array([closest(point) for point in x_test])
            return (predictions)


clf = KNearestNeighbours(k=5)
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
acc = accuracy_score(pred, y_test)
print('Accuracy of My_KNNeighbourClassifier:', acc)

Accuracy of My_KNNeighbourClassifier: 0.9555555555555556


### python实现决策树

In [5]:
# 自写决策树.类似上述fit和predict接口！！！！！！！！！！！！！！！！！！！

# 原始数据
training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]
header = ["color", "diameter", "label"]

In [6]:
# 删除列中特征值重复的值


def unique_vals(rows, col):
    return (set([row[col] for row in rows]))


unique_vals(training_data, 0)

{'Green', 'Red', 'Yellow'}

In [7]:
# 获得训练数据中每一类的样本数


def class_counts(rows):
    counts = {}
    for row in rows:
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return (counts)


class_counts(training_data)

{'Apple': 2, 'Grape': 2, 'Lemon': 1}

In [8]:
# 原始数据中特征值有文本有数字，做出判断
def is_numeric(value):
    return (isinstance(value, int) or isinstance(value, float))


is_numeric(training_data[0][0])

False

In [9]:
# 用来分类的问题


class Question:

    #对每一列按value值做出判断
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return (val >= self.value)
        else:
            return (val == self.value)

    def __repr__(self):
        condition = '=='
        if is_numeric(self.value):
            condition = '>='
        return ('Is %s %s %s?' %
                (header[self.column], condition, str(self.value)))


q = Question(0, 'Green')
print(q)
q.match(training_data[0])

Is color == Green?


True

In [10]:
# 用问题对数据分组


def partition(rows, question):
    true_rows = []
    false_rows = []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return (true_rows, false_rows)


true_rows, false_rows = partition(training_data, Question(0, 'Red'))
print(true_rows, false_rows)

true_rows, false_rows = partition(training_data, Question(1, 2))
print(true_rows, false_rows)

[['Red', 1, 'Grape'], ['Red', 1, 'Grape']] [['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Yellow', 3, 'Lemon']]
[['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Yellow', 3, 'Lemon']] [['Red', 1, 'Grape'], ['Red', 1, 'Grape']]


In [11]:
# 判断分类效果的参数


def gini(rows):
    counts = class_counts(rows)
    impurity = 1
    for label in counts:
        prob_of_label = counts[label] / float(len(rows))
        impurity -= prob_of_label**2
    return (impurity)


gini(training_data)

0.6399999999999999

In [12]:
# information gain
# decision tree algorithm:maximize information gain


def info_gain(left, right, current_uncertainty):
    p = float(len(left)) / (len(left) + len(right))
    return (current_uncertainty - p * gini(left) - (1 - p) * gini(right))

In [13]:
def find_best_split(rows):
    best_gain = 0
    best_question = None
    current_uncertainty = gini(rows)
    n_features = len(rows[0]) - 1

    for col in range(n_features):
        values = set([row[col] for row in rows])
        for val in values:
            question = Question(col, val)
            true_rows, false_rows = partition(rows, question)
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue
            gain = info_gain(true_rows, false_rows, current_uncertainty)
            if gain >= best_gain:
                best_gain, best_question = gain, question
    return (best_gain, best_question)

In [14]:
class Leaf:
    def __init__(self, rows):
        self.predictions = class_counts(rows)

In [15]:
class Decision_Node:
    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [16]:
def build_tree(rows):
    gain, question = find_best_split(rows)
    if gain == 0:
        return (Leaf(rows))
    true_rows, false_rows = partition(rows, question)
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)
    return (Decision_Node(question, true_branch, false_branch))

In [17]:
def print_tree(node, spacing=""):
    if isinstance(node, Leaf):
        print(spacing + "predict", node.predictions)
        return
    print(spacing + str(node.question))

    print(spacing + '-->True')
    print_tree(node.true_branch, spacing + ' ')

    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")

In [18]:
my_tree=build_tree(training_data)

In [19]:
print_tree(my_tree)

Is diameter >= 3?
-->True
 Is color == Yellow?
 -->True
  predict {'Apple': 1, 'Lemon': 1}
 --> False:
   predict {'Apple': 1}
--> False:
  predict {'Grape': 2}
