# sklearn


## decision tree

单棵决策树，作为一个对照实验

使用的是数据集[wine](./dataset/wine/wine.names)

In [1]:
# import dataset
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def load_data(data_path="./dataset/wine", test_size=0.2, random_state=11):
    """Load the wine dataset and split it into train and test sets.

    Reads the header-less CSV ``wine.data`` found in ``data_path``, names its
    14 columns (the class label first, then 13 features) and splits the rows
    with ``train_test_split``.

    Args:
        data_path: Directory that contains ``wine.data``.
        test_size: Forwarded to ``train_test_split`` (0.2 by default).
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The X parts are
        DataFrames holding the feature columns, the y parts are Series holding
        the ``class`` column; every part has a fresh 0-based index.
    """
    columns = [
        "class",
        "Alcohol",
        "Malic acid",
        "Ash",
        "Alcalinity of ash",
        "Magnesium",
        "Total phenols",
        "Flavanoids",
        "Nonflavanoid phenols",
        "Proanthocyanins",
        "Color intensity",
        "Hue",
        "OD280/OD315 of diluted wines",
        "Proline",
    ]
    wine_data = pd.read_csv(os.path.join(data_path, "wine.data"), header=None)
    # Assigning the names after reading keeps the length check: a file with an
    # unexpected number of columns raises here instead of being mis-parsed.
    wine_data.columns = columns

    # First column is the label, everything after it is a feature.
    features = wine_data.iloc[:, 1:]
    labels = wine_data.iloc[:, 0]
    splits = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # Return re-indexed copies rather than mutating the split objects in place.
    return tuple(part.reset_index(drop=True) for part in splits)

# load data
# Fetch the four splits and convert each pandas object to a NumPy array in one
# pass; the order (X_train, X_test, y_train, y_test) is the one load_data returns.
X_train, X_test, y_train, y_test = (
    split.to_numpy() for split in load_data()
)

In [39]:
# Sanity check: show the shapes of the train and test feature matrices.
print(f"train set size: {X_train.shape}, test set size: {X_test.shape}")

train set size: (142, 13), test set size: (36, 13)


In [42]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

# Baseline: a single depth-4 decision tree, used as the control experiment.
decision_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
decision_clf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics instead of running
# predict() on the test set separately for each score.
decision_pred = decision_clf.predict(X_test)
decision_accuracy = accuracy_score(y_test, decision_pred)
decision_f1_score = f1_score(y_test, decision_pred, average='micro')

print("Decision tree with max_depth=4:")
print(f"accuracy on testing set: {decision_accuracy}")
print(f"f1 score on testing set: {decision_f1_score}")

Decision tree with max_depth=4:
accuracy on testing set: 0.9444444444444444
f1 score on testing set: 0.9444444444444444


## AdaBoost


基学习器（base learner）应当是弱学习器（weak learner）。max_depth=4 的 decision tree 单独使用时在测试集上的准确率已达到约 0.94，效果太好，不适合作为基学习器，所以我们采用 max_depth=2 的 decision tree 作为基学习器。

In [54]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score

# Weak base learner: a depth-2 decision tree.
weak_learner = DecisionTreeClassifier(max_depth=2)

# SAMME is the multi-class generalisation of AdaBoost; SAMME.R is its variant
# that boosts on predicted class probabilities instead of hard class labels.
# NOTE(review): newer scikit-learn releases deprecate "SAMME.R" -- confirm
# against the installed version before re-running this cell.
adaboost_clf = AdaBoostClassifier(
    estimator=weak_learner,
    algorithm="SAMME.R",
    n_estimators=10,
    random_state=33,
)
adaboost_clf.fit(X_train, y_train)

# Score the held-out split; both metrics are computed from the same predictions.
y_pred = adaboost_clf.predict(X_test)
adaboost_accuracy, adaboost_f1_score = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average="micro"),
)
print("Accuracy:", adaboost_accuracy)
print("F1 score:", adaboost_f1_score)

Accuracy: 0.9444444444444444
F1 score: 0.9444444444444444




## Bagging

In [63]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

# Bag ten depth-2 trees; max_samples=0.8 draws a sample of 80% of the training
# rows for each tree. fit() returns the estimator itself, so it can be chained.
bagging_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=2, random_state=42),
    n_estimators=10,
    max_samples=0.8,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = bagging_clf.predict(X_test)
print("Bagging accuracy:", accuracy_score(y_test, y_pred))
print("Bagging F1 score:", f1_score(y_test, y_pred, average="micro"))

Bagging accuracy: 0.9722222222222222
Bagging F1 score: 0.9722222222222222


## Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Ten depth-2 trees; max_features='sqrt' limits the features considered at each
# split. fit() returns the estimator itself, so construction and training chain.
rf_clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=2,
    max_features='sqrt',
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = rf_clf.predict(X_test)

print('Random Forest Classifier:')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 Score:', f1_score(y_test, y_pred, average='micro'))

Random Forest Classifier:
Accuracy: 1.0
F1 Score: 1.0


可见 RF 的效果特别好：在测试集上 Accuracy 和 F1 Score 均为 1.0。不过测试集只有 36 个样本，这个结果仅供参考。

# Numpy

自己写一个 random forest 的实现（从本地的 `Numpy` 模块导入），并在同样的数据划分上测试。

In [2]:
# Reload the splits: load_data() returns pandas DataFrames/Series, whereas the
# first cell rebound these names to NumPy arrays.
X_train, X_test, y_train, y_test = load_data()

In [3]:
from Numpy import RandomForestClassifier
from sklearn.metrics import accuracy_score

# NOTE: RandomForestClassifier here is the class imported from the local
# `Numpy` module in this cell; it rebinds the name that an earlier cell
# imported from sklearn.ensemble.
# `replacement=False` presumably means sampling without replacement -- TODO
# confirm against the Numpy module.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    max_features="sqrt",
    replacement=False,
)
rf_clf.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
