In [16]:
from sklearn import datasets

# Load the Iris dataset
iris = datasets.load_iris()

In [17]:
import polars as pl

# Wrap the raw feature matrix in a Polars DataFrame, naming the columns after
# the dataset's features at construction time. The features are:
# - sepal length (cm)
# - sepal width (cm)
# - petal length (cm)
# - petal width (cm)
df = pl.DataFrame(iris.data, schema=iris.feature_names, orient="row")  # type: ignore

df

sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
f64,f64,f64,f64
5.1,3.5,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,1.3,0.2
4.6,3.1,1.5,0.2
5.0,3.6,1.4,0.2
…,…,…,…
6.7,3.0,5.2,2.3
6.3,2.5,5.0,1.9
6.5,3.0,5.2,2.0
6.2,3.4,5.4,2.3


In [18]:
# Feature matrix and class labels taken from the loaded dataset
x, y = iris.data, iris.target  # type: ignore

# Sanity check (one label per sample), then report the dataset size
assert len(y) == len(x), "数据集特征和标签长度不匹配"
print(f"数据集总数: {len(x)}")

# Show the class names
print(f"分类项: {iris.target_names}")  # type: ignore

数据集总数: 150
分类项: ['setosa' 'versicolor' 'virginica']


In [19]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and test subsets: 30% is held out for
# testing, the split is stratified on the labels, and the seed is fixed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=1
)

# Report the training-set size
assert len(y_train) == len(x_train), "训练集特征和标签长度不匹配"
print(f"训练集特征长度: {len(x_train)}")

# Report the test-set size
assert len(y_test) == len(x_test), "测试集特征和标签长度不匹配"
print(f"测试集特征长度: {len(x_test)}")

训练集特征长度: 105
测试集特征长度: 45


In [20]:
# Show the training features as a DataFrame with the feature names as columns
df = pl.DataFrame(x_train, schema=iris.feature_names, orient="row")  # type: ignore

df

sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
f64,f64,f64,f64
5.5,4.2,1.4,0.2
5.4,3.4,1.7,0.2
6.4,3.2,5.3,2.3
6.7,3.3,5.7,2.1
5.0,3.2,1.2,0.2
…,…,…,…
4.5,2.3,1.3,0.3
5.8,2.6,4.0,1.2
5.0,3.4,1.6,0.4
4.6,3.1,1.5,0.2


In [21]:
# Show the test features as a DataFrame with the feature names as columns
df = pl.DataFrame(x_test, schema=iris.feature_names, orient="row")  # type: ignore

df

sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
f64,f64,f64,f64
6.2,3.4,5.4,2.3
5.4,3.9,1.7,0.4
4.6,3.4,1.4,0.3
4.9,2.5,4.5,1.7
6.6,3.0,4.4,1.4
…,…,…,…
6.9,3.2,5.7,2.3
5.9,3.0,5.1,1.8
6.5,2.8,4.6,1.5
5.0,3.4,1.5,0.2


In [22]:
from sklearn.preprocessing import StandardScaler


# Create a standard scaler and fit it on the training split only, so the
# per-feature mean and standard deviation come from training data.
sc = StandardScaler().fit(x_train)

# Standardize both splits using the statistics computed from the training split
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)


In [23]:
# Show the standardized training features with the feature names as columns
df = pl.DataFrame(x_train_std, schema=iris.feature_names, orient="row")  # type: ignore

df

sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
f64,f64,f64,f64
-0.430106,2.631185,-1.332697,-1.307284
-0.550536,0.769184,-1.16538,-1.307284
0.653762,0.303684,0.84243,1.445879
1.015051,0.536434,1.06552,1.183673
-1.032255,0.303684,-1.444242,-1.307284
…,…,…,…
-1.634404,-1.791068,-1.38847,-1.176181
-0.068817,-1.092817,0.117388,0.003746
-1.032255,0.769184,-1.221152,-1.045078
-1.513975,0.070933,-1.276925,-1.307284


In [24]:
# Show the standardized test features with the feature names as columns
df = pl.DataFrame(x_test_std, schema=iris.feature_names, orient="row")  # type: ignore

df

sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
f64,f64,f64,f64
0.412902,0.769184,0.898203,1.445879
-0.550536,1.932935,-1.16538,-1.045078
-1.513975,0.769184,-1.332697,-1.176181
-1.152685,-1.325568,0.39625,0.659261
0.894621,-0.161817,0.340478,0.265952
…,…,…,…
1.255911,0.303684,1.06552,1.445879
0.051613,-0.161817,0.730885,0.790364
0.774192,-0.627317,0.452023,0.397055
-1.032255,0.769184,-1.276925,-1.307284


In [25]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier with regularization parameter C=1.0
svm = SVC(C=1.0, kernel="linear", decision_function_shape="ovr")

# Train on the standardized training features; as the last expression of the
# cell, the fitted estimator is also displayed.
svm.fit(x_train_std, y_train)

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [26]:
# Predict labels for the standardized test features with the fitted classifier
y_pred = svm.predict(x_test_std)

# Number of misclassified test samples
print(f"错误分类样本数: {(y_pred != y_test).sum()}")
# Classification accuracy on the standardized test split
print(f"分类准确率: {svm.score(x_test_std, y_test):.2f}")

错误分类样本数: 1
分类准确率: 0.98


In [27]:
# Second linear-kernel classifier, this time with C=1.5. Note that `svm` is
# rebound here, so the previously fitted model is no longer reachable by name.
svm = SVC(C=1.5, kernel="linear", decision_function_shape="ovr")

# Train on the raw (unstandardized) training features; as the last expression
# of the cell, the fitted estimator is also displayed.
svm.fit(x_train, y_train)

0,1,2
,C,1.5
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [28]:
# Predict labels for the raw (unstandardized) test features x_test, matching
# the raw features this classifier was trained on
y_pred = svm.predict(x_test)

# Number of misclassified test samples
print(f"错误分类样本数: {(y_pred != y_test).sum()}")
# Classification accuracy on the raw test split
print(f"分类准确率: {svm.score(x_test, y_test):.2f}")

错误分类样本数: 0
分类准确率: 1.00
