In [3]:
from sklearn import datasets
import numpy as np
import pandas as pd

# breast_cancer

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html


| Column Number | English Term             | Traditional Chinese Term   | Description                                                          |
|---------------|--------------------------|-----------------------------|----------------------------------------------------------------------|
| 0             | mean radius              | 平均半徑                     | Average of distances from center to points on the perimeter          |
| 1             | mean texture             | 平均紋理                     | Standard deviation of gray-scale values                             |
| 2             | mean perimeter           | 平均周長                     | Average size of the core tumor perimeter                            |
| 3             | mean area                | 平均面積                     | Area of the tumor                                                    |
| 4             | mean smoothness          | 平均光滑度                   | Measure of local variation in radius lengths                         |
| 5             | mean compactness         | 平均緻密度                   | Perimeter^2 / area - 1.0                                             |
| 6             | mean concavity           | 平均凹度                     | Severity of concave portions of the contour                          |
| 7             | mean concave points      | 平均凹點                     | Number of concave portions of the contour                            |
| 8             | mean symmetry            | 平均對稱性                   | Symmetry of the tumor                                                |
| 9             | mean fractal dimension   | 平均分形維度                 | "coastline approximation" - 1                                        |
| 10            | radius error             | 半徑誤差                     | Standard error for the mean of distances from center to points       |
| 11            | texture error            | 紋理誤差                     | Standard error for gray-scale values                                 |
| 12            | perimeter error          | 周長誤差                     | Standard error for size of the perimeter                             |
| 13            | area error               | 面積誤差                     | Standard error for the area of the tumor                             |
| 14            | smoothness error         | 光滑度誤差                   | Standard error for local variation in radius lengths                 |
| 15            | compactness error        | 緻密度誤差                   | Standard error for perimeter^2 / area - 1.0                          |
| 16            | concavity error          | 凹度誤差                     | Standard error for severity of concave portions of the contour       |
| 17            | concave points error     | 凹點誤差                     | Standard error for number of concave portions of the contour         |
| 18            | symmetry error           | 對稱性誤差                   | Standard error for symmetry                                          |
| 19            | fractal dimension error  | 分形維度誤差                 | Standard error for the "coastline approximation" - 1                 |
| 20            | worst radius             | 最差半徑                     | "Worst" or largest mean value for distances from center to points    |
| 21            | worst texture            | 最差紋理                     | "Worst" or largest mean value for gray-scale values                  |
| 22            | worst perimeter          | 最差周長                     | "Worst" or largest mean value for perimeter size                     |
| 23            | worst area               | 最差面積                     | "Worst" or largest mean value for area                               |
| 24            | worst smoothness         | 最差光滑度                   | "Worst" or largest mean value for local variation in radius lengths  |
| 25            | worst compactness        | 最差緻密度                   | "Worst" or largest mean value for perimeter^2 / area - 1.0           |
| 26            | worst concavity          | 最差凹度                     | "Worst" or largest mean value for severity of concave portions       |
| 27            | worst concave points     | 最差凹點                     | "Worst" or largest mean value for number of concave portions         |
| 28            | worst symmetry           | 最差對稱性                   | "Worst" or largest mean value for symmetry                           |
| 29            | worst fractal dimension  | 最差分形維度                 | "Worst" or largest mean value for "coastline approximation" - 1      |
| 30            | target                   | 目標                         | Diagnosis (1 = malignant, 0 = benign)                                |


In [4]:
dta = datasets.load_breast_cancer()

data = np.column_stack((dta.data, dta.target))
feature_name = np.concatenate((dta.feature_names, ["target"]), axis=0)

print(dta.target_names)

df = pd.DataFrame(data, columns=feature_name); df.head()

['malignant' 'benign']


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


# iris

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html#sklearn.datasets.load_iris

| Column Number | English Term       | Traditional Chinese | Description                                       |
|---------------|--------------------|---------------------|---------------------------------------------------|
| 0             | sepal length (cm)  | 花萼長度（公分）       | Length of the sepal measured in centimeters.      |
| 1             | sepal width (cm)   | 花萼寬度（公分）       | Width of the sepal measured in centimeters.       |
| 2             | petal length (cm)  | 花瓣長度（公分）       | Length of the petal measured in centimeters.      |
| 3             | petal width (cm)   | 花瓣寬度（公分）       | Width of the petal measured in centimeters.       |
| 4             | target             | 目標                  | Species of Iris (0 = setosa, 1 = versicolor, 2 = virginica) |


In [5]:
dta = datasets.load_iris()

data = np.column_stack((dta.data, dta.target))
feature_name = np.concatenate((dta.feature_names, ["target"]), axis=0)

print(dta.target_names)

df = pd.DataFrame(data, columns=feature_name); df.head()

['setosa' 'versicolor' 'virginica']


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


# diabetes

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes


| Column Number | English Term | Traditional Chinese | Description                                |
|---------------|--------------|---------------------|--------------------------------------------|
| 0             | age          | 年齡                 | Age of the patient                        |
| 1             | sex          | 性別                 | Biological sex of the patient             |
| 2             | bmi          | 體質指數             | Body mass index                           |
| 3             | bp           | 血壓                 | Average blood pressure                    |
| 4             | s1           | 總膽固醇             | Total cholesterol level                   |
| 5             | s2           | 低密度脂蛋白膽固醇   | Low-density lipoprotein cholesterol       |
| 6             | s3           | 高密度脂蛋白膽固醇   | High-density lipoprotein cholesterol      |
| 7             | s4           | 甲狀腺素水平         | Thyroid stimulating hormone level         |
| 8             | s5           | 肌酐水平             | Serum creatinine level                    |
| 9             | s6           | 血糖水平             | Blood sugar level                         |
| 10            | target       | 目標                 | Quantitative measure of disease progression |


In [6]:
dta = datasets.load_diabetes()

data = np.column_stack((dta.data, dta.target))
feature_name = np.concatenate((dta.feature_names, ["target"]), axis=0)

# print(dta.target_names)

df = pd.DataFrame(data, columns=feature_name); df.head()



Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


# digits

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits

In [7]:
dta = datasets.load_digits()

data = np.column_stack((dta.data, dta.target))
feature_name = np.concatenate((dta.feature_names, ["target"]), axis=0)

print(dta.target_names)

df = pd.DataFrame(data, columns=feature_name); df.head()



[0 1 2 3 4 5 6 7 8 9]


Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1797 entries, 0 to 1796
Data columns (total 65 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pixel_0_0  1797 non-null   float64
 1   pixel_0_1  1797 non-null   float64
 2   pixel_0_2  1797 non-null   float64
 3   pixel_0_3  1797 non-null   float64
 4   pixel_0_4  1797 non-null   float64
 5   pixel_0_5  1797 non-null   float64
 6   pixel_0_6  1797 non-null   float64
 7   pixel_0_7  1797 non-null   float64
 8   pixel_1_0  1797 non-null   float64
 9   pixel_1_1  1797 non-null   float64
 10  pixel_1_2  1797 non-null   float64
 11  pixel_1_3  1797 non-null   float64
 12  pixel_1_4  1797 non-null   float64
 13  pixel_1_5  1797 non-null   float64
 14  pixel_1_6  1797 non-null   float64
 15  pixel_1_7  1797 non-null   float64
 16  pixel_2_0  1797 non-null   float64
 17  pixel_2_1  1797 non-null   float64
 18  pixel_2_2  1797 non-null   float64
 19  pixel_2_3  1797 non-null   float64
 20  pixel_2_

# linnerud

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_linnerud.html#sklearn.datasets.load_linnerud

| Index | English Name | Chinese Name | Data Type | Description |
|-------|--------------|--------------|-----------|-------------|
| 0     | Chins        | 引體向上      | float64   | Number of chin-ups completed by participants |
| 1     | Situps       | 仰臥起坐      | float64   | Number of sit-ups completed by participants |
| 2     | Jumps        | 跳躍         | float64   | Number of jumps completed by participants |
| 3     | Weight       | 體重         | float64   | Participant's weight in pounds |
| 4     | Waist        | 腰圍         | float64   | Participant's waist size in inches |
| 5     | Pulse        | 脈搏         | float64   | Participant's resting pulse rate per minute |

In [9]:
dta = datasets.load_linnerud()

data = np.column_stack((dta.data, dta.target))
feature_name = np.concatenate((dta.feature_names, dta.target_names), axis=0)

print(dta.target_names)

df = pd.DataFrame(data, columns=feature_name); df.head()



['Weight', 'Waist', 'Pulse']


Unnamed: 0,Chins,Situps,Jumps,Weight,Waist,Pulse
0,5.0,162.0,60.0,191.0,36.0,50.0
1,2.0,110.0,60.0,189.0,37.0,52.0
2,12.0,101.0,101.0,193.0,38.0,58.0
3,12.0,105.0,37.0,162.0,35.0,62.0
4,13.0,155.0,58.0,189.0,35.0,46.0


# wine

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html#sklearn.datasets.load_wine

| 索引 | 英文名稱                        | 繁體中文名稱                  | 數據類型 | 描述                                         |
|------|--------------------------------|------------------------------|----------|---------------------------------------------|
| 0    | Alcohol                        | 酒精                          | float64  | Wine's alcohol content by volume            |
| 1    | Malic Acid                     | 蘋果酸                        | float64  | Amount of malic acid in the wine            |
| 2    | Ash                            | 灰                            | float64  | Ash content in the wine                     |
| 3    | Alkalinity of Ash              | 灰的鹼性                      | float64  | Measure of the alkalinity of ash in the wine|
| 4    | Magnesium                      | 鎂                            | float64  | Magnesium content in the wine               |
| 5    | Total Phenols                  | 總酚                          | float64  | Total phenol content in the wine            |
| 6    | Flavanoids                     | 黃烷醇                        | float64  | Flavanoid content in the wine               |
| 7    | Nonflavanoid Phenols           | 非黃烷醇酚                     | float64  | Nonflavanoid phenol content in the wine     |
| 8    | Proanthocyanins                | 原花青素                      | float64  | Proanthocyanin content in the wine          |
| 9    | Color Intensity                | 顏色強度                      | float64  | Intensity of the wine's color               |
| 10   | Hue                            | 色調                          | float64  | The hue of the wine                         |
| 11   | OD280/OD315 of Diluted Wines   | 稀釋葡萄酒的 OD280/OD315 比值 | float64  | Absorbance ratio indicating protein concentration |
| 12   | Proline                        | 脯氨酸                        | float64  | Proline content in the wine                 |
| 13   | Target                         | 目標                          | float64  | The classification target of the wine       |

In [10]:
dta = datasets.load_wine()

data = np.column_stack((dta.data, dta.target))
feature_name = np.concatenate((dta.feature_names, ['target']), axis=0)

print(dta.target_names)

df = pd.DataFrame(data, columns=feature_name); df.head()



['class_0' 'class_1' 'class_2']


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0


# boston


## 資料集

| 英文 | 中文 | 描述 |
|------|------|------|
| CRIM     | 犯罪率           | 城鎮人均犯罪率。           |
| ZN       | 住宅用地比例     | 占地面積超過 25000 平方英尺的住宅用地比例。           |
| INDUS    | 非零售商用地比例 | 每鎮非零售商用地比例。           |
| CHAS     | 查爾斯河虛擬變量 | 是否位於查爾斯河邊（1 是，0 否）。           |
| NOX      | 氮氧化物濃度     | 一氧化氮濃度（每百萬份）。           |
| RM       | 平均房間數       | 每棟住宅的平均房間數。           |
| AGE      | 老房比例         | 1940 年之前建造的自住單位比例。           |
| DIS      | 距離             | 距離五個波士頓就業中心的加權距離。           |
| RAD      | 公路通達性       | 公路通達性指數。           |
| TAX      | 財產稅率         | 每一萬美元的全值財產稅率。           |
| PTRATIO  | 師生比例         | 鎮上學生與教師的比例。           |
| B        | 黑人比例         | 1000(Bk - 0.63)^2，其中 Bk 是每鎮的黑人比例。           |
| LSTAT    | 低收入比例       | 低收入人群所占比例。           |
| MEDV     | 房價中位數       | 自住單位房價的中位數，以千美元計。           |

## 数据描述

- **CRIM**: 表示城鎮人均犯罪率。高犯罪率通常與房價下降相關。
- **ZN**: 表示占地面積超過 25000 平方英尺的住宅用地比例。較高的比例可能表明較高的房價。
- **INDUS**: 表示每鎮非零售商用地比例。非零售商用地比例較高的地區通常房價較低。
- **CHAS**: 這是一個虛擬變量，表示是否位於查爾斯河邊。靠近查爾斯河的房子通常較貴。
- **NOX**: 表示一氧化氮濃度。較高的污染水平通常與較低的房價相關。
- **RM**: 表示每棟住宅的平均房間數。更多的房間通常意味着更高的房價。
- **AGE**: 表示1940年之前建造的自住單位比例。老房子比例較高的地區可能房價較低。
- **DIS**: 表示距離五個波士頓就業中心的加權距離。較遠的距離可能意味着較低的房價。
- **RAD**: 表示公路通達性指數。較高的通達性指數可能意味着更高的房價。
- **TAX**: 表示每一萬美元的全值財產稅率。較高的財產稅率通常與較高的房價相關。
- **PTRATIO**: 表示鎮上學生與教師的比例。較低的師生比例（即更多的教師）通常與較高的房價相關。
- **B**: 表示 1000(Bk - 0.63)^2，其中 Bk 是每鎮的黑人比例。這個特徵在現代使用中可能存在爭議。
- **LSTAT**: 表示低收入人群所占比例。較高的低收入比例通常與較低的房價相關。
- **MEDV**: 表示自住單位房價的中位數，以千美元計。這是目標變量，表示房價的中位數。


In [18]:
from sklearn.datasets import fetch_openml

boston = fetch_openml(name='boston', version=1)

data = np.column_stack((boston.data, boston.target))
feature_name = np.concatenate((boston.feature_names, ['MEDV']), axis=0)

print(boston.target_names)

df = pd.DataFrame(data, columns=feature_name); df.head()

['MEDV']


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2
