# Using Pipelines for Imputation and Scaling in Preprocessing

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

## 載入數據集
columns = ["sex", "length", "diam", "height", "whole", "shucked", "viscera", "shell", "age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", names=columns)

## 定義目標變數和預測變數
y = df.age  # 目標變數 (age)
X = df.drop(columns=['age'])  # 特徵變數 (排除 'age' 列)

## 載入數值型和類別型特徵
num_cols = X.select_dtypes(include=np.number).columns  # 數值型特徵
cat_cols = X.select_dtypes(include=['object']).columns  # 類別型特徵

## 模擬一些缺失值
for i in range(1000):  # 隨機在特徵中引入缺失值
    X.loc[np.random.choice(X.index), np.random.choice(X.columns)] = np.nan

## 切分訓練集和測試集
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

#####-------使用 Pipeline 處理缺失值填補和標準化的代碼-----------------#####

# 1. 使用 Pipeline 處理缺失值填補和數據標準化
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy='mean')),  # 使用平均值填補數值型特徵的缺失值
    ("scale", StandardScaler())  # 對數值型特徵進行標準化
])

# 2. 在訓練集上擬合並轉換數據，然後在測試集上進行轉換
pipeline.fit(x_train[num_cols])  # 在訓練集上擬合填補和標準化步驟
x_transform = pipeline.transform(x_test[num_cols])  # 在測試集上轉換數據

# 3. 使用 np.array_equal() 比較 Pipeline 轉換後的測試集與手動處理的結果是否相同
array_diff = np.array_equal(x_transform, x_test_fill_missing_scale)
print("Is the transformed test set data the same:", array_diff)

# 4. 改為使用中位數策略來填補缺失值
pipeline_median = Pipeline([
    ("imputer", SimpleImputer(strategy='median')),  # 使用中位數填補數值型特徵的缺失值
    ("scale", StandardScaler())  # 仍然對數值型特徵進行標準化
])

# 5. 在使用中位數填補的 pipeline 上擬合並轉換數據
pipeline_median.fit(x_train[num_cols])
x_transform_median = pipeline_median.transform(x_test[num_cols])

# 6. 比較兩個 pipeline 的結果差異
# 計算兩個轉換結果的絕對差異的總和
new_array_diff = abs(x_transform - x_transform_median).sum()
print("Sum of differences between the two pipeline transformations:", new_array_diff)

#####-------處理過程解釋-----------------#####

# 1. 在第一部分，我們使用了 `Pipeline` 來簡化流程，包括缺失值填補和標準化兩個步驟。
#    - `SimpleImputer(strategy='mean')`: 填補數值型特徵中的缺失值，使用平均值進行填補。
#    - `StandardScaler()`: 對數值型特徵進行標準化，將其變換為均值為 0，方差為 1 的數據。

# 2. 然後，我們使用 `Pipeline.fit()` 在訓練數據上擬合變換器，並且使用 `Pipeline.transform()` 來變換測試數據。

# 3. 之後，我們將轉換後的測試集數據與手動填補並標準化的數據進行比較，檢查是否一致。

# 4. 接著，我們將 `SimpleImputer` 的填補策略從「均值」改為「中位數」，並進行相同的步驟。

# 5. 最後，我們比較了兩個管道的結果差異，計算了轉換後的結果的絕對差異總和。

Is the transformed test set data the same: False
Sum of differences between the two pipeline transformations: 44.06731100232763


# Data Preprocessing and Pipelines(Categorical)

In [13]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# 載入資料集並定義欄位名稱
columns = ["sex", "length", "diam", "height", "whole", "shucked", "viscera", "shell", "age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", names=columns)

# 定義目標變數y及預測變數X
y = df.age
X = df.drop(columns=['age'])

# 找出數值型欄位及類別型欄位
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns

# 隨機插入缺失值
for i in range(1000):
    X.loc[np.random.choice(X.index), np.random.choice(X.columns)] = np.nan

# 分割資料為訓練集與測試集
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

# 從訓練集中擷取類別型欄位
x_train_cat = x_train[cat_cols]

# 用類別型欄位的眾數來填補缺失值
x_train_fill_missing = x_train_cat.fillna(x_train_cat.mode().values[0][0])

# 對填補後的資料進行One-Hot編碼
ohe = OneHotEncoder(sparse_output=False, drop='first').fit(x_train_fill_missing)
# 轉換填補後的資料
x_train_fill_missing_ohe = ohe.transform(x_train_fill_missing)

# 針對測試集做相同處理
x_test_fill_missing = x_test[cat_cols].fillna(x_train_cat.mode().values[0][0])
x_test_fill_missing_ohe = ohe.transform(x_test_fill_missing)

# 1. 使用Pipeline重構
pipeline = Pipeline([("imputer", SimpleImputer(strategy='most_frequent')), 
                     ("ohe", OneHotEncoder(sparse_output=False, drop='first'))])

# 2. 擬合Pipeline並轉換測試資料（僅針對類別型欄位）
pipeline.fit(x_train[cat_cols])
x_transform = pipeline.transform(x_test[cat_cols])

# 3. 使用np.array_equal()檢查兩個陣列是否相等
check_arrays = np.array_equal(x_transform, x_test_fill_missing_ohe)

# 顯示結果
print('Are the arrays equal?')
print(check_arrays)

Are the arrays equal?
True


# Data Preprocessing with Pipelines and ColumnTransformer

In [18]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)

y = df.age  # 目標變數（年齡）
X = df.drop(columns=['age'])  # 特徵變數（去掉年齡）
num_cols = X.select_dtypes(include=np.number).columns  # 數值型欄位
cat_cols = X.select_dtypes(include=['object']).columns  # 類別型欄位

# 創建一些缺失值
for i in range(1000):
    X.loc[np.random.choice(X.index), np.random.choice(X.columns)] = np.nan

# 訓練測試集劃分
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

# 1. 創建一個處理數值型資料的pipeline `num_vals`
num_vals = Pipeline([
    ("imputer", SimpleImputer()),  # 用均值填補缺失值
    ("scale", StandardScaler())    # 標準化數值資料
])

# 2. 創建一個處理類別型資料的pipeline `cat_vals`
cat_vals = Pipeline([
    ("imputer", SimpleImputer(strategy='most_frequent')),  # 用最頻繁的值填補缺失值
    ("ohe", OneHotEncoder(drop='first', sparse_output=False))      # 用One-Hot Encoding處理類別資料
])

# 3. 創建一個ColumnTransformer `preprocess`，將數值型和類別型的處理步驟整合
preprocess = ColumnTransformer(
    transformers=[
        ("num_preprocess", num_vals, num_cols),  # 處理數值型資料
        ("cat_preprocess", cat_vals, cat_cols)   # 處理類別型資料
    ]
)

# 4. 擬合preprocess轉換器到訓練資料
preprocess.fit(x_train)

# 對測試資料進行轉換
x_transform = preprocess.transform(x_test)

# Creating and Evaluating a Machine Learning Pipeline for Regression with Preprocessing Steps

In [20]:
import numpy as np
import pandas as pd

from sklearn import svm, datasets
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

# 讀取數據並設定列名稱
columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", names=columns)

# 分離特徵 (X) 和標籤 (y)
y = df.age
X = df.drop(columns=['age'])

# 獲取數值型和類別型特徵
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns

# 創建一些缺失值
for i in range(1000):
    X.loc[np.random.choice(X.index), np.random.choice(X.columns)] = np.nan

# 分割數據集為訓練集和測試集
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

# 定義處理類別變量的管道 (填補缺失值並進行OneHot編碼)
cat_vals = Pipeline([("imputer", SimpleImputer(strategy='most_frequent')), 
                     ("ohe", OneHotEncoder(sparse_output=False, drop='first'))])

# 定義處理數值變量的管道 (填補缺失值並標準化)
num_vals = Pipeline([("imputer", SimpleImputer(strategy='mean')), 
                     ("scale", StandardScaler())])

# 創建列轉換器 (ColumnTransformer)，對數值和類別變量分別進行處理
preprocess = ColumnTransformer(
    transformers=[
        ("cat_process", cat_vals, cat_cols),
        ("num_process", num_vals, num_cols)
    ]
)

# 1. 創建一個包含數據預處理和線性回歸模型的管道
pipeline = Pipeline([("preprocess", preprocess), 
                     ("regr", LinearRegression())])

# 2. 用訓練數據擬合管道並對測試數據進行預測
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

# 3. 計算並比較管道得分與預測結果的R-squared得分
# 計算管道的得分
pipeline_score = pipeline.score(x_test, y_test)
print(pipeline_score)

# 計算R-squared得分
r2 = r2_score(y_test, y_pred)
print(r2)

0.49329870824922517
0.49329870824922517


# Pipeline for Regression Models with Hyperparameter Tuning, Missing Value Imputation, and Scaling in Machine Learning

In [22]:
import numpy as np
import pandas as pd
from sklearn import svm, datasets
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin

# 讀取資料集並命名欄位
columns = ["sex", "length", "diam", "height", "whole", "shucked", "viscera", "shell", "age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", names=columns)

# 分離特徵與標籤
y = df.age
X = df.drop(columns=['age'])

# 數值型和類別型特徵的分離
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns

# 創建一些缺失值
for i in range(1000):
    X.loc[np.random.choice(X.index), np.random.choice(X.columns)] = np.nan

# 訓練集與測試集的切割
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)

# 類別型數據的預處理流程（缺失值補全 + One-Hot編碼）
cat_vals = Pipeline([("imputer", SimpleImputer(strategy='most_frequent')),
                     ("ohe", OneHotEncoder(sparse_output=False, drop='first'))])

# 數值型數據的預處理流程（缺失值補全 + 標準化）
num_vals = Pipeline([("imputer", SimpleImputer(strategy='mean')), 
                     ("scale", StandardScaler())])

# 綜合處理流程
preprocess = ColumnTransformer(
    transformers=[
        ("cat_preprocess", cat_vals, cat_cols),
        ("num_preprocess", num_vals, num_cols)
    ]
)

# 創建管道：包括預處理步驟和回歸模型（線性回歸）
pipeline = Pipeline([("preprocess", preprocess), 
                     ("regr", LinearRegression())])

# 超參數的搜尋範圍：包括不同回歸模型的選擇（LinearRegression, Ridge, Lasso）
search_space = [{'regr': [LinearRegression()], 'regr__fit_intercept': [True, False]},
                {'regr': [Ridge()], 'regr__alpha': [0.01, 0.1, 1, 10, 100]},
                {'regr': [Lasso()], 'regr__alpha': [0.01, 0.1, 1, 10, 100]}]

# GridSearchCV 進行交叉驗證，選擇最佳的模型和超參數
gs = GridSearchCV(pipeline, search_space, scoring='neg_mean_squared_error', cv=5)

# 擬合訓練資料
gs.fit(x_train, y_train)

# 輸出最佳得分與最佳參數
best_score = gs.best_score_
best_params = gs.best_params_
print(f"Best score: {best_score}, Best parameters: {best_params}")

# 獲取最佳管道
best_pipeline = gs.best_estimator_

# 找出最佳的回歸模型
best_regression_model = best_pipeline.named_steps['regr']
print("The best regression model is:")
print(best_regression_model)

# 獲取最佳回歸模型的超參數
best_model_hyperparameters = best_regression_model.get_params()
print("The hyperparameters of the regression model are:")
print(best_model_hyperparameters)

# 取得類別型數據處理的超參數
cat_preprocess_hyperparameters = best_pipeline.named_steps['preprocess'].named_transformers_['cat_preprocess'].named_steps['imputer'].get_params()
print("The hyperparameters of the imputer are:")
print(cat_preprocess_hyperparameters)


# 自定義的缺失值填充器類
class MyImputer(BaseEstimator, TransformerMixin): 
    def __init__(self):
        return None
    
    def fit(self, X, y=None):
        self.means = np.mean(X, axis=0)  # 計算每列的均值
        return self
    
    def transform(self, X, y=None):
        return X.fillna(self.means)  # 使用均值填充缺失值

# 創建包含自定義缺失值填充器與標準化步驟的管道
new_pipeline = Pipeline([("imputer", MyImputer()), ("scale", StandardScaler())])

# 在訓練集上擬合新管道（僅數值型特徵）
new_pipeline.fit(x_train[num_cols])

# 對測試集進行轉換
x_transform = new_pipeline.transform(x_test[num_cols])

# 檢查轉換後的結果是否與填充缺失值並標準化後的測試集結果相同
check_arrays = np.array_equal(x_transform, x_test_fill_missing_scale)
print(f"Transformation results match: {check_arrays}")

Best score: -5.476916937417327, Best parameters: {'regr': Lasso(alpha=0.01), 'regr__alpha': 0.01}
The best regression model is:
Lasso(alpha=0.01)
The hyperparameters of the regression model are:
{'alpha': 0.01, 'copy_X': True, 'fit_intercept': True, 'max_iter': 1000, 'positive': False, 'precompute': False, 'random_state': None, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': False}
The hyperparameters of the imputer are:
{'add_indicator': False, 'copy': True, 'fill_value': None, 'keep_empty_features': False, 'missing_values': nan, 'strategy': 'most_frequent'}
Transformation results match: False


In [8]:
import datetime

current_date = datetime.datetime.now().strftime("%Y年%m月%d日")
print(f"更新日期: {current_date}")

更新日期: 2024年12月01日
