In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Load the Iris dataset and pull out the feature matrix (X) and class labels (y)
iris = load_iris()
X = iris.data
y = iris.target

In [6]:
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Assemble the pipeline: standardize -> keep the 2 best-scoring features -> classify
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('feature_selector', SelectKBest(score_func=f_classif, k=2)),
        (
            'classifier',
            RandomForestClassifier(n_estimators=100, random_state=42),
        ),
    ],
)

In [9]:
# Train all pipeline stages (scaler, selector, classifier) on the training split only
pipeline.fit(X=X_train, y=y_train)

In [10]:
# Predict class labels for the held-out test samples
y_predict = pipeline.predict(X=X_test)

In [12]:
# Overall fraction of test samples that were labelled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision, recall, F1-score and support
report = classification_report(y_true=y_test, y_pred=y_predict)
print('Classification Report:\n', report)

Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In this example, the pipeline includes the following steps:

StandardScaler: Standardizes the feature values by removing the mean and scaling to unit variance.

SelectKBest: Scores each feature with the ANOVA F-statistic (f_classif) and keeps the k highest-scoring features. In this case, k = 2, so only the top 2 features are passed on to the classifier.

RandomForestClassifier: Uses a Random Forest classifier for the final classification task.

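The pipeline ensures that the preprocessing steps and model training are applied consistently: the scaler and feature selector are fitted on the training data only, and the same fitted transformations are then applied to the test data at prediction time, which avoids leaking information from the test set into the model. This is particularly useful when deploying models in production, as the entire pipeline can be easily saved, loaded, and applied to new data. Additionally, hyperparameter tuning and cross-validation can be integrated into the pipeline for a more comprehensive model development process.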