# Ensemble Techniques


In [3]:
# Q1. Building a Pipeline for Feature Engineering and Modeling
# Load the Breast Cancer Dataset from scikit-learn.
# Preprocess Data: Handle missing values (if any), scale numerical features, and encode categorical features (if any).
# Feature Selection: Use SelectFromModel to automatically select important features.
# Build a Random Forest Classifier: Train the classifier on the selected features.
# Evaluate the Model: Use accuracy as the metric to evaluate the classifier's performance.
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Step 1: Load the Breast Cancer dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name='target')

# Step 2: Preprocess Data
# No missing values handling needed for this dataset, but we'll scale the features.
# The scaler is only *defined* here; it is fitted later, as a pipeline step, on
# the training split alone so that no test-set statistics leak into it.
numeric_features = X.select_dtypes(include=np.number).columns
preprocessor = Pipeline([
    ('scaler', StandardScaler())
])

# Step 3: Feature Selection using SelectFromModel
# A RandomForestClassifier supplies the feature importances used for selection.
# Like the scaler, the selector is fitted inside the pipeline on training data
# only; fitting it on the full dataset would let test labels influence which
# features are kept.
feature_selector = SelectFromModel(RandomForestClassifier(random_state=42))

# Step 4: Build a Random Forest Classifier
# Split the RAW data first, then chain scaling -> selection -> classification so
# that every fitted step sees the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(random_state=42)
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', feature_selector),
    ('classifier', rf_classifier)
])
model_pipeline.fit(X_train, y_train)

# Step 5: Evaluate the Model
# predict() applies the already-fitted scaler and selector to the held-out rows
# before handing them to the classifier.
y_pred = model_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of Random Forest Classifier: {accuracy:.4f}")


Accuracy of Random Forest Classifier: 0.9561


In [None]:
# Explanation:
# Step 1: We load the Breast Cancer dataset using load_breast_cancer() function from scikit-learn. This dataset is already cleaned and formatted nicely, so we can directly work with it.

# Step 2: In the preprocessing step, we scale the numerical features using StandardScaler. No imputation is needed because the dataset has no missing values.

# Step 3: We use SelectFromModel with RandomForestClassifier as the estimator to select important features: it keeps the features whose importance in the fitted forest is at or above a threshold (the mean importance by default).

# Step 4: We build a Random Forest Classifier and train it on the features kept by the selector.

# Step 5: Finally, we evaluate the model's accuracy on the test set (X_test and y_test).

In [4]:
# Q2. Building a Pipeline with a Voting Classifier
# In this example, we'll create a pipeline that includes a Voting Classifier with three different classifiers: Logistic Regression, Decision Tree Classifier, and Support Vector Classifier (SVC).

# Steps:
# Load the Dataset: We'll use the Breast Cancer Wisconsin (Diagnostic) Dataset from scikit-learn, similar to the previous example.

# Preprocess Data: Handle missing values (if any), scale numerical features, and encode categorical features (if any).

# Build Individual Classifiers: Create instances of Logistic Regression, Decision Tree Classifier, and SVC.

# Build a Voting Classifier: Combine the individual classifiers using Voting Classifier.

# Train and Evaluate the Model: Train the pipeline on the training set and evaluate its performance on the test set.
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Step 1: Load the Breast Cancer dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name='target')

# Step 2: Preprocess Data
# No missing values handling needed for this dataset, but we'll scale the features.
# The scaler is only *defined* here. It is fitted exactly once, as the first
# pipeline step, on the training split; scaling the full dataset up front would
# leak test-set statistics and then scale the data a second time in the pipeline.
numeric_features = X.select_dtypes(include=np.number).columns
preprocessor = Pipeline([
    ('scaler', StandardScaler())
])

# Step 3: Build Individual Classifiers
logistic_regression = LogisticRegression(random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
svc = SVC(kernel='linear', random_state=42)

# Step 4: Build a Voting Classifier
voting_classifier = VotingClassifier(
    estimators=[('lr', logistic_regression), ('dt', decision_tree), ('svc', svc)],
    voting='hard'  # Majority vote over the predicted class labels
)

# Step 5: Build the Pipeline with the Voting Classifier
pipeline_voting = Pipeline([
    ('preprocessor', preprocessor),  # Scaler from Step 2, fitted on training data only
    ('classifier', voting_classifier)
])

# Step 6: Train-test split and Model Evaluation
# Split the RAW features; the pipeline is responsible for all preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline_voting.fit(X_train, y_train)

# Predict on the test data (the fitted scaler is applied automatically)
y_pred_voting = pipeline_voting.predict(X_test)

# Evaluate the model
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f"Accuracy of the Voting Classifier: {accuracy_voting:.4f}")


Accuracy of the Voting Classifier: 0.9649


In [5]:
# Explanation:
# Step 1: We load the Breast Cancer dataset using load_breast_cancer() function from scikit-learn. This dataset is already cleaned and formatted nicely.

# Step 2: In the preprocessing step, we scale the numerical features using StandardScaler. No imputation is needed because the dataset has no missing values.

# Step 3: We instantiate three individual classifiers: Logistic Regression, Decision Tree Classifier, and SVC with a linear kernel.

# Step 4: We build a Voting Classifier (voting_classifier) that includes these three classifiers. Here, we use 'hard' voting to make predictions based on the majority class label predicted by each classifier.

# Step 5: We create a pipeline (pipeline_voting) that includes the preprocessing steps and the Voting Classifier.

# Step 6: We split the data into training and test sets, fit the pipeline on the training data, predict on the test data, and evaluate the model's accuracy using accuracy_score.