# Ensenmble Techniques


***Ensemble techniques in machine learning function much like seeking advice from multiple sources before making a significant decision***

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [60]:
from sklearn.datasets import make_regression

x , y = make_regression(n_samples=1000 , n_features=2 , noise=0.1 , random_state=42)


In [61]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x , y , test_size = 0.2 , random_state = 0)

In [45]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ridge_regression

# Linear Regression

In [46]:
# Create a LinearRegression model
model = LinearRegression()

# Define the parameters grid for GridSearchCV
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False],
    'copy_X':  [True, False]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

# Evaluate the best model on the test set

Best parameters: {'copy_X': True, 'fit_intercept': False, 'positive': True}
Best score: 0.9999936745842829


In [47]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(copy_X=True , fit_intercept=False , n_jobs=None  , positive=True)
model.fit(X_train, y_train)

In [48]:
y_pred = model.predict(X_test)

In [49]:
# Evaluate
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MAE: 0.07716560591014626
MSE: 0.009381034499037877
RMSE: 0.09685574066124257
R^2: 0.9999939603140058


# Logistic Regression

In [54]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

# Load data
data = load_iris()
X = data.data
y = (data.target == 2).astype(int)  # Binary classification: class 2 vs. not class 2

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a LogisticRegression model
model = LogisticRegression(solver='liblinear')  # Choosing 'liblinear' solver for binary classification

# Define the parameter grid for GridSearchCV
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'fit_intercept': [True, False],
    'solver': ['liblinear'],  # 'liblinear' supports both 'l1' and 'l2' penalties
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
y_pred_best_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate metrics for the best model
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)
roc_auc_best = roc_auc_score(y_test, y_pred_best_proba)

print("\nBest Model Performance:")
print(f"Accuracy: {accuracy_best}")
print(f"Precision: {precision_best}")
print(f"Recall: {recall_best}")
print(f"F1 Score: {f1_best}")
print(f"ROC AUC: {roc_auc_best}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_best)}")

# Train a default LogisticRegression model for comparison
default_model = LogisticRegression(solver='liblinear')
default_model.fit(X_train, y_train)
y_pred_default = default_model.predict(X_test)
y_pred_default_proba = default_model.predict_proba(X_test)[:, 1]

# Calculate metrics for the default model
accuracy_default = accuracy_score(y_test, y_pred_default)
precision_default = precision_score(y_test, y_pred_default)
recall_default = recall_score(y_test, y_pred_default)
f1_default = f1_score(y_test, y_pred_default)
roc_auc_default = roc_auc_score(y_test, y_pred_default_proba)

print("\nDefault Model Performance:")
print(f"Accuracy: {accuracy_default}")
print(f"Precision: {precision_default}")
print(f"Recall: {recall_default}")
print(f"F1 Score: {f1_default}")
print(f"ROC AUC: {roc_auc_default}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_default)}")


Best parameters: {'C': 0.1, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'liblinear'}
Best score: 0.9666666666666668

Best Model Performance:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC AUC: 1.0
Confusion Matrix:
[[19  0]
 [ 0 11]]

Default Model Performance:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC AUC: 1.0
Confusion Matrix:
[[19  0]
 [ 0 11]]




# Ridge regression

In [55]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'fit_intercept': [True, False],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
}

# Create a Ridge regression model
from sklearn.linear_model import Ridge

model = Ridge()


# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

Best parameters: {'alpha': 1, 'fit_intercept': True, 'solver': 'saga'}
Best score: -0.0984996605480198


50 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mdaza\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mdaza\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\mdaza\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_ridge.py", line 1175, in fit
    return super().fit(X, y, sample_weight=sample_weight)
  File "c:\Users\mdaza\A

In [56]:
# Calculate metrics for the best model
mae_best = mean_absolute_error(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)
rmse_best = np.sqrt(mse_best)
r2_best = r2_score(y_test, y_pred_best)

print("\nBest Model Performance:")
print(f"MAE: {mae_best}")
print(f"MSE: {mse_best}")
print(f"RMSE: {rmse_best}")
print(f"R^2: {r2_best}")


Best Model Performance:
MAE: 0.23156565864320286
MSE: 0.07182592577638221
RMSE: 0.2680035928423017
R^2: 0.690701755029933


# Voting Classifier

In [67]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Generate a regression dataset
X, y = make_regression(n_samples=1000, n_features=2, noise=0.1, random_state=42)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the base models
models = [
    ('linear_regression', LinearRegression()),
    ('ridge', Ridge())
]

# Create the VotingRegressor
voting_regressor = VotingRegressor(estimators=models)

# Fit the VotingRegressor to the training data
voting_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = voting_regressor.predict(X_test)

# Calculate metrics for the VotingRegressor
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Voting Regressor Performance:")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")


Voting Regressor Performance:
MAE: 0.07986545080620482
MSE: 0.010258434000090801
RMSE: 0.10128392764940941
R^2: 0.9999934988843757
