# Bagged Decision Tree

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Initialize BaggingClassifier with DecisionTree as base estimator
model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=30,
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=data.target_names)

print(f"Accuracy (Bagged Decision Tree): {accuracy * 100:.2f}%")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)


Accuracy (Bagged Decision Tree): 95.32%

Confusion Matrix:
 [[ 59   4]
 [  4 104]]

Classification Report:
               precision    recall  f1-score   support

   malignant       0.94      0.94      0.94        63
      benign       0.96      0.96      0.96       108

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171



# Random Forest

In [2]:
# Import required libraries
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Breast Cancer dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Initialize the Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=100,        # number of trees
    max_depth=None,          # nodes are expanded until all leaves are pure
    random_state=42
)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate the model
print(f"Accuracy (Random Forest): {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=data.target_names))



Accuracy (Random Forest): 97.08%

Confusion Matrix:
 [[ 59   4]
 [  1 107]]

Classification Report:
               precision    recall  f1-score   support

   malignant       0.98      0.94      0.96        63
      benign       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171



# Bagged K-Nearest Neighbors (KNN)

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Create a BaggingClassifier using KNeighborsClassifier as base estimator
model = BaggingClassifier(
    estimator=KNeighborsClassifier(n_neighbors=5),
    n_estimators=50,
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (Bagged KNN): {accuracy * 100:.2f}%")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=data.target_names))


Accuracy (Bagged KNN): 96.49%

Confusion Matrix:
 [[ 59   4]
 [  2 106]]

Classification Report:
               precision    recall  f1-score   support

   malignant       0.97      0.94      0.95        63
      benign       0.96      0.98      0.97       108

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



# AdaBoost (Adaptive Boosting)

In [4]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Create an AdaBoost model using Decision Tree (depth=1) as base learner
model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (AdaBoost): {accuracy * 100:.2f}%")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=data.target_names))




Accuracy (AdaBoost): 97.66%

Confusion Matrix:
 [[ 61   2]
 [  2 106]]

Classification Report:
               precision    recall  f1-score   support

   malignant       0.97      0.97      0.97        63
      benign       0.98      0.98      0.98       108

    accuracy                           0.98       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.98      0.98      0.98       171



# Stochastic Gradient Boosting (Gradient Boosted Trees)

In [5]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Create a Gradient Boosting model
model = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,  # Adds stochasticity
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy (Gradient Boosting): {accuracy * 100:.2f}%")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=data.target_names))



Accuracy (Gradient Boosting): 95.91%

Confusion Matrix:
 [[ 60   3]
 [  4 104]]

Classification Report:
               precision    recall  f1-score   support

   malignant       0.94      0.95      0.94        63
      benign       0.97      0.96      0.97       108

    accuracy                           0.96       171
   macro avg       0.95      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



# Voting Ensemble Method

In [6]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define base classifiers
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
gboost = GradientBoostingClassifier(n_estimators=50, random_state=42)
logreg = LogisticRegression(max_iter=1000)

# Create Voting Classifier using soft voting
voting_clf = VotingClassifier(estimators=[
    ('adaboost', adaboost),
    ('gboost', gboost),
    ('logreg', logreg)
], voting='hard')  # Change to 'soft' for soft voting

# Train the Voting Classifier
voting_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy (Voting with Boosting Models): {accuracy * 100:.2f}%")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=data.target_names))




Accuracy (Voting with Boosting Models): 97.08%

Confusion Matrix:
 [[ 60   3]
 [  2 106]]

Classification Report:
               precision    recall  f1-score   support

   malignant       0.97      0.95      0.96        63
      benign       0.97      0.98      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Grid Search for Hyperparameter Tuning

In [7]:
# import necessary libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Load dataset from UCI Machine Learning Repository
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv(url, names=columns)

# Prepare features and target
X = data.iloc[:, 0:8]
y = data.iloc[:, 8]

# Define alpha values to search
alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001])
param_grid = {'alpha': alphas}

# Grid Search with Ridge Regression
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid.fit(X, y)

# Print best results
print(f"Best Score from Grid Search: {grid.best_score_:.4f}")
print(f"Best Alpha: {grid.best_estimator_.alpha}")



  from pandas.core import (


Best Score from Grid Search: 0.2761
Best Alpha: 1.0


# Randomized Search for Hyperparameter Tuning

In [8]:
# import required libraries
import numpy as np
from pandas import read_csv
from scipy.stats import uniform
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

# Load the dataset directly from online
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(url, names=column_names)

# Prepare the data
X = data.iloc[:, 0:8].values   # Features
Y = data.iloc[:, 8].values     # Target

# Define the parameter distribution
param_distributions = {'alpha': uniform(loc=0, scale=1)}  # Search alpha in range [0, 1]

# Initialize Ridge Regression model and RandomizedSearchCV
model = Ridge()
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=50,             # Number of random combinations to try
    random_state=42,
    cv=5                   # 5-fold cross-validation
)

# Fit the model with training data
random_search.fit(X, Y)

# Print the best results
print("Best Cross-Validation Score: {:.4f}".format(random_search.best_score_))
print("Best Alpha Value: {:.4f}".format(random_search.best_estimator_.alpha))



Best Cross-Validation Score: 0.2761
Best Alpha Value: 0.9699
