In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor, BaggingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
abalone = fetch_ucirepo(id=1) 
  
# data (as pandas dataframes) 
X = abalone.data.features 
y = abalone.data.targets 

In [35]:
# Bagging Regressor
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("bagging", BaggingRegressor(n_jobs= -1))
    ]
)

parameters = {
    "bagging__estimator": [DecisionTreeRegressor(), KNeighborsRegressor(), RandomForestRegressor()],
    "bagging__n_estimators": [10, 25, 50, 100, 150]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='r2')
gscv_fitted = gscv.fit(X, y["Rings"])
test_scores = gscv_fitted.cv_results_["mean_test_score"]



In [36]:
gscv_fitted.best_estimator_

In [40]:
print(f"Average R^2 for Bagged Decission Tree Regressor: {np.mean(test_scores[0:5])}")

Average R^2 for Bagged Decission Tree Regressor: 0.42626936398517357


In [44]:
print(f"Average R^2 for Bagged KNeighbors Regressor: {np.mean(test_scores[5:10])}")

Average R^2 for Bagged KNeighbors Regressor: 0.41934320776533324


In [42]:
print(f"Average R^2 for Bagged Random Forest Regressor: {np.mean(test_scores[10:15])}")

Average R^2 for Bagged Random Forest Regressor: 0.4726450739531696


In [24]:
# Bagging Classifier
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("bagging", BaggingClassifier(n_jobs= -1, verbose = 1))
    ]
)

parameters = {
    "bagging__estimator": [DecisionTreeClassifier(), KNeighborsClassifier(n_jobs=-1), RandomForestClassifier(n_jobs=-1)],
    "bagging__n_estimators": [10, 25, 50, 100, 150]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='accuracy')
gscv_fitted = gscv.fit(X, y["Rings"])
test_scores = gscv_fitted.cv_results_["mean_test_score"]

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.7s remaining:    2.9s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.8s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parall

In [25]:
gscv_fitted.best_estimator_

In [30]:
print(f"Average Accuracy for Bagged Decission Tree Classifier: {np.mean(test_scores[0:5])}")

Average Accuracy for Bagged Decission Tree Classifier: 0.2369709194052087


In [33]:
print(f"Average Accuracy for Bagged KNeighbors Classifier: {np.mean(test_scores[5:10])}")

Average Accuracy for Bagged KNeighbors Classifier: 0.2220286508323067


In [34]:
print(f"Average Accuracy for Bagged Random Forest Classifier: {np.mean(test_scores[10:15])}")

Average Accuracy for Bagged Random Forest Classifier: 0.26024135461135145


In [45]:
# Random Forest Regressor
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("forest", RandomForestRegressor(n_jobs= -1))
    ]
)

parameters = {
    "forest__n_estimators": [10, 25, 50, 100, 150],
    "forest__min_samples_split": [2, 3, 4, 5, 10],
    "forest__min_samples_leaf": [1, 2, 3, 4, 5, 10]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='r2')
gscv_fitted = gscv.fit(X, y["Rings"])
test_scores = gscv_fitted.cv_results_["mean_test_score"]

In [46]:
gscv_fitted.best_estimator_

In [47]:
print(f"Average R^2 for Random Forest Regressor {np.mean(test_scores)}")

Average R^2 for Random Forest Regressor 0.4547783835227898


In [48]:
# Random Forest Classifier
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("forest", RandomForestClassifier(n_jobs= -1))
    ]
)

parameters = {
    "forest__n_estimators": [10, 25, 50, 100, 150],
    "forest__min_samples_split": [2, 3, 4, 5, 10],
    "forest__min_samples_leaf": [1, 2, 3, 4, 5, 10]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='r2')
gscv_fitted = gscv.fit(X, y["Rings"])
test_scores = gscv_fitted.cv_results_["mean_test_score"]



In [49]:
gscv_fitted.best_estimator_

In [51]:
print(f"Average R^2 for Random Forest Classifier {np.mean(test_scores)}")

Average R^2 for Random Forest Classifier 0.378032225621694
