# Metrics (metrics.py)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

def get_metrics(y_true, y_pred):
    """Score predictions against ground truth with four regression metrics.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict with the keys ``'r2'``, ``'mae'``, ``'mse'`` and ``'mape'``, each
        holding the value returned by the corresponding sklearn metric.
    """
    # (result key, metric function) pairs, evaluated in this order.
    scorers = (
        ('r2', r2_score),
        ('mae', mean_absolute_error),
        ('mse', mean_squared_error),
        ('mape', mean_absolute_percentage_error),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# Model Selector (model_selector.py)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, train_test_split

class ModelSelector:
    """Fit a set of candidate models and pick the best one on a test split.

    Args:
        models: Mapping of display name -> unfitted estimator. It is iterated
            with ``.items()`` and every estimator must provide ``fit`` and
            ``predict`` (classifiers additionally ``score``).
    """

    # Scorer names handed to ``cross_validate`` for each task type.
    _REGRESSION_SCORING = (
        'r2', 'neg_mean_absolute_error',
        'neg_mean_squared_error', 'neg_mean_absolute_percentage_error',
    )
    _CLASSIFICATION_SCORING = (
        'accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted',
    )

    def __init__(self, models):
        self.models = models

    def find_best_regressor(self, X_train, y_train, X_test, y_test, cv=10):
        """Fit every model and return the one with the highest test R-squared.

        For each model this prints the test R-squared and the mean of the
        cross-validated metrics computed on the training split.

        Args:
            X_train, y_train: Training features and targets.
            X_test, y_test: Held-out features and targets used for ranking.
            cv: Cross-validation setting forwarded to ``cross_validate``
                (defaults to 10 folds, as before).

        Returns:
            ``(name, model)`` of the best model; ``model`` is fitted on the
            training split.

        Raises:
            ValueError: If no model produced a comparable score (empty model
                mapping, or every test score was NaN).
        """
        def test_r2(model):
            return get_metrics(y_test, model.predict(X_test))['r2']

        return self._find_best(
            X_train, y_train, test_r2,
            kind="Regressor", metric="R-Squared",
            scoring=self._REGRESSION_SCORING, cv=cv,
        )

    def find_best_classifier(self, X_train, y_train, X_test, y_test, cv=10):
        """Fit every model and return the one with the highest test accuracy.

        Mirrors ``find_best_regressor``: prints the test accuracy and the mean
        cross-validated classification metrics for each model.

        Args:
            X_train, y_train: Training features and class labels.
            X_test, y_test: Held-out features and labels used for ranking.
            cv: Cross-validation setting forwarded to ``cross_validate``.

        Returns:
            ``(name, model)`` of the best model; ``model`` is fitted on the
            training split.

        Raises:
            ValueError: If no model produced a comparable score (empty model
                mapping, or every test score was NaN).
        """
        def test_accuracy(model):
            # scikit-learn classifiers report mean accuracy from ``score``.
            return model.score(X_test, y_test)

        return self._find_best(
            X_train, y_train, test_accuracy,
            kind="Classifier", metric="Accuracy",
            scoring=self._CLASSIFICATION_SCORING, cv=cv,
        )

    def _find_best(self, X_train, y_train, test_score_fn, *, kind, metric,
                   scoring, cv):
        """Shared fit / report / rank loop behind the public ``find_best_*``."""
        best_name = None
        best_model = None
        best_score = -np.inf

        for name, model in self.models.items():
            model.fit(X_train, y_train)
            test_score = test_score_fn(model)

            print(f"Model: {name.upper()}")
            print(f"Test {metric} Score: {test_score:.5f}\n")

            # cross_validate works on clones of the estimator (see the
            # scikit-learn docs), so ``model`` stays fitted on the full
            # training split.
            scores = cross_validate(model, X_train, y_train,
                                    scoring=list(scoring),
                                    cv=cv, return_train_score=False)
            print(self._summarize_cv(scores))
            print("\n############################################################################\n")

            # NOTE: ranking uses the test split, so the winning score is an
            # optimistic estimate of performance on unseen data.
            # NaN never compares greater, so NaN scores cannot win.
            if test_score > best_score:
                best_name, best_model, best_score = name, model, test_score

        if best_model is None:
            raise ValueError(
                "No model could be selected: the model mapping is empty or "
                "every test score was NaN."
            )

        print(f"Best {kind}: {best_name} with {metric}: {best_score:.5f}")
        return best_name, best_model

    @staticmethod
    def _summarize_cv(scores):
        """Average the per-fold values and format them for printing.

        ``neg_*`` scorers are sign-flipped and relabelled (``test_neg_x`` ->
        ``test_x``) so they read as plain errors. All other entries keep
        their sign, so a negative mean R-squared is shown as negative.
        """
        neg_prefix = "test_neg_"
        summary = {}
        for key, value in pd.DataFrame(scores).mean().items():
            if key.startswith(neg_prefix):
                key = "test_" + key[len(neg_prefix):]
                value = -value
            summary[key] = value
        return pd.Series(summary).apply("{:.5f}".format)

# Regressions (regression_models.py)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

def get_regressors():
    """Build the candidate regressors, each with default hyperparameters.

    Returns:
        dict mapping a display name to a newly constructed, unfitted
        regressor. A fresh set of instances is created on every call.
    """
    # (display name, estimator class) pairs, instantiated in this order.
    factories = [
        ("Linear Regression", LinearRegression),
        ("Ridge Regression", Ridge),
        ("Lasso Regression", Lasso),
        ("Elastic Net Regression", ElasticNet),
        ("Random Forest Regression", RandomForestRegressor),
        ("Gradient Boosting Regression", GradientBoostingRegressor),
        ("Bayesian Regression", BayesianRidge),
        ("SVR", SVR),
        ("Decision Tree Regression", DecisionTreeRegressor),
        ("KNN Regression", KNeighborsRegressor),
        ("XGBRegressor", XGBRegressor),
    ]
    return {label: factory() for label, factory in factories}

# Classifications (classification_models.py)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

def get_classifiers():
    """Build the candidate classifiers, each with default hyperparameters.

    Returns:
        dict mapping a display name to a newly constructed, unfitted
        classifier. A fresh set of instances is created on every call.
    """
    # (display name, estimator class) pairs, instantiated in this order.
    factories = [
        ("Logistic Regression", LogisticRegression),
        ("Random Forest Classifier", RandomForestClassifier),
        ("Gradient Boosting Classifier", GradientBoostingClassifier),
        ("Decision Tree Classifier", DecisionTreeClassifier),
        ("KNN Classifier", KNeighborsClassifier),
        ("SVC", SVC),
        ("XGBoost Classifier", XGBClassifier),
    ]
    return {label: factory() for label, factory in factories}

# Test Dataset

In [None]:
# Upload the dataset file into the Colab session; the recorded run uploaded
# starbucks.csv, which the next cell reads by that name.
from google.colab import files
uploaded = files.upload()

Saving starbucks.csv to starbucks.csv


In [None]:
# Load the uploaded CSV into a DataFrame.
df = pd.read_csv("starbucks.csv")

In [None]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0.1,Unnamed: 0,item,calories,fat,carb,fiber,protein,type
0,1,8-Grain Roll,350,8.0,67,5,10,bakery
1,2,Apple Bran Muffin,350,9.0,64,7,6,bakery
2,3,Apple Fritter,420,20.0,59,0,5,bakery
3,4,Banana Nut Loaf,490,19.0,75,4,7,bakery
4,5,Birthday Cake Mini Doughnut,130,6.0,17,0,0,bakery


In [None]:
# List the column labels available for feature/target selection.
df.columns

Index(['Unnamed: 0', 'item', 'calories', 'fat', 'carb', 'fiber', 'protein',
       'type'],
      dtype='object')

In [None]:
# Features: three nutrient columns. Target: the calorie count.
X = df.loc[:, ['fat', 'protein', 'fiber']]
y = df.loc[:, 'calories']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Report the dimensions of each split to confirm the train/test sizes.
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

X_train shape: (61, 3)
y_train shape: (61,)
X_test shape: (16, 3)
y_test shape: (16,)


# Regression Tests

In [None]:
# Build the candidate regressors, wrap them in a selector, and keep the name
# and fitted instance of the model with the highest test R-squared.
regressors = get_regressors()
selector = ModelSelector(regressors)
best_regressor, best_model = selector.find_best_regressor(X_train, y_train, X_test, y_test)

Model: LINEAR REGRESSION
Test R-Squared Score: 0.65305

fit_time                                      0.00261
score_time                                    0.00313
test_r2                                       0.51493
test_neg_mean_absolute_error                 51.29464
test_neg_mean_squared_error                3598.01870
test_neg_mean_absolute_percentage_error       0.18097
dtype: object

############################################################################

Model: RIDGE REGRESSION
Test R-Squared Score: 0.65288

fit_time                                      0.00288
score_time                                    0.00323
test_r2                                       0.51506
test_neg_mean_absolute_error                 51.29545
test_neg_mean_squared_error                3597.16265
test_neg_mean_absolute_percentage_error       0.18100
dtype: object

############################################################################

Model: LASSO REGRESSION
Test R-Squared Score: 0.65184



# Classification Tests

In [None]:
# The `selector` built for the regression tests only holds regressors, so the
# classifier candidates need a selector of their own.
classifiers = get_classifiers()
classifier_selector = ModelSelector(classifiers)
# NOTE(review): y_train/y_test come from the continuous `calories` column
# selected earlier; classifiers need a categorical target -- confirm the
# intended label column before running this cell.
best_classifier, best_model = classifier_selector.find_best_classifier(X_train, y_train, X_test, y_test)