In [7]:
import os
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import pandas
import mlflow
import mlflow.sklearn
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

def run_experiment(model, X_train, X_test, y_train, y_test, experiment_name, model_name, fold, params=None):
    """Fit and score one model on one fold, recording the result as an MLflow run.

    A new run is started under ``experiment_name``. The run receives the
    ``model`` and ``fold`` parameters, any hyper-parameter overrides given in
    ``params``, the test-set ``mse`` / ``r2`` / ``rmse`` metrics, and the
    fitted estimator (logged through ``mlflow.sklearn``).

    Args:
        model: Estimator exposing ``set_params``, ``fit`` and ``predict``.
            It is configured and fitted in place.
        X_train: Feature matrix used for fitting.
        X_test: Feature matrix used for prediction.
        y_train: Target values matching ``X_train``.
        y_test: Target values matching ``X_test``.
        experiment_name: Name of the MLflow experiment to log under.
        model_name: Label stored as the run's ``model`` parameter.
        fold: Fold identifier; stored as the ``fold`` parameter and used in
            the model artifact path (``model_fold_<fold>``).
        params: Optional dict of hyper-parameters. When non-empty it is
            applied with ``model.set_params`` and logged to the run.

    Returns:
        tuple: ``(mse, r2)`` measured on the test data. RMSE is logged to
        MLflow but is not part of the return value.
    """
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run():
        # Identify the run first, then record any hyper-parameter overrides.
        run_labels = {"model": model_name, "fold": fold}
        for label, value in run_labels.items():
            mlflow.log_param(label, value)
        if params:
            model.set_params(**params)
            mlflow.log_params(params)

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Compute every score before logging so the run receives them together.
        scores = {
            "mse": mean_squared_error(y_test, predictions),
            "r2": r2_score(y_test, predictions),
            "rmse": root_mean_squared_error(y_test, predictions),
        }
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        mlflow.sklearn.log_model(model, f"model_fold_{fold}")

    return scores["mse"], scores["r2"]

# Send MLflow tracking data to a local file store; the path is relative to
# the directory the notebook is executed from.
os.environ.update({"MLFLOW_TRACKING_URI": "file:../src/mlruns"})

# Select the experiment that subsequent runs are logged to. Left as the last
# expression so the cell displays the resulting Experiment object.
mlflow.set_experiment("Coffee Shop Transaction Prediction")

2024/07/16 21:02:25 INFO mlflow.tracking.fluent: Experiment with name 'Coffee Shop Transaction Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///workspaces/CoffeeSales/notebook/../src/mlruns/967960832031696142', creation_time=1721163745869, experiment_id='967960832031696142', last_update_time=1721163745869, lifecycle_stage='active', name='Coffee Shop Transaction Prediction', tags={}>

In [8]:
# Load the processed hourly dataset and name the regression target.
data = pd.read_csv("../data/processed/hourly_data.csv")
target = "next_hour_transactions"

# Building blocks for the rolling-window column names
# ("<drink>_avg_<N>h" and "total_coffee_<N>h").
drink_names = [
    'Americano',
    'Americano with Milk',
    'Cappuccino',
    'Cocoa',
    'Cortado',
    'Espresso',
    'Hot Chocolate',
    'Latte',
]
window_hours = (3, 6, 12)

# Feature groups, each usable on its own as a feature set.
time_features = ['hour', 'day_of_week', 'is_weekend']
coffee_avg_features = [
    f"{drink}_avg_{hours}h" for drink in drink_names for hours in window_hours
]
total_coffee_features = [f"total_coffee_{hours}h" for hours in window_hours]
prev_transaction_features = ['prev_money', 'prev_cash_type']
entropy_feature = ['coffee_entropy']

# Full feature list: time, per-drink averages, totals, previous transaction,
# entropy (33 columns).
all_features = (
    time_features
    + coffee_avg_features
    + total_coffee_features
    + prev_transaction_features
    + entropy_feature
)

# Order the rows BEFORE building X / y. Previously the sorted frame was
# created after X and y had been taken from the unsorted data and was never
# used, so the ordering step had no effect on the splits.
# NOTE(review): sort_index() orders by the row index only; if the CSV rows are
# not already chronological, sort by the timestamp column instead - confirm.
hourly_data = data.sort_index()

X = hourly_data[all_features]
y = hourly_data[target]

# Time-ordered cross-validation: each fold tests on the 24 rows that follow
# its training window (presumably one day of hourly rows - confirm).
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=24)

# Every coffee-derived column (averages, totals, entropy) in X's column order.
coffee_features = [col for col in X.columns if 'avg' in col or 'total_coffee' in col or 'entropy' in col]

In [9]:
# Estimators under comparison; the stochastic ones are seeded.
models = {
    "Linear Regression": LinearRegression(),
    "XGBoost": XGBRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
}

# Single-group feature sets.
single_group_sets = {
    "Time Features": time_features,
    "Coffee Averages": coffee_avg_features,
    "Total Coffee": total_coffee_features,
    "Previous Transaction": prev_transaction_features,
}

# Each non-time group is additionally evaluated together with the time features.
time_combined_sets = {
    f"Time + {group_name}": time_features + group_columns
    for group_name, group_columns in single_group_sets.items()
    if group_name != "Time Features"
}

# Feature-set label -> list of column names.
feature_sets = {
    "All Features": all_features,
    **single_group_sets,
    **time_combined_sets,
}

## First Experiment

In [10]:

# Evaluate every (model, feature set) pair on every time-series fold.
# Each call creates one MLflow run.
for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    for model_name, model in models.items():
        for feature_set_name, features in feature_sets.items():
            mse, r2 = run_experiment(
                model=model,
                X_train=X_train[features],
                X_test=X_test[features],
                y_train=y_train,
                y_test=y_test,
                experiment_name="Coffee Shop Transaction Prediction",
                model_name=f"{model_name}_{feature_set_name}",
                fold=fold,
            )

## Second Experiment

In [11]:
# Fresh estimator instances for this experiment (rebinds `models`).
models = {
    "Linear Regression": LinearRegression(),
    "XGBoost": XGBRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
}

for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Pick the 10 highest-scoring columns (f_regression) using only this
    # fold's training rows, so the test rows do not influence the selection.
    selector = SelectKBest(f_regression, k=10)
    selector.fit(X_train, y_train)
    selected_features = [
        column
        for column, is_kept in zip(X_train.columns, selector.get_support())
        if is_kept
    ]

    # Per-fold feature sets. This rebinds `feature_sets`, replacing any
    # mapping of the same name defined earlier in the notebook.
    feature_sets = {
        "All Features": X_train.columns.tolist(),
        "Selected Features": selected_features,
    }

    for model_name, model in models.items():
        for feature_set_name, features in feature_sets.items():
            mse, r2 = run_experiment(
                model=model,
                X_train=X_train[features],
                X_test=X_test[features],
                y_train=y_train,
                y_test=y_test,
                experiment_name="Coffee Shop Transaction Prediction",
                model_name=f"{model_name}_{feature_set_name}",
                fold=fold,
            )


## Third Experiment

In [12]:
# Hyper-parameters applied to (and logged for) the XGBoost runs.
xgb_params = {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}

# Estimators for this experiment (rebinds `models`). XGBoost is seeded with
# the same random_state as the other XGBoost / Random Forest instances in this
# notebook so that runs are reproducible and comparable.
models = {
    "Linear Regression": LinearRegression(),
    "XGBoost": XGBRegressor(random_state=42)
}

# One record per (fold, model, feature set) with the test-set scores.
results = []
for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Feature selection within the fold: fitted on the training rows only so
    # the test rows do not influence which columns are kept.
    selector = SelectKBest(f_regression, k=10)
    selector.fit(X_train, y_train)
    selected_features = X_train.columns[selector.get_support()].tolist()

    # Per-fold feature sets. This rebinds `feature_sets`, replacing any
    # mapping of the same name defined earlier in the notebook.
    feature_sets = {
        "All Features": X_train.columns.tolist(),
        "Selected Features": selected_features
    }

    for model_name, model in models.items():
        # Only XGBoost receives explicit hyper-parameters; the choice depends
        # on the model alone, so it is made once per model.
        params = xgb_params if model_name == "XGBoost" else None

        for feature_set_name, features in feature_sets.items():
            mse, r2 = run_experiment(
                model,
                X_train[features],
                X_test[features],
                y_train,
                y_test,
                "Coffee Shop Transaction Prediction",
                f"{model_name} - {feature_set_name}",
                fold,
                params
            )
            results.append({
                "Fold": fold,
                "Model": model_name,
                "Feature Set": feature_set_name,
                "MSE": mse,
                "R2": r2
            })
