In [None]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import pandas
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

def run_experiment(model, X, y, tscv, experiment_name, model_name, params=None):
    """Cross-validate ``model`` and record the result as one MLflow run.

    For every split produced by ``tscv.split(X)`` the model is re-fitted on
    the training rows and scored on the held-out rows. The per-fold MSE and
    R2 are averaged, logged to MLflow as ``avg_mse`` / ``avg_r2`` and
    returned to the caller.

    Args:
        model: Estimator exposing ``set_params``, ``fit`` and ``predict``.
            It is modified in place (``set_params`` and ``fit``).
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target values; rows are selected positionally with ``.iloc``.
        tscv: Splitter whose ``split(X)`` yields ``(train_index, test_index)``.
        experiment_name: MLflow experiment the run is filed under.
        model_name: Logged as the ``model`` param and used as the run name.
        params: Optional dict of estimator parameters applied before fitting
            and logged to MLflow.

    Returns:
        dict: ``{"avg_mse": float, "avg_r2": float}``.

    Raises:
        ValueError: If ``tscv.split(X)`` yields no folds.
    """
    mlflow.set_experiment(experiment_name)

    # Naming the run makes it identifiable in the MLflow UI.
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model", model_name)
        if params:
            model.set_params(**params)
            mlflow.log_params(params)

        fold_mse = []
        fold_r2 = []
        for train_index, test_index in tscv.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            fold_mse.append(mean_squared_error(y_test, y_pred))
            fold_r2.append(r2_score(y_test, y_pred))

        if not fold_mse:
            # Without this guard the averages below fail with an opaque
            # ZeroDivisionError.
            raise ValueError("tscv.split(X) produced no folds; nothing to evaluate")

        avg_mse = sum(fold_mse) / len(fold_mse)
        avg_r2 = sum(fold_r2) / len(fold_r2)

        mlflow.log_metric("avg_mse", avg_mse)
        mlflow.log_metric("avg_r2", avg_r2)

        # The logged artifact is the estimator as fitted on the last fold's
        # training rows, since that is the final fit() call above.
        mlflow.sklearn.log_model(model, "model")

    return {"avg_mse": avg_mse, "avg_r2": avg_r2}

# Select the active MLflow experiment for the session. run_experiment() calls
# mlflow.set_experiment() again with its own experiment_name argument, so runs
# started through it are filed under whatever name is passed there.
mlflow.set_experiment("Coffee Shop Transaction Prediction")

In [None]:
# Load the processed hourly table; the path is relative to the working directory.
data = pandas.read_csv("data/processed/hourly_data.csv")
target = "next_hour_transactions"

# Every candidate predictor column, in the order it is fed to the models.
all_features = [
    'hour',
    'day_of_week',
    'is_weekend',
    'Americano_avg_3h',
    'Americano_avg_6h',
    'Americano_avg_12h',
    'Americano with Milk_avg_3h',
    'Americano with Milk_avg_6h',
    'Americano with Milk_avg_12h',
    'Cappuccino_avg_3h',
    'Cappuccino_avg_6h',
    'Cappuccino_avg_12h',
    'Cocoa_avg_3h',
    'Cocoa_avg_6h',
    'Cocoa_avg_12h',
    'Cortado_avg_3h',
    'Cortado_avg_6h',
    'Cortado_avg_12h',
    'Espresso_avg_3h',
    'Espresso_avg_6h',
    'Espresso_avg_12h',
    'Hot Chocolate_avg_3h',
    'Hot Chocolate_avg_6h',
    'Hot Chocolate_avg_12h',
    'Latte_avg_3h',
    'Latte_avg_6h',
    'Latte_avg_12h',
    'total_coffee_3h',
    'total_coffee_6h',
    'total_coffee_12h',
    'prev_money',
    'prev_cash_type',
    'coffee_entropy',
]

# Fail early with a readable message if the CSV schema has drifted.
missing_columns = [col for col in all_features + [target] if col not in data.columns]
if missing_columns:
    raise KeyError(f"hourly_data.csv is missing expected columns: {missing_columns}")

# TimeSeriesSplit cuts folds by row position, so the rows must be ordered
# before X and y are sliced.
# NOTE(review): read_csv is called without index_col, so this sorts a default
# integer index. If the file has a timestamp column, sort by that instead.
hourly_data = data.sort_index()
X = hourly_data[all_features]
y = hourly_data[target]

n_splits = 5
# NOTE(review): test_size=24 presumably means 24 hourly rows (one day) per
# test fold; confirm against how the CSV is built.
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=24)

# Named groups of columns, selected by naming convention.
time_features = ['hour', 'day_of_week', 'is_weekend']
coffee_avg_features = [f for f in all_features if '_avg_' in f]
total_coffee_features = [f for f in all_features if 'total_coffee_' in f]
prev_transaction_features = ['prev_money', 'prev_cash_type']
entropy_feature = ['coffee_entropy']
coffee_features = [col for col in X.columns if 'avg' in col or 'total_coffee' in col or 'entropy' in col]

In [None]:
# Candidate regressors, keyed by the display name used when reporting results.
# Insertion order is the order in which they are evaluated.
models = {}
models["Linear Regression"] = LinearRegression()
models["XGBoost"] = XGBRegressor(random_state=42)
models["Random Forest"] = RandomForestRegressor(random_state=42)

In [3]:
# Named column subsets to compare against each other.
feature_sets = {
    "All Features": all_features,
    "Time Features": time_features,
    "Coffee Averages": coffee_avg_features,
    "Total Coffee": total_coffee_features,
    "Previous Transaction": prev_transaction_features,
    "Time + Coffee Averages": time_features + coffee_avg_features,
    "Time + Total Coffee": time_features + total_coffee_features,
    "Time + Previous Transaction": time_features + prev_transaction_features,
    # all_features already lists coffee_entropy, so plain concatenation would
    # select that column twice. De-duplicate while keeping the order.
    "All + Entropy": list(dict.fromkeys(all_features + entropy_feature)),
}


def _evaluate_fold(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on one training split and score it on the held-out split.

    Returns:
        tuple: ``(mse, mae, r2)`` computed on ``X_test`` / ``y_test``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return (
        mean_squared_error(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )


mlflow.set_experiment("Coffee Shop Predictions")

# One record per (fold, model, feature set); turned into a DataFrame once at
# the end.
results = []

for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for model_name, model in models.items():
        for feature_set_name, features in feature_sets.items():
            run_name = f"{model_name} - {feature_set_name} - fold {fold}"
            with mlflow.start_run(run_name=run_name):
                mlflow.log_params({
                    "model": model_name,
                    "feature_set": feature_set_name,
                    "fold": fold,
                })
                mse, mae, r2 = _evaluate_fold(
                    model, X_train[features], X_test[features], y_train, y_test
                )
                mlflow.log_metrics({"mse": mse, "mae": mae, "r2": r2})

            results.append({
                "Fold": fold,
                "Model": model_name,
                "Feature Set": feature_set_name,
                "MSE": mse,
                "MAE": mae,
                "R2": r2
            })

# The notebook imports pandas without the `pd` alias.
results_df = pandas.DataFrame(results)

2024/07/15 21:12:43 INFO mlflow.tracking.fluent: Experiment with name 'Coffee Shop Predictions' does not exist. Creating a new experiment.


In [4]:
# Feature selection
# This full-data fit is for inspection only: it shows which ten columns score
# highest under f_regression. It must not decide the inputs of a
# cross-validated model, because held-out rows would then influence selection.
selector = SelectKBest(f_regression, k=10)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()].tolist()

# For the evaluated model the selector sits inside a pipeline, so it is
# re-fitted on the training rows of each fold.
selected_features_model = Pipeline([
    ("select", SelectKBest(f_regression, k=10)),
    ("regress", LinearRegression()),
])
run_experiment(selected_features_model, X, y, tscv, "Coffee Shop Predictions", "Linear Regression - Selected Features")

# XGBoost with hand-picked hyperparameters (no search is performed here).
# Seeded like the XGBoost entry in `models`.
xgb_params = {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}
run_experiment(XGBRegressor(random_state=42), X, y, tscv, "Coffee Shop Predictions", "XGBoost - Tuned", xgb_params)

