In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import catboost as cat
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("../datasets/coffee_shop_revenue.csv")

target = "Daily_Revenue"
features = list(df.columns)
features.remove(target)

# Simple Linear Regression

In [21]:
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df[features])
scaled_df = pd.DataFrame(scaled_df, columns=features)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df[features], df[target], test_size=0.33, random_state=2025)

In [23]:
simple_model = LinearRegression()
simple_model.fit(X_train, y_train)

In [24]:
preds_train = simple_model.predict(X_train)
preds_test = simple_model.predict(X_test)

In [25]:
def get_metrics(y_true, y_pred, type):
    r2 = metrics.r2_score(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_true, y_pred)

    print(f"Type Dataset - {type}")
    print(f"R2 = {r2}")
    print(f"MSE = {mse}")
    print(f"MAE = {mae}")
    print(f"MAPE = {mape}")

In [26]:
get_metrics(y_train, preds_train, type='train')

Type Dataset - train
R2 = 0.8923393043201814
MSE = 103323.18455334942
MAE = 252.39268818059753
MAPE = 0.19664217633836406


In [27]:
get_metrics(y_test, preds_test, type='test')

Type Dataset - test
R2 = 0.8893558797216421
MSE = 103764.34304032168
MAE = 254.2171406250317
MAPE = 0.19453890446431574


# Combine with classifier

## starts

In [31]:
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df[features])
scaled_df = pd.DataFrame(scaled_df, columns=features)
scaled_df = pd.concat([scaled_df, df[target]], axis=1)
scaled_df['class'] = scaled_df[target].apply(lambda x: 1 if x >= 2_000 else 0)

In [47]:
scaled_df['class'].value_counts()

class
0    1189
1     811
Name: count, dtype: int64

In [33]:
clf_target = 'class'

In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df[features], scaled_df[[target, clf_target]], test_size=0.33, random_state=2025)

df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

## fit classifier

In [45]:
clf = RandomForestClassifier()
clf.fit(df_train[features], df_train[clf_target])

clf_pred_train = clf.predict(df_train[features])
clf_pred_test = clf.predict(df_test[features])

In [49]:
print(metrics.classification_report(df_train[clf_target], clf_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       799
           1       1.00      1.00      1.00       541

    accuracy                           1.00      1340
   macro avg       1.00      1.00      1.00      1340
weighted avg       1.00      1.00      1.00      1340



In [50]:
print(metrics.classification_report(df_test[clf_target], clf_pred_test))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95       390
           1       0.94      0.90      0.92       270

    accuracy                           0.93       660
   macro avg       0.94      0.93      0.93       660
weighted avg       0.93      0.93      0.93       660



## fit regression

In [54]:
df_test

Unnamed: 0,Number_of_Customers_Per_Day,Average_Order_Value,Operating_Hours_Per_Day,Number_of_Employees,Marketing_Spend_Per_Day,Location_Foot_Traffic,Daily_Revenue,class,clf_pred
1746,0.222717,0.836000,0.818182,0.833333,0.871227,0.237092,1994.80,0,0
844,0.024499,0.496000,0.636364,0.916667,0.562293,0.663857,628.88,0,0
1520,0.251670,0.677333,0.454545,0.416667,0.668825,0.446786,1525.29,0,0
1829,0.238307,0.786667,0.727273,0.250000,0.449696,0.348788,1856.18,0,0
719,0.968820,0.177333,0.000000,0.750000,0.539357,0.239199,2259.50,1,0
...,...,...,...,...,...,...,...,...,...
83,0.688196,0.397333,0.000000,1.000000,0.416241,0.318230,1769.92,0,1
1223,0.710468,0.122667,0.272727,0.750000,0.019893,0.839831,1255.72,0,0
489,0.338530,0.665333,0.272727,0.833333,0.803542,0.572181,1888.87,0,0
731,0.884187,0.733333,0.727273,0.500000,0.405784,0.291886,3593.75,1,1


In [56]:
df_test[df_test['clf_pred'] == 0]

Unnamed: 0,Number_of_Customers_Per_Day,Average_Order_Value,Operating_Hours_Per_Day,Number_of_Employees,Marketing_Spend_Per_Day,Location_Foot_Traffic,Daily_Revenue,class,clf_pred
1746,0.222717,0.836000,0.818182,0.833333,0.871227,0.237092,1994.80,0,0
844,0.024499,0.496000,0.636364,0.916667,0.562293,0.663857,628.88,0,0
1520,0.251670,0.677333,0.454545,0.416667,0.668825,0.446786,1525.29,0,0
1829,0.238307,0.786667,0.727273,0.250000,0.449696,0.348788,1856.18,0,0
719,0.968820,0.177333,0.000000,0.750000,0.539357,0.239199,2259.50,1,0
...,...,...,...,...,...,...,...,...,...
1726,0.657016,0.353333,0.909091,1.000000,0.267840,0.012645,1582.05,0,0
103,0.815145,0.130667,0.545455,0.916667,0.534966,0.147524,1908.69,0,0
558,0.603563,0.432000,0.636364,0.750000,0.207426,0.751317,1553.05,0,0
1223,0.710468,0.122667,0.272727,0.750000,0.019893,0.839831,1255.72,0,0


In [58]:
df_train_0 = df_train[df_train[clf_target] == 0]
df_train_1 = df_train[df_train[clf_target] == 1]

clf_pred_test = clf.predict(df_test[features])
df_test['clf_pred'] = clf_pred_test

df_test_0 = df_test[df_test['clf_pred'] == 0]
df_test_1 = df_test[df_test['clf_pred'] == 1]

### lr for class 0

In [59]:
lr_0 = LinearRegression()
lr_0.fit(df_train_0[features], df_train_0[target])
pred_train_0 = lr_0.predict(df_train_0[features])
pred_test_0 = lr_0.predict(df_test_0[features])

In [60]:
def get_metrics(y_true, y_pred, type_dataset, type_clf_mark):
    r2 = metrics.r2_score(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_true, y_pred)

    print(f"Type Dataset - {type_dataset}, class - {type_clf_mark}")
    print(f"R2 = {r2}")
    print(f"MSE = {mse}")
    print(f"MAE = {mae}")
    print(f"MAPE = {mape}")

In [61]:
get_metrics(df_train_0[target], pred_train_0, 'train', '0')

Type Dataset - train, class - 0
R2 = 0.708400690140345
MSE = 59495.691799538734
MAE = 196.71533304314073
MAPE = 0.21265927780142632


In [62]:
get_metrics(df_test_0[target], pred_test_0, 'test', '0')

Type Dataset - test, class - 0
R2 = 0.7195730453104919
MSE = 62471.767943789506
MAE = 201.11602696374263
MAPE = 0.1799941548320469


### lr for class 1

In [63]:
lr_1 = LinearRegression()
lr_1.fit(df_train_1[features], df_train_1[target])
pred_train_1 = lr_1.predict(df_train_1[features])
pred_test_1 = lr_1.predict(df_test_1[features])

In [64]:
get_metrics(df_train_1[target], pred_train_1, 'train', '1')

Type Dataset - train, class - 1
R2 = 0.8973319318699309
MSE = 48367.471627302606
MAE = 177.0375983305565
MAPE = 0.06491786753241505


In [65]:
get_metrics(df_test_1[target], pred_test_1, 'test', '1')

Type Dataset - test, class - 1
R2 = 0.8921076811646174
MSE = 49307.55680064364
MAE = 179.52245922956197
MAPE = 0.06744492985769658


### collect all

In [76]:
get_metrics(pd.concat([df_train_0[target], df_train_1[target]]), 
             np.hstack([pred_train_0, pred_train_1]), 
             'train', '0&1')

Type Dataset - train, class - 0&1
R2 = 0.9426880965110764
MSE = 55002.88052104638
MAE = 188.77081477485115
MAPE = 0.15301143977490758


In [77]:
get_metrics(pd.concat([df_test_0[target], df_test_1[target]]), 
             np.hstack([pred_test_0, pred_test_1]), 
             'test', '0&1')

Type Dataset - test, class - 0&1
R2 = 0.9388947011131245
MSE = 57305.81235882772
MAE = 192.64218750442024
MAPE = 0.13582711048605187


__ex results:__

Type Dataset - test
- R2 = 0.8893558797216421
- MSE = 103764.34304032168
- MAE = 254.21714062503173
- MAPE = 0.19453890446431574

# Catboost

In [36]:
cat_model = cat.CatBoostRegressor(random_seed = 2025)
cat_model.fit(df_train[features], y_train[target])

Learning rate set to 0.042881
0:	learn: 946.1464063	total: 1.25ms	remaining: 1.25s
1:	learn: 915.9680527	total: 2ms	remaining: 998ms
2:	learn: 887.1856476	total: 2.81ms	remaining: 934ms
3:	learn: 860.4618262	total: 3.71ms	remaining: 923ms
4:	learn: 835.8096332	total: 4.7ms	remaining: 936ms
5:	learn: 809.4329383	total: 5.58ms	remaining: 924ms
6:	learn: 783.2869672	total: 6.39ms	remaining: 907ms
7:	learn: 760.2760670	total: 7.25ms	remaining: 899ms
8:	learn: 737.0142101	total: 8.02ms	remaining: 883ms
9:	learn: 712.8596707	total: 8.71ms	remaining: 863ms
10:	learn: 691.7418662	total: 9.49ms	remaining: 853ms
11:	learn: 672.4406490	total: 11.1ms	remaining: 918ms
12:	learn: 651.9863868	total: 12.2ms	remaining: 928ms
13:	learn: 632.3544979	total: 13.4ms	remaining: 947ms
14:	learn: 614.3785062	total: 14.3ms	remaining: 942ms
15:	learn: 596.0390920	total: 15.4ms	remaining: 948ms
16:	learn: 579.6794335	total: 16.4ms	remaining: 948ms
17:	learn: 562.4597851	total: 17.3ms	remaining: 941ms
18:	learn: 5

<catboost.core.CatBoostRegressor at 0x150cfb8d0>

In [37]:
def get_metrics(y_true, y_pred, type_dataset):
    r2 = metrics.r2_score(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_true, y_pred)

    print(f"Type Dataset - {type_dataset}")
    print(f"R2 = {r2}")
    print(f"MSE = {mse}")
    print(f"MAE = {mae}")
    print(f"MAPE = {mape}")

In [39]:
y_test_pred = cat_model.predict(df_test[features])

In [40]:
get_metrics(df_test[target], y_test_pred, type_dataset='Catboost model')

Type Dataset - Catboost model
R2 = 0.9509853097971845
MSE = 45966.989618831605
MAE = 171.31919692367705
MAPE = 0.12432293487420971


In [None]:
pd.DataFrame.from_dict({
'R2': [0.8893558797216421, 0.9388947011131245, 0.9509853097971845],
'MSE': [103764.34304032168, 57305.81235882772, 45966.989618831605],
'MAE': [254.21714062503173, 192.64218750442024, 171.31919692367705],
'MAPE': [0.19453890446431574, 0.13582711048605187, 0.12432293487420971]
    }, orient='index', columns=['linear_reg', 'segm_linear_reg', 'catboost_reg'])

Unnamed: 0,linear_reg,segm_linear_reg,catboost_reg
R2,0.889356,0.938895,0.950985
MSE,103764.34304,57305.812359,45966.989619
MAE,254.217141,192.642188,171.319197
MAPE,0.194539,0.135827,0.124323
