In [1]:
# pandas
import pandas as pd
from pandas import Series,DataFrame
from datetime import datetime as datetime
import re as re
# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
df = pd.read_csv('Bookings per stashpoint prediciton.csv')  
df['activated'] = pd.to_datetime(df['activated'] , errors='coerce')

# Filter to only include stashpoints in the UK (country == 'GBR')
df = df[df['country'] == 'GBR'].copy()

df.head()

Unnamed: 0,stashpoint_id,bookings_90d,business_name,country,type,capacity,activated,sps_within_500m,units_supply_within_500m,review_count_90d,avg_review_rating_90d,searches_last_90
3,21ee810bc764,2,Foodwise Express,GBR,convenience_store,50,2024-11-06,24,1355,0,,7958
5,6f169b288d15,3,Foodwise Local,GBR,convenience_store,10,2025-05-29,24,1395,0,,7953
9,08b673bc5584,113,Europa House Hotel,GBR,hotel,200,2023-07-26,23,1156,4,4.75,4388
11,5c806eaca40c,11,Sonic Paddington Souvenirs,GBR,luggage_storage_shop,100,2025-03-05,23,1255,0,,7925
13,b612231f182c,3,Mail Boxes Etc,GBR,postal_shop,50,2020-05-25,23,1305,0,,7946


In [3]:
print(df.columns.tolist()) 

['stashpoint_id', 'bookings_90d', 'business_name', 'country', 'type', 'capacity', 'activated', 'sps_within_500m', 'units_supply_within_500m', 'review_count_90d', 'avg_review_rating_90d', 'searches_last_90']


In [4]:
# list of unique locations type 
#print(df['type'].value_counts())
# Group some of them 

type_mapping = {
    # Hospitality
    'hotel': 'hospitality',
    'hostel': 'hospitality',
    'premier_inn_hotel': 'hospitality',
    'staycity_hotel': 'hospitality',
    'wilde_aparthotel': 'hospitality',
    'yha_hostel': 'hospitality',
    'marriott_courtyard_hotel': 'hospitality',
    'concordia_boutique_hotel': 'hospitality',
    'sofitel_hotel': 'hospitality',
    'ihg_crowne_plaza_hotel': 'hospitality',

    # Retail
    'shop': 'retail',
    'convenience_store': 'retail',
    'tech_shop': 'retail',
    'postal_shop': 'retail',
    'luggage_storage_shop': 'retail',
    'rental_shop': 'retail',
    'souvenir_shop': 'retail',
    'newsagent': 'retail',
    'smoke_shop': 'retail',

    # Food & Beverage
    'food_and_beverage_business': 'food',
    'restaurant': 'food',
    'cafe': 'food',

    # Services
    'office': 'services',
    'health_beauty_business': 'services',
    'laundry_business': 'services',

    # Utility
    'car_park': 'utility',

    # Locker
    'locker': 'locker'
}

# Apply grouping
df['type_grouped'] = df['type'].map(type_mapping).fillna('other')
print(df['type_grouped'].value_counts())

type_grouped
locker         952
retail         380
hospitality    378
services       120
food            99
utility          1
Name: count, dtype: int64


In [5]:
# One-hot encode 'type_grouped' and drop the original column
df = pd.get_dummies(df, columns=['type_grouped'], prefix='type', drop_first=True)

In [6]:
# Drop non needed 
df.drop(['country', 'type'], axis=1, inplace=True)

In [7]:
today = pd.Timestamp.today()
df['age_in_days'] = (today - df['activated']).dt.days


In [8]:
df.drop(['review_count_90d', 'avg_review_rating_90d'], axis=1, inplace=True)

In [9]:
import matplotlib.pyplot as plt

# Compute correlations with target
correlations = df.corr(numeric_only=True)['bookings_90d'].drop('bookings_90d').sort_values()
correlations

type_locker                -0.201632
type_services              -0.044562
type_utility                0.017603
type_retail                 0.038647
age_in_days                 0.213924
capacity                    0.215565
type_hospitality            0.261021
sps_within_500m             0.298497
units_supply_within_500m    0.334850
searches_last_90            0.488269
Name: bookings_90d, dtype: float64

In [10]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Select features and target 
X = df.drop(columns=['stashpoint_id', 'bookings_90d', 'activated'], errors='ignore')
X = X.select_dtypes(include=['number'])  # Keep only numeric columns
y = df['bookings_90d']
# Standardize numeric features
X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

# Split into train/test (optional for now)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Fit Ridge regression
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Get coefficients
coefficients_ridge = pd.Series(model.coef_, index=X.columns).sort_values()
coefficients_ridge

units_supply_within_500m    -1.387524
sps_within_500m             -0.622808
age_in_days                  8.930572
capacity                    10.942213
searches_last_90            57.245503
dtype: float64

In [11]:
# Linear regression 
# Remove the engineered variables
# Use y_log = np.log1p(bookings_90d) to handle skew in the target
# Keep raw features only, like searches_last_90, sps_within_500m, capacity, etc.

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# 1. Define target (log-transformed)
#y_log = np.log1p(df['bookings_90d'])
# Keep original target 
y = df['bookings_90d']

# 2. Define features — drop engineered vars
X = df.drop(columns=[
    'stashpoint_id', 'bookings_90d', 'activated',
    'competition_density', 'search_to_supply_ratio'
], errors='ignore')

X = X.select_dtypes(include='number')  # Keep only numeric columns

# 3. Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_log, test_size=0.2, random_state=42)

# 5. Fit linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

# 6. Predict and reverse log
y_pred_log = lr.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

# 7. Evaluate
r2 = r2_score(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print(f'Linear Regression (no engineered vars, not-log target) R²: {r2:.3f}')
print(f'Linear Regression (no engineered vars, not-log  target) RMSE: {rmse:.2f}')

Linear Regression (no engineered vars, not-log target) R²: 0.028
Linear Regression (no engineered vars, not-log  target) RMSE: 105.70


In [13]:
### LINEAR REGRESSION WITH LOG(Y) 
#y_log = np.log1p(df['bookings_90d'])
#Linear Regression (no engineered vars, log target) R²: 0.028
#Linear Regression (no engineered vars, log target) RMSE: 105.70

#Underfits the data.
#Likely misses critical nonlinearities and interactions.
#Suffers from removing features that carried strong signal (search_to_supply_ratio, etc.).
#Removed engineered variables	Lost meaningful transformations of raw data.
#Pure linear model	Can't model non-linear relationships (e.g., saturation, thresholds).
#Log target + linear fit	Doesn't help much if features themselves need transformation.

### LINEAR REGRESSION WITH Y
#R²: 0.028 → The model explains only ~2.8% of the variance.
#RMSE: 105.70 → Worse than Random Forest (RMSE ~80), meaning high prediction error.

#Nonlinear relationships (e.g., searches or age have diminishing/multiplicative effects).
#Interactions that linear models can't naturally capture.


## ABORT LINEAR REG altogether 

