In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

In [2]:
# df_old_analysis = pd.read_csv(r"..\1_Data\df_analysis.csv")
df_original = pd.read_csv(r"..\1_Data\df_cleaned.csv")
df_analysis = df_original.copy()
df_ML = df_analysis.copy()

In [3]:
df_ML.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37583 entries, 0 to 37582
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   fullAddress            37583 non-null  object 
 1   postcode               37583 non-null  object 
 2   country                37583 non-null  object 
 3   outcode                37583 non-null  object 
 4   latitude               37583 non-null  float64
 5   longitude              37583 non-null  float64
 6   bathrooms              37583 non-null  float64
 7   bedrooms               37583 non-null  float64
 8   floorAreaSqM           37583 non-null  float64
 9   livingRooms            37583 non-null  float64
 10  tenure                 37583 non-null  object 
 11  propertyType           37583 non-null  object 
 12  currentEnergyRating    37583 non-null  object 
 13  soldYear               37583 non-null  int64  
 14  soldT                  37583 non-null  object 
 15  so

In [4]:
df_ML.describe()

Unnamed: 0,latitude,longitude,bathrooms,bedrooms,floorAreaSqM,livingRooms,soldYear,soldPrice,sqmPrice,in_conservation_area,sqm_approx
count,37583.0,37583.0,37583.0,37583.0,37583.0,37583.0,37583.0,37583.0,37583.0,37583.0,37583.0
mean,51.509197,-0.11734,1.485299,2.382833,100.520661,1.237368,2023.328473,811916.9,7616.738752,0.289972,100.511668
std,0.056477,0.089292,0.733059,1.2908,60.835119,0.611463,0.469664,921448.8,3867.971948,0.453755,60.871258
min,51.386653,-0.347055,1.0,1.0,10.0,0.0,2023.0,10000.0,66.23,0.0,10.0
25%,51.465565,-0.184005,1.0,1.0,59.0,1.0,2023.0,375000.0,5460.325,0.0,60.0
50%,51.501828,-0.120158,1.0,2.0,78.0,1.0,2023.0,530000.0,6854.84,0.0,80.0
75%,51.549781,-0.054122,2.0,3.0,129.0,1.0,2024.0,905000.0,8750.0,1.0,130.0
max,51.665454,0.138188,8.0,9.0,500.0,7.0,2024.0,22500000.0,68613.33,1.0,500.0


In [9]:
# A. Convert Rooms to Integers
room_cols = ['bathrooms', 'bedrooms', 'livingRooms']
df_ML[room_cols] = df_analysis[room_cols].astype(int)

# B. Label Encode Energy Rating (Ordinal: A=7, G=1, NotRated=0)
energy_map = {'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3, 'F': 2, 'G': 1, 'NotRated': np.nan}
df_ML['energy_encoded'] = df_ML['currentEnergyRating'].map(energy_map)

# C. One-Hot Encoding for categorical columns
# Drop the columns not needed
df_ML = df_ML.drop(columns=['soldT'])
df_ML = pd.get_dummies(df_ML, columns=['propertyType', 'tenure', 'construction_age_band'], drop_first=True, prefix=['prop', 'tenure', 'age'])

# D. Define features (X) and targets (y)
X = df_ML.drop(columns=['soldPrice']) 
y = np.log1p(df_ML['soldPrice'])

KeyError: "['soldT'] not found in axis"

In [6]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Target Encode Outcode (using median sqmPrice)
outcode_medians = X_train.groupby('outcode')['sqmPrice'].median()
global_median = X_train['sqmPrice'].median()
X_train['neighborhood_value'] = X_train['outcode'].map(outcode_medians)

X_test['neighborhood_value'] = X_test['outcode'].map(outcode_medians)
X_test['neighborhood_value'] = X_test['neighborhood_value'].fillna(global_median)

# E. Select final features (excluding the original strings and size_bucket)
final_features = [
    'neighborhood_value', 'bathrooms', 'bedrooms', 'floorAreaSqM', 
    'livingRooms', 'energy_encoded', 'tenure_Freehold', 'in_conservation_area', 'latitude', 'longitude'
] + [col for col in df_ML.columns if 'propertyType_' in col or 'age' in col.lower()]

X_train = X_train[final_features].astype(float)
X_test = X_test[final_features].astype(float)

In [8]:
# 1. Initialize XGBoost Regressor
# These parameters are "safe defaults" for regression
xg_reg = xgb.XGBRegressor(
    objective ='reg:squarederror', 
    n_estimators=500,        # More trees than Random Forest
    learning_rate=0.05,      # Learn slower but better
    max_depth=6,             # Tree depth
    n_jobs=-1,
    random_state=42
)

# 2. Train
print("Training XGBoost")
xg_reg.fit(X_train, y_train)

# 3. Predict
y_pred_log = xg_reg.predict(X_test)
# Inverse log transform
y_test = np.expm1(y_test)
y_pred = np.expm1(y_pred_log)

# 4. Calculate Metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("-" * 30)
print("XGBoost Results (Cleaned Data):")
print(f"R² Score: {r2:.4f}")
print(f"RMSE:     £{rmse:,.0f}")
print(f"MAE:      £{mae:,.0f}")
print(f"MAPE:     {mape:.2%}")
print("-" * 30)

Training XGBoost


  result = getattr(ufunc, method)(*inputs, **kwargs)


ValueError: Input contains infinity or a value too large for dtype('float64').