In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load data
data = pd.read_csv('new_merged_cleaned_immoscout24.csv', delimiter=';')

# Separate the target (Price) and features
X = data.drop('Price', axis=1)
y = data['Price']

# Identify categorical and numeric features
categorical_features = ['Canton']
numeric_features = ['Rooms', 'Living Space (sqm)']

# Preprocessor to handle categorical and numeric data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001, random_state=42)

# Create a pipeline with preprocessing and a random forest regressor
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Fit the model on training data
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
error = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {error}')


Mean Absolute Error: 270494.5041411638


With a Mean Absolute Error (MAE) of 440,356.68 Swiss Francs in the context of predicting real estate prices, the interpretation of the results requires careful consideration due to the substantial size of the error relative to the expected values of property prices, even in a high-value market like Switzerland. This high MAE suggests significant discrepancies between the predicted values and the actual market prices. Here’s a detailed analysis suitable for a master's level understanding:

1. Assessment of MAE
The MAE being approximately 440,356 CHF is substantial, indicating that, on average, the predictions deviate from the actual sale prices by nearly half a million Swiss Francs. This level of error could be problematic for practical applications such as investment planning, loan issuance, and real estate valuation.

In [10]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming 'data' is already loaded and preprocessor setup is available
# Example setup for preprocessor (modify according to your actual feature categories)
categorical_features = ['Canton']  # example categorical features
numeric_features = ['Rooms', 'Living Space (sqm)']  # example numeric features

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Sample data preparation (ensure you replace this with your actual data)
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline with RandomForestRegressor
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Define a grid of parameters to search
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(model_pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Predict using the best model
y_pred = best_model.predict(X_test)

# Calculate performance metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'Best Model RMSE: {rmse}')
print(f'Best Model R^2 Score: {r2}')


Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=100; total time=   5.3s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=100; total time=   5.3s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=100; total time=   5.0s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=200; total time=  10.4s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=200; total time=  10.5s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=200; total time=  10.9s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=2, regressor__n_estimators=100; total time=   4.3s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=2, regressor__n_estimators=100; total time=   4.2s
[CV] END regressor_

Analysis of the Results
Root Mean Squared Error (RMSE) of 674,346: This still indicates a sizable average error in predictions, but it's much reduced from the previous RMSE values. It's important to contextualize this value within the range and scale of house prices in your dataset. If house prices typically range in the millions, this RMSE might be more acceptable.
R^2 Score of 0.691: This score has improved markedly from earlier models, suggesting that the model is capturing a significant portion of the variability in the house prices. An R^2 Score closer to 1.0 is ideal, but 0.691 is a robust score, particularly for real-world data which can be noisy and unpredictable.

In [19]:
from xgboost import XGBRegressor

# Define the pipeline with XGBRegressor
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_regressor', XGBRegressor(objective='reg:squarederror', n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42))
])

# Fit the model
xgb_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_pipeline.predict(X_test)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f'XGB Model RMSE: {rmse_xgb}')
print(f'XGB Model R^2 Score: {r2_xgb}')


ModuleNotFoundError: No module named 'xgboost'