In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
data = pd.read_csv('filtered_immoscout24.csv', delimiter=';')

# Select features and the new target
features = ['Rooms', 'Living Space (sqm)', 'Nearest Station Distance (m)', 'Canton']
target = 'Price'  # Adjusting the target to 'Price'

X = data[features]
y = data[target]

# Define preprocessing for categorical data
categorical_features = ['Canton']
numeric_features = ['Rooms', 'Living Space (sqm)', 'Nearest Station Distance (m)']

one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot, categorical_features),
        ('num', 'passthrough', numeric_features)
    ])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Setup the regression pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the model
model_pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = model_pipeline.predict(X_test)

# Calculate performance metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'Root Mean Squared Error: {rmse}')
print(f'R^2 Score: {r2}')


Root Mean Squared Error: 895117.4219185761
R^2 Score: 0.45701817919119037


Understanding the Metrics
Root Mean Squared Error (RMSE) of 898,415: This is a very high value, suggesting that on average, the predictions of the model are off by nearly 898,415 units from the actual prices. This level of error can be considered large, but it's important to compare this against the range and average of the actual prices in your dataset to better contextualize it.
R^2 Score of 0.453: This score indicates that approximately 45.3% of the variance in the Price is explained by the model. While not extremely low, this score suggests there is significant room for improvement, as more than half of the variance is still unaccounted for by the model.

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define the pipeline with RandomForestRegressor
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Define a grid of parameters to search
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(model_pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

# Predict using the best model
y_pred = best_model.predict(X_test)

# Calculate performance metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'Best Model RMSE: {rmse}')
print(f'Best Model R^2 Score: {r2}')


Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=100; total time=   2.6s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=100; total time=   3.5s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=100; total time=   2.6s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=200; total time=   7.4s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=200; total time=   8.8s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=1, regressor__n_estimators=200; total time=   2.3s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=2, regressor__n_estimators=100; total time=   0.9s
[CV] END regressor__max_depth=None, regressor__min_samples_leaf=2, regressor__n_estimators=100; total time=   0.7s
[CV] END regressor_

Analysis of the Results
Root Mean Squared Error (RMSE) of 674,346: This still indicates a sizable average error in predictions, but it's much reduced from the previous RMSE values. It's important to contextualize this value within the range and scale of house prices in your dataset. If house prices typically range in the millions, this RMSE might be more acceptable.
R^2 Score of 0.691: This score has improved markedly from earlier models, suggesting that the model is capturing a significant portion of the variability in the house prices. An R^2 Score closer to 1.0 is ideal, but 0.691 is a robust score, particularly for real-world data which can be noisy and unpredictable.

In [19]:
from xgboost import XGBRegressor

# Define the pipeline with XGBRegressor
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb_regressor', XGBRegressor(objective='reg:squarederror', n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42))
])

# Fit the model
xgb_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_pipeline.predict(X_test)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f'XGB Model RMSE: {rmse_xgb}')
print(f'XGB Model R^2 Score: {r2_xgb}')


ModuleNotFoundError: No module named 'xgboost'