In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_squared_log_error, root_mean_squared_error

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop columns that won't be used
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Correctly handle missing values without inplace=True
data['Rooms'] = data['Rooms'].fillna(data['Rooms'].median())
data['Living Space (sqm)'] = data['Living Space (sqm)'].fillna(data['Living Space (sqm)'].median())
data['Nearest Station Distance (m)'] = data['Nearest Station Distance (m)'].fillna(data['Nearest Station Distance (m)'].median())

# Convert categorical 'Canton' and 'Condition' to numeric using OneHotEncoder
categorical_features = ['Canton', 'condition']
numeric_features = ['Rooms', 'Living Space (sqm)', 'Nearest Station Distance (m)', 'city_center', 'garden', 'terrace', 'view', 'luxus']

# Create a Column Transformer to handle the preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Separate features and target variable
X = data.drop('Price', axis=1)
y = data['Price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model using root mean squared error directly
rmse = root_mean_squared_error(y_test, y_pred)
print(f'Root Mean Squared Error: {rmse}')

# Get feature importances from the model
feature_importances = model.named_steps['regressor'].feature_importances_
# Correct method to get the names after transformation
encoded_features = model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
features = numeric_features + list(encoded_features)

# Print feature importance
feature_importance_dict = dict(zip(features, feature_importances))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
print("Feature Importances:")
for name, importance in sorted_features:
    print(f"{name}: {importance}")


Root Mean Squared Error: 402453.34764314984
Feature Importances:
Living Space (sqm): 0.3966270708717055
Nearest Station Distance (m): 0.2072482481197514
Canton_Geneva: 0.05450423177271926
Canton_Zurich: 0.03841888647470974
Canton_Jura: 0.027871856611456268
Rooms: 0.025605411681375884
Canton_Vaud: 0.025386318815425114
view: 0.016911000625505067
luxus: 0.015921392201184506
terrace: 0.0154199835136841
Canton_Graubuenden: 0.014999540251578616
Canton_Zug: 0.014575490569795619
garden: 0.013706512389831251
Canton_Lucerne: 0.0094502350551034
condition_new: 0.009190214487481072
Canton_Schwyz: 0.008895067707785925
Canton_Valais: 0.008888882531073745
city_center: 0.008709061614484708
Canton_Basel-Landschaft: 0.008303244169679945
condition_old: 0.008113819385105043
Canton_Basel-Stadt: 0.007779295833760928
Canton_Neuchatel: 0.007066309940448477
Canton_Ticino: 0.0066858251204269074
Canton_Solothurn: 0.006435564837955883
condition_renovated: 0.00615297168917211
Canton_Aargau: 0.005954963895703494
Can

In [12]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, root_mean_squared_error

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['Canton', 'condition']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the XGBoost regressor within a pipeline
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=300, learning_rate=0.05, max_depth=5)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE
rmse = root_mean_squared_error(y_test, y_pred)
print(f'Root Mean Squared Error: {rmse}')

# Feature importance (requires handling of feature names post OneHotEncoding)
encoder_features = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
features = numeric_features + list(encoder_features)
importances = pipeline.named_steps['regressor'].feature_importances_

# Print sorted feature importance
feature_importance_dict = dict(zip(features, importances))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
for name, importance in sorted_features:
    print(f"{name}: {importance}")


Root Mean Squared Error: 405203.8203927445
Canton_Geneva: 0.13125596940517426
Canton_Jura: 0.12096162140369415
Canton_Zurich: 0.09513138234615326
Canton_Vaud: 0.0690147802233696
Living Space (sqm): 0.058032602071762085
Canton_Solothurn: 0.05625741928815842
Canton_Zug: 0.05055336654186249
Canton_Valais: 0.048523418605327606
Canton_Neuchatel: 0.03652404993772507
Canton_Lucerne: 0.03407268226146698
Canton_Graubuenden: 0.03136114403605461
Canton_Basel-Stadt: 0.03088955581188202
Canton_Schwyz: 0.030306942760944366
Canton_Bern: 0.02420995756983757
Canton_Fribourg: 0.024197252467274666
Canton_Basel-Landschaft: 0.018652116879820824
Canton_Glarus: 0.017911778762936592
Canton_Obwalden: 0.01433262974023819
Canton_St-Gallen: 0.014281773939728737
condition_renovated: 0.012157050892710686
Canton_Aargau: 0.011234627105295658
Canton_Ticino: 0.010701412335038185
Nearest Station Distance (m): 0.008554065600037575
Canton_Nidwalden: 0.008292359299957752
Canton_Schaffhausen: 0.007698873057961464
Canton_App

In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error

# Assuming y_test are the actual values and y_pred are the model's predictions

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.2f}")

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")

# Calculate Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # or use root_mean_squared_error
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2:.2f}")

# Calculate Adjusted R-squared
n = len(y_test)  # number of data points
p = X_test.shape[1]  # number of predictors
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print(f"Adjusted R-squared: {adj_r2:.2f}")

# Calculate Mean Squared Logarithmic Error (MSLE)
# Ensure no zero or negative values in predictions or actuals
msle = mean_squared_log_error(y_test, y_pred)
print(f"Mean Squared Logarithmic Error (MSLE): {msle:.2f}")


Mean Absolute Error (MAE): 292588.72
Mean Squared Error (MSE): 164190136060.88
Root Mean Squared Error (RMSE): 405203.82
R-squared: 0.60
Adjusted R-squared: 0.60
Mean Squared Logarithmic Error (MSLE): 0.10


