In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import warnings

warnings.filterwarnings('ignore')

# Load data
df = pd.read_csv('NLP_enriched_immoscout24.csv')

# Select relevant features and target
features = df[['Title', 'Rooms', 'Living Space (sqm)', 'canton', 'Distance from nearest station (m)', 'city_center', 'garden', 'terrace', 'view', 'luxus', 'condition']]
target = df['Price']

# Preprocessing pipelines for both numeric and categorical data
numeric_features = ['Rooms', 'Living Space (sqm)', 'Distance from nearest station (m)']
categorical_features = ['Title', 'canton', 'city_center', 'garden', 'terrace', 'view', 'luxus', 'condition']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Create a pipeline that preprocesses the data and then applies a regressor
def create_pipeline(regressor):
    return Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])

# List of models to train
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42)
}

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train and evaluate models
results = {}
for name, model in models.items():
    pipeline = create_pipeline(model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'RMSE': rmse, 'R²': r2}
    print(f'{name} - RMSE: {rmse}, R²: {r2}')

# Find the best model
best_model_name = min(results, key=lambda k: results[k]['RMSE'])
best_model = models[best_model_name]
print(f'Best model: {best_model_name}')

# Optionally, perform hyperparameter tuning on the best model
param_grid = {}
if best_model_name == 'Random Forest':
    param_grid = {
        'regressor__n_estimators': [100, 200],
        'regressor__max_depth': [10, 20, None]
    }
elif best_model_name == 'Gradient Boosting':
    param_grid = {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.01, 0.1, 0.2],
        'regressor__max_depth': [3, 5, 7]
    }
elif best_model_name == 'XGBoost':
    param_grid = {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.01, 0.1, 0.2],
        'regressor__max_depth': [3, 5, 7]
    }

if param_grid:
    grid_search = GridSearchCV(create_pipeline(best_model), param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    print(f'Best parameters for {best_model_name}: {grid_search.best_params_}')
    best_pipeline = grid_search.best_estimator_
else:
    best_pipeline = create_pipeline(best_model)
    best_pipeline.fit(X_train, y_train)

# Final evaluation on the test set
final_y_pred = best_pipeline.predict(X_test)
final_rmse = mean_squared_error(y_test, final_y_pred, squared=False)
final_r2 = r2_score(y_test, final_y_pred)
print(f'Final model - RMSE: {final_rmse}, R²: {final_r2}')


Linear Regression - RMSE: 1048377.8954173692, R²: -3.313421908741585
Random Forest - RMSE: 328168.5317942891, R²: 0.5773507554563654
Gradient Boosting - RMSE: 335950.2096867705, R²: 0.5570690152593278
XGBoost - RMSE: 324795.202749959, R²: 0.5859951383625199
Best model: XGBoost
Best parameters for XGBoost: {'regressor__learning_rate': 0.2, 'regressor__max_depth': 7, 'regressor__n_estimators': 200}
Final model - RMSE: 324148.9920587084, R²: 0.5876409029743577


In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load data
file_path = 'NLP_enriched_immoscout24.csv'
data = pd.read_csv(file_path)

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['canton', 'condition', 'city_center', 'garden', 'terrace', 'view', 'luxus']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the XGBoost regressor within a pipeline
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=300, learning_rate=0.05, max_depth=5)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using root_mean_squared_error and R-squared
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')

# Feature importance (requires handling of feature names post OneHotEncoding)
encoder_features = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
features = numeric_features + list(encoder_features)
importances = pipeline.named_steps['regressor'].feature_importances_

# Print sorted feature importance
feature_importance_dict = dict(zip(features, importances))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
for name, importance in sorted_features:
    print(f"{name}: {importance}")

# Hyperparameter tuning for better performance
param_grid = {
    'regressor__n_estimators': [100, 300, 500],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Predict with the best model
y_pred_best = best_model.predict(X_test)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)
print(f'Best Model Root Mean Squared Error: {rmse_best}')
print(f'Best Model R-squared: {r2_best}')

# Feature importance of the best model
best_importances = best_model.named_steps['regressor'].feature_importances_
best_feature_importance_dict = dict(zip(features, best_importances))
sorted_best_features = sorted(best_feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
for name, importance in sorted_best_features:
    print(f"{name}: {importance}")


Root Mean Squared Error: 323783.9383847122
R-squared: 0.5885691701755038
canton_Geneva: 0.14813823997974396
canton_Jura: 0.09577798843383789
canton_Zurich: 0.09133481979370117
Living Space (sqm): 0.06614197045564651
canton_Zug: 0.053356271237134933
canton_Neuchatel: 0.04513109102845192
canton_Solothurn: 0.04482155293226242
canton_Valais: 0.04110576584935188
canton_Schwyz: 0.0392439141869545
canton_Vaud: 0.03872501477599144
canton_Basel-landschaft: 0.029934730380773544
canton_Fribourg: 0.02849418856203556
canton_Basel-stadt: 0.028417708352208138
canton_Lucerne: 0.026595642790198326
canton_Graubuenden: 0.025104302912950516
canton_Bern: 0.023697303608059883
luxus_False: 0.015972863882780075
canton_Ticino: 0.014517085626721382
canton_Nidwalden: 0.014120751991868019
canton_St-gallen: 0.013703402131795883
view_False: 0.01272311620414257
canton_Schaffhausen: 0.01177024096250534
canton_Glarus: 0.010760042816400528
canton_Obwalden: 0.010484874248504639
condition_renovated: 0.008455581963062286


In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load data
file_path = 'NLP_enriched_immoscout24.csv'
data = pd.read_csv(file_path)

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Filter out properties with prices under 200,000
data = data[data['Price'] >= 200000]

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['canton', 'condition', 'city_center', 'garden', 'terrace', 'view', 'luxus']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
# Adjust OneHotEncoder to drop 'False' categories
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='if_binary'), categorical_features)  # Drop the 'False' categories for binary features
    ])

# Define the XGBoost regressor within a pipeline
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=300, learning_rate=0.05, max_depth=5)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using root_mean_squared_error and R-squared
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')

# Feature importance (requires handling of feature names post OneHotEncoding)
encoder_features = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
features = numeric_features + list(encoder_features)
importances = pipeline.named_steps['regressor'].feature_importances_

# Print sorted feature importance
feature_importance_dict = dict(zip(features, importances))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
for name, importance in sorted_features:
    print(f"{name}: {importance}")

# Hyperparameter tuning for better performance
param_grid = {
    'regressor__n_estimators': [100, 300, 500],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

# Predict with the best model
y_pred_best = best_model.predict(X_test)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)
print(f'Best Model Root Mean Squared Error: {rmse_best}')
print(f'Best Model R-squared: {r2_best}')

# Feature importance of the best model
best_importances = best_model.named_steps['regressor'].feature_importances_
best_feature_importance_dict = dict(zip(features, best_importances))
sorted_best_features = sorted(best_feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
for name, importance in sorted_best_features:
    print(f"{name}: {importance}")


Root Mean Squared Error: 320778.8291153795
R-squared: 0.5924448548915662
canton_Geneva: 0.14742432534694672
canton_Jura: 0.1008109450340271
canton_Zurich: 0.09320199489593506
Living Space (sqm): 0.07012826949357986
canton_Zug: 0.0586274228990078
canton_Neuchatel: 0.049328550696372986
canton_Solothurn: 0.044400714337825775
canton_Valais: 0.04405668377876282
canton_Vaud: 0.041255347430706024
canton_Schwyz: 0.03397664800286293
canton_Fribourg: 0.03007218800485134
canton_Basel-stadt: 0.02975667268037796
canton_Bern: 0.02420508675277233
canton_Lucerne: 0.02143046259880066
canton_Basel-landschaft: 0.021337559446692467
canton_Graubuenden: 0.020872266963124275
luxus_True: 0.016074851155281067
canton_Nidwalden: 0.014581868425011635
canton_Obwalden: 0.012249794788658619
canton_Glarus: 0.011830893345177174
condition_renovated: 0.01088027935475111
canton_Schaffhausen: 0.010377221740782261
canton_Ticino: 0.009930566884577274
canton_St-gallen: 0.009791843593120575
view_True: 0.00801895186305046
cond