In [37]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Filter out records with price under 200K
data = data[data['Price'] >= 200000]

# Define the upper and lower bounds for the outlier filter
lower_bound = np.percentile(data['Price'], 5)
upper_bound = np.percentile(data['Price'], 95)

# Filter out outliers
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['Canton', 'condition']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the XGBoost regressor within a pipeline
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=300, learning_rate=0.05, max_depth=5)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')

# Feature importance (requires handling of feature names post OneHotEncoding)
encoder_features = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
features = numeric_features + list(encoder_features)
importances = pipeline.named_steps['regressor'].feature_importances_

# Print sorted feature importance
feature_importance_dict = dict(zip(features, importances))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
for name, importance in sorted_features:
    print(f"{name}: {importance}")


Root Mean Squared Error: 340502.2937540621
Canton_Geneva: 0.12814883887767792
Canton_Jura: 0.10566408187150955
Canton_Zurich: 0.08730731159448624
Canton_Vaud: 0.08310278505086899
Canton_Valais: 0.05760432779788971
Canton_Solothurn: 0.056514643132686615
Canton_Zug: 0.05198739096522331
Living Space (sqm): 0.04952268302440643
Canton_Neuchatel: 0.03938717767596245
Canton_Schwyz: 0.03513162210583687
Canton_Basel-Stadt: 0.03308979049324989
Canton_Graubuenden: 0.032897867262363434
Canton_Fribourg: 0.029464757069945335
Canton_Basel-Landschaft: 0.024899912998080254
Canton_Bern: 0.02226150780916214
Canton_Lucerne: 0.01895548216998577
Canton_Nidwalden: 0.017644226551055908
Canton_Aargau: 0.014963408000767231
Canton_Glarus: 0.012124825268983841
Canton_Obwalden: 0.011041978374123573
Canton_Schaffhausen: 0.010592866688966751
Canton_St-Gallen: 0.01051325537264347
condition_renovated: 0.009547403082251549
Canton_Ticino: 0.008322064764797688
Canton_Thurgau: 0.008267305791378021
condition_new: 0.0082237



In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Filter out records with price under 200K
data = data[data['Price'] >= 200000]

# Define the upper and lower bounds for the outlier filter
lower_bound = np.percentile(data['Price'], 5)
upper_bound = np.percentile(data['Price'], 95)

# Filter out outliers
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['Canton', 'condition']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the Random Forest regressor within a pipeline
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', rf_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 345075.5560303169




In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Filter out records with price under 200K
data = data[data['Price'] >= 200000]

# Define the upper and lower bounds for the outlier filter
lower_bound = np.percentile(data['Price'], 5)
upper_bound = np.percentile(data['Price'], 95)

# Filter out outliers
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['Canton', 'condition']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the Gradient Boosting regressor within a pipeline
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', gb_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 354100.195353484




In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Filter out records with price under 200K
data = data[data['Price'] >= 200000]

# Define the upper and lower bounds for the outlier filter
lower_bound = np.percentile(data['Price'], 5)
upper_bound = np.percentile(data['Price'], 95)

# Filter out outliers
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['Canton', 'condition']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the SVR regressor within a pipeline
svr_model = SVR(kernel='rbf')

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', svr_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 527243.8304526691




In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Filter out records with price under 200K
data = data[data['Price'] >= 200000]

# Define the upper and lower bounds for the outlier filter
lower_bound = np.percentile(data['Price'], 5)
upper_bound = np.percentile(data['Price'], 95)

# Filter out outliers
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['Canton', 'condition']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the LightGBM regressor within a pipeline
lgb_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', lgb_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000491 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 395
[LightGBM] [Info] Number of data points in the train set: 8352, number of used features: 31
[LightGBM] [Info] Start training from score 1252588.835010
Root Mean Squared Error: 355127.9651538752




In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Filter out records with price under 200K
data = data[data['Price'] >= 200000]

# Define the upper and lower bounds for the outlier filter
lower_bound = np.percentile(data['Price'], 5)
upper_bound = np.percentile(data['Price'], 95)

# Filter out outliers
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['Canton', 'condition']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the AdaBoost regressor within a pipeline
adaboost_model = AdaBoostRegressor(n_estimators=100, random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', adaboost_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 430997.0081214088




In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Filter out records with price under 200K
data = data[data['Price'] >= 200000]

# Define the upper and lower bounds for the outlier filter
lower_bound = np.percentile(data['Price'], 5)
upper_bound = np.percentile(data['Price'], 95)

# Filter out outliers
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['Canton', 'condition']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the Linear Regression model within a pipeline
linear_model = LinearRegression()

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', linear_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 383902.0481279659




In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Filter out records with price under 200K
data = data[data['Price'] >= 200000]

# Define the upper and lower bounds for the outlier filter
lower_bound = np.percentile(data['Price'], 5)
upper_bound = np.percentile(data['Price'], 95)

# Filter out outliers
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)]

# Select only numeric columns for median calculation
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['Canton', 'condition']
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Define the MLP Regression model within a pipeline
mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', mlp_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')


Root Mean Squared Error: 391491.8754797078


