In [49]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

# Train an XGBoost regressor on the listing data, report the hold-out RMSE and
# print the per-feature importances of the fitted model.

# Load data
data = pd.read_csv('final_filtered_NLP_immoscout24.csv')

# Drop unwanted columns
data = data.drop(['Address', 'Title', 'Description', 'Price_per_SquareMeter'], axis=1)

# Filter out records with price under 200K
data = data[data['Price'] >= 200000]

# Define the upper and lower bounds for the outlier filter
lower_bound = np.percentile(data['Price'], 5)
upper_bound = np.percentile(data['Price'], 95)

# Filter out outliers. The explicit copy makes the column assignment below
# write into an independent frame instead of a slice of the filtered one
# (avoids pandas' SettingWithCopyWarning).
data = data[(data['Price'] >= lower_bound) & (data['Price'] <= upper_bound)].copy()

# Fill missing numeric values with the column median.
# NOTE(review): the medians (and the percentile bounds above) are computed on
# the full dataset, i.e. before the train/test split below.
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

# Define categorical and numeric features
categorical_features = ['Canton', 'condition']
numeric_features = numeric_cols.tolist()
numeric_features.remove('Price')  # Exclude the target variable

# Preprocessing for numeric and categorical data.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising at predict time (e.g. a value that only occurs
# in the test split).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define the XGBoost regressor within a pipeline
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=300, learning_rate=0.05, max_depth=5)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# Train-test split
X = data.drop('Price', axis=1)
y = data['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using RMSE. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error is deprecated and has
# been removed in recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')

# Feature importance: the transformed matrix holds the scaled numeric columns
# first, followed by the one-hot columns, so the names are rebuilt in that order.
encoder_features = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
features = numeric_features + list(encoder_features)
importances = pipeline.named_steps['regressor'].feature_importances_

# Print sorted feature importance
feature_importance_dict = dict(zip(features, importances))
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
for name, importance in sorted_features:
    print(f"{name}: {importance}")


Root Mean Squared Error: 340502.2937540621
Canton_Geneva: 0.12814883887767792
Canton_Jura: 0.10566408187150955
Canton_Zurich: 0.08730731159448624
Canton_Vaud: 0.08310278505086899
Canton_Valais: 0.05760432779788971
Canton_Solothurn: 0.056514643132686615
Canton_Zug: 0.05198739096522331
Living Space (sqm): 0.04952268302440643
Canton_Neuchatel: 0.03938717767596245
Canton_Schwyz: 0.03513162210583687
Canton_Basel-Stadt: 0.03308979049324989
Canton_Graubuenden: 0.032897867262363434
Canton_Fribourg: 0.029464757069945335
Canton_Basel-Landschaft: 0.024899912998080254
Canton_Bern: 0.02226150780916214
Canton_Lucerne: 0.01895548216998577
Canton_Nidwalden: 0.017644226551055908
Canton_Aargau: 0.014963408000767231
Canton_Glarus: 0.012124825268983841
Canton_Obwalden: 0.011041978374123573
Canton_Schaffhausen: 0.010592866688966751
Canton_St-Gallen: 0.01051325537264347
condition_renovated: 0.009547403082251549
Canton_Ticino: 0.008322064764797688
Canton_Thurgau: 0.008267305791378021
condition_new: 0.0082237



In [48]:
import pandas as pd

# Load the canton-enriched listings; the file is ';'-separated
data = pd.read_csv('cantons_enriched_no_blanks_immoscout24.csv', sep=';')

# Select the rows whose price text, ignoring surrounding whitespace,
# is exactly 'Price on request'
stripped_price = data['Price'].str.strip()
price_on_request_records = data.loc[stripped_price == 'Price on request']

# Write the selected rows to their own ';'-separated CSV, without the index
price_on_request_records.to_csv('price_on_request_records.csv', sep=';', index=False)

print("Price on request records extracted and saved successfully.")


Price on request records extracted and saved successfully.


In [69]:
import numpy as np  # needed for np.inf / np.nan below; previously only available via another cell
import pandas as pd

# Clean the enriched "price on request" records: parse the room count and
# living space out of their text columns, drop implausible living spaces and
# rows without a usable station distance, then save the result.

# Read the enriched CSV file (parsed with pandas' default ',' delimiter)
data = pd.read_csv('enriched_price_on_request_records.csv')

# Clean room sizes: keep the first integer or decimal number found in the text
data['Rooms'] = data['Rooms'].str.extract(r'(\d+\.\d+|\d+)').astype(float)

# Clean area sizes: keep the first run of digits
data['Living Space (sqm)'] = data['Living Space (sqm)'].str.extract(r'(\d+)').astype(float)  # Convert to float first to handle NaN

# Fill missing values with 0 (you can replace 0 with any other appropriate value)
# NOTE(review): a 0 sqm listing passes the range filter below and is saved as
# a real value - confirm this is the intended treatment of missing areas.
data['Living Space (sqm)'] = data['Living Space (sqm)'].fillna(0)

# Remove extreme values
lower_bound = 0  # Define lower bound for living space
upper_bound = 1000  # Define upper bound for living space
# The explicit copy makes the column assignments below write into an
# independent frame instead of a slice of the unfiltered one.
data = data[(data['Living Space (sqm)'] >= lower_bound) & (data['Living Space (sqm)'] <= upper_bound)].copy()

# Convert to integer type
data['Living Space (sqm)'] = data['Living Space (sqm)'].astype(int)

# Convert 'Nearest Station Distance (m)' to numeric, coerce errors will replace non-numeric values with NaN
data['Nearest Station Distance (m)'] = pd.to_numeric(data['Nearest Station Distance (m)'], errors='coerce')

# Replace infinite values with NaN (returns a new frame rather than mutating in place)
data = data.replace([np.inf, -np.inf], np.nan)

# Drop rows where 'Nearest Station Distance (m)' is NaN
data = data.dropna(subset=['Nearest Station Distance (m)'])

# Save the cleaned dataset to a new ';'-separated CSV file
data.to_csv('cleaned_enriched_price_on_request_records.csv', index=False, sep=';')

print("Dataset cleaned and saved successfully.")


Dataset cleaned and saved successfully.


In [70]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Predict prices for the cleaned "price on request" records with a pipeline
# fitted on the training split (X_train / y_train come from the training cell).

# Load cleaned data
data_to_predict = pd.read_csv('cleaned_enriched_price_on_request_records.csv', delimiter=';')

# Define categorical and numeric features.
# The pipeline is fitted on X_train and applied to data_to_predict, so a
# numeric feature is only usable when it is numeric in BOTH frames. Deriving
# the list from the prediction frame alone breaks the fit as soon as that
# file carries a numeric column the training data does not have.
categorical_features = ['Canton', 'condition']
train_numeric = set(X_train.select_dtypes(include=['int64', 'float64']).columns)
numeric_features = [
    col
    for col in data_to_predict.select_dtypes(include=['int64', 'float64']).columns
    if col in train_numeric and col != 'Price'  # 'Price' is the target, never a feature
]

# Make it visible when the prediction model uses fewer features than the
# training data offers, instead of silently fitting a different model.
unused_train_features = sorted(train_numeric - set(numeric_features) - {'Price'})
if unused_train_features:
    print(f"Warning: numeric training features not usable for prediction: {unused_train_features}")

# Preprocessing for numeric and categorical data.
# handle_unknown='ignore' encodes a Canton/condition value that never occurred
# in the training split as all zeros instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define the XGBoost regressor within a pipeline (same hyper-parameters as during training)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=300, learning_rate=0.05, max_depth=5)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# Fit the pipeline on the training split (no stored model is loaded here)
pipeline.fit(X_train, y_train)

# Make predictions
predictions = pipeline.predict(data_to_predict)

# Add predictions to the DataFrame
data_to_predict['Predicted Price'] = predictions

# Save the predictions to a new CSV file
data_to_predict.to_csv('predicted_prices.csv', index=False)

print("Predictions saved successfully.")


Predictions saved successfully.
