# Italy Rent Prediction - XGBoost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import time
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Load the cleaned rental listings for a first look at the raw table.
df_data = pd.read_csv(filepath_or_buffer='rents_clean.csv/rents_clean.csv')


In [None]:
# Count the missing values in each column to see which columns are sparsest.
df_data.isna().sum()

## 1. Data Loading & Preprocessing

In [None]:
print("üìÅ LOADING DATA")

df_data = pd.read_csv('rents_clean.csv/rents_clean.csv')

df_data.columns = ['region', 'city', 'neighborhood', 'price', 'datetime', 'parking spots',
                   'bathrooms per room', 'bathrooms', 'rooms', 'top floor', 'condition',
                   'energy class', 'sea view', 'central heating', 'area', 'furnished',
                   'balcony', 'TV system', 'external exposure', 'fiber optic', 'electric gate',
                   'cellar', 'shared garden', 'private garden', 'alarm system', 'doorman',
                   'pool', 'villa', 'entire property', 'apartment', 'penthouse', 'loft', 'attic']


# Filter by price
df_filtered = df_data[(df_data['price'] < 4000) & (df_data['price'] > 0)].copy()

# Filter by area
df_filtered = df_filtered[df_filtered['area'] > 15]

# Drop duplicates and missing values
df_filtered = df_filtered.drop_duplicates()
# Drop rows where critical info is missing
df_filtered = df_filtered.dropna(subset=['price', 'area', 'city'])

print(f"‚úÖ Loaded {len(df_filtered)} rows. Starting cleaning...")

# Fill Boolean/Count features with 0
cols_to_zero = ['central heating', 'parking spots', 'balcony', 'fiber optic', 
                'electric gate', 'cellar', 'shared garden', 'sea view', 'pool',
                'furnished', 'top floor', 'external exposure']

for col in cols_to_zero:
    if col in df_filtered.columns:
        df_filtered[col] = df_filtered[col].fillna(0)

# Fill Numeric features with Median
if 'rooms' in df_filtered.columns:
    df_filtered['rooms'] = df_filtered['rooms'].fillna(df_filtered['rooms'].median())
if 'bathrooms' in df_filtered.columns:
    df_filtered['bathrooms'] = df_filtered['bathrooms'].fillna(df_filtered['bathrooms'].median())

# Fill Categorical features with 'Unknown'
cols_to_unknown = ['energy class', 'condition', 'neighborhood']
for col in cols_to_unknown:
    if col in df_filtered.columns:
        df_filtered[col] = df_filtered[col].fillna('Unknown')

# Keep only apartments
df_filtered = df_filtered[df_filtered['apartment'] == 1]

# Remove bathroom outliers (Relaxed to 3)
df_filtered = df_filtered[df_filtered['bathrooms'] <= 3]

# Fix energy class typos
df_filtered = df_filtered[df_filtered['energy class'] != ',']

# Drop unused columns
cols_to_drop = ['TV system', 'alarm system', 'doorman', 'entire property', 
                'villa', 'penthouse', 'loft', 'attic', 'apartment', 
                'datetime', 'bathrooms per room', 'private garden']
df_filtered.drop(columns=[c for c in cols_to_drop if c in df_filtered.columns], inplace=True)

int_cols = ['bathrooms', 'rooms', 'parking spots', 'top floor', 
            'central heating', 'furnished', 'balcony', 'external exposure', 
            'fiber optic', 'electric gate', 'cellar', 'shared garden']

# Fill any tiny remaining holes with 0 before converting
for col in int_cols:
    if col in df_filtered.columns:
        df_filtered[col] = df_filtered[col].fillna(0).astype(int)

# Encoding
df_filtered = pd.get_dummies(df_filtered, columns=['energy class'], drop_first=True)

print("üöÄ Data Successfully Cleaned!")
print(df_filtered.info())

In [None]:
# 1. Price per square metre.
df_filtered['price_per_sqm'] = df_filtered['price'] / df_filtered['area']

# 2. Z-score of price per sqm within each city (how unusual is this price for
#    this city?). We group by city because €30/m² is normal in Milan but
#    extreme in a village.
#    A city with a single listing has std = NaN and a city whose listings all
#    share one price has std = 0; both produce a NaN z-score. Such rows cannot
#    be judged as outliers, so they get z = 0 and are kept. Without this, the
#    `<= 3` filter below would silently drop them because NaN <= 3 is False.
df_filtered['pps_zscore'] = (
    df_filtered.groupby('city')['price_per_sqm']
    .transform(lambda x: (x - x.mean()) / x.std())
    .fillna(0)
)

# 3. Inspect the anomalies (z-score above 3).
anomalies = df_filtered[df_filtered['pps_zscore'] > 3]

print(f"Found {len(anomalies)} anomalies.")
print(anomalies[['price', 'area', 'city', 'price_per_sqm']].sort_values('price_per_sqm', ascending=False))

# 4. Remove the anomalies.
df_filtered = df_filtered[df_filtered['pps_zscore'] <= 3]

# 5. Verify the cleaned data.
print(f"Data points after removing anomalies: {len(df_filtered)}")
print(df_filtered[['price', 'area', 'city', 'price_per_sqm']].describe())

# 6. Drop the two helper columns so they do not leak into the feature set.
df_filtered = df_filtered.drop(columns=['price_per_sqm', 'pps_zscore'])


## 2. Feature Engineering

In [None]:
# Replace the price with log(1 + price); no other column is transformed.
log_price = np.log1p(df_filtered['price'])
df_filtered['price'] = log_price

def group_rare_categories(df, col, threshold):
    """Relabel infrequent values of ``df[col]`` as ``'Other'``.

    A value is infrequent when it occurs fewer than ``threshold`` times in the
    column. The frame is modified in place and also returned.
    """
    frequency = df[col].value_counts()
    infrequent = [label for label, count in frequency.items() if count < threshold]
    is_infrequent = df[col].isin(infrequent)
    df.loc[is_infrequent, col] = 'Other'
    return df

# Collapse infrequent locations into 'Other', using a minimum count per level.
for location_col, min_count in (('region', 1000), ('city', 300), ('neighborhood', 50)):
    df_filtered = group_rare_categories(df_filtered, location_col, min_count)

# Interaction flag: 1 when a listing is both furnished and centrally heated.
both_flags = df_filtered['furnished'].eq(1) & df_filtered['central heating'].eq(1)
df_filtered['Furnished and Central Heating'] = both_flags.astype(int)

In [None]:
# GEOCODING - ALL levels (region, city, neighborhood)
print("\nüåç Loading geocoding cache...")

cache_dir = 'geocoding_cache'

def load_cache(filename):
    """Load a JSON coordinate cache from ``cache_dir``.

    Args:
        filename: Name of the JSON file inside ``cache_dir``.

    Returns:
        A dict with the file's keys. List values are converted to tuples and
        every other value (for example ``null``) is passed through unchanged.
        An empty dict is returned when the file does not exist.
    """
    path = os.path.join(cache_dir, filename)
    try:
        # Read as UTF-8 explicitly so accented place names decode the same
        # way on every platform instead of depending on the locale default.
        with open(path, 'r', encoding='utf-8') as f:
            coords = json.load(f)
    except FileNotFoundError:
        return {}
    return {k: tuple(v) if isinstance(v, list) else v for k, v in coords.items()}

region_coordinates = load_cache('region_coordinates.json')
city_coordinates = load_cache('city_coordinates.json')
neighborhood_coordinates = load_cache('neighborhood_coordinates.json')

# Fallback (latitude, longitude) used when neither the place itself nor an
# 'Other' entry is present in a cache.
DEFAULT_COORDS = (42, 12)


def _add_coordinates(df, source_col, lookup, lat_col, lon_col):
    """Add latitude/longitude columns to ``df`` from a name -> (lat, lon) dict.

    Names missing from ``lookup`` use its 'Other' entry, or DEFAULT_COORDS if
    there is none. load_cache passes non-list JSON values (such as null)
    through unchanged, so an entry may be None; it is mapped to NaN instead of
    raising TypeError on ``None[0]``.
    """
    fallback = lookup.get('Other', DEFAULT_COORDS)

    def component(name, axis):
        pair = lookup.get(name, fallback)
        return np.nan if pair is None else pair[axis]

    df[lat_col] = df[source_col].map(lambda name: component(name, 0))
    df[lon_col] = df[source_col].map(lambda name: component(name, 1))


# Region coordinates
_add_coordinates(df_filtered, 'region', region_coordinates, 'latitude', 'longitude')
df_filtered.drop('region', axis=1, inplace=True)

# City coordinates (rows without a usable city position are dropped)
_add_coordinates(df_filtered, 'city', city_coordinates, 'latitude_city', 'longitude_city')
df_filtered = df_filtered.dropna(subset=['latitude_city', 'longitude_city'])
df_filtered.drop('city', axis=1, inplace=True)

# Neighborhood coordinates
_add_coordinates(df_filtered, 'neighborhood', neighborhood_coordinates,
                 'latitude_neighborhood', 'longitude_neighborhood')
df_filtered.drop('neighborhood', axis=1, inplace=True)

print(f"‚úÖ Geocoding complete - {len(df_filtered)} rows remaining")

# One-hot encode condition
encoded_df = pd.get_dummies(df_filtered, columns=['condition'], dtype=int, drop_first=True)

In [None]:
# Derived ratio and count features built from the existing columns.
print("\nüìä Adding feature interactions")

room_count = encoded_df['rooms']

# Rooms relative to area; the +1 in the denominator avoids division by zero.
encoded_df['rooms_per_area'] = room_count.div(encoded_df['area'].add(1))

# Bathrooms relative to rooms, with the same +1 guard.
encoded_df['baths_per_room'] = encoded_df['bathrooms'].div(room_count.add(1))

# Amenity score: row-wise total of the listed amenity columns.
amenity_cols = ['balcony', 'fiber optic', 'electric gate', 'shared garden', 'external exposure']
encoded_df['amenity_score'] = encoded_df.loc[:, amenity_cols].sum(axis='columns')

print("   Added 3 interaction features")
feature_total = encoded_df.shape[1] - 1
print(f"   Total features: {feature_total}")

## 3. Training the Model

In [None]:
# Split into train and test sets, stratified on price quintiles.
print("\nüìä Stratified train/test split")

# Five equal-frequency price bins serve as the stratification labels.
y_full = encoded_df['price']
bin_labels = ['low', 'med_low', 'med', 'med_high', 'high']
price_bins = pd.qcut(y_full, q=5, labels=bin_labels)

# The target stays a single-column DataFrame; features are everything else.
y = encoded_df[['price']]
X = encoded_df.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=price_bins,
)

n_train, n_features = X_train.shape
print(f"   Training set: {n_train} samples")
print(f"   Test set: {len(X_test)} samples")
print(f"   Features: {n_features}")
print(f"   Feature names: {X_train.columns.tolist()}")

In [None]:
print("\n" + "=" * 70)
print("‚ö° XGBOOST FIXED TRAINING")
print("=" * 70)

start_time = time.time()

# Single source of truth for the tunable hyperparameters: the same dict is
# passed to the model and printed, so the log cannot drift from what is
# actually trained.
xgb_params = {
    'n_estimators': 1500,
    'max_depth': 6,
    'learning_rate': 0.03,
    'min_child_weight': 4,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'gamma': 0.5,
    'reg_lambda': 1.5,
}

xgb_model = xgb.XGBRegressor(
    **xgb_params,
    random_state=42,
    n_jobs=-1
)

print(f"\nüîß Hyperparameters:")
for param_name, param_value in xgb_params.items():
    print(f"   {param_name}: {param_value}")

# y_train is a one-column DataFrame; flatten it to a 1-D array for fitting.
xgb_model.fit(X_train, y_train.values.ravel())

elapsed_time = time.time() - start_time
print(f"\n‚úÖ Training complete in {elapsed_time:.2f} seconds")

## 4. Evaluation

In [None]:
# R² on the log-price target for both splits, and the gap between them.
xgb_train_score = xgb_model.score(X_train, y_train)
xgb_test_score = xgb_model.score(X_test, y_test)
overfitting_gap = xgb_train_score - xgb_test_score

print("\nüìà XGBoost FIXED Performance:")
print(f"   Train R¬≤: {xgb_train_score:.4f}")
print(f"   Test R¬≤:  {xgb_test_score:.4f}")
print(f"   Gap:      {overfitting_gap:.4f}")

# Predictions (still on the log1p scale)
xgb_train_pred = xgb_model.predict(X_train)
xgb_test_pred = xgb_model.predict(X_test)

# Invert the log1p transform before computing the error metrics.
y_test_euro = np.expm1(y_test.values.ravel())
xgb_test_pred_euro = np.expm1(xgb_test_pred)

xgb_test_mae = mean_absolute_error(y_test_euro, xgb_test_pred_euro)
xgb_test_rmse = np.sqrt(mean_squared_error(y_test_euro, xgb_test_pred_euro))

print("\nüí∞ Error Metrics (in Euros):")
print(f"   Test MAE:  ‚Ç¨{xgb_test_mae:.2f}")
print(f"   Test RMSE: ‚Ç¨{xgb_test_rmse:.2f}")

# Overfitting check: the first threshold the gap falls under picks the verdict.
gap_verdicts = (
    (0.05, "‚úÖ Excellent! Very low overfitting"),
    (0.08, "‚úÖ Good! Overfitting under control"),
    (0.12, "‚ö†Ô∏è  Mild overfitting"),
)
verdict = next(
    (text for limit, text in gap_verdicts if overfitting_gap < limit),
    "‚ùå Significant overfitting",
)
print(f"\n   {verdict} ({overfitting_gap:.4f})")

# Per-listing comparison of actual and predicted rent.
results = pd.DataFrame({
    'Actual': y_test_euro,
    'Predicted': xgb_test_pred_euro
})
results['Error'] = results['Actual'] - results['Predicted']
results['Abs_Error'] = results['Error'].abs()

# Show the ten largest absolute errors.
worst_first = results.sort_values('Abs_Error', ascending=False)
print(worst_first.head(10))

In [None]:
# Plot XGBoost's built-in importance chart, limited to the top 15 features.
xgb.plot_importance(booster=xgb_model, max_num_features=15)

In [None]:
# Accuracy on the expensive tail: test listings with an actual rent above 2000.
print("\n" + "=" * 70)
print("üìä HIGH RENT (>‚Ç¨2000) PERFORMANCE CHECK")
print("=" * 70)

high_rent_mask = y_test_euro > 2000
high_rent_actual, high_rent_pred = y_test_euro[high_rent_mask], xgb_test_pred_euro[high_rent_mask]

# Metrics are only defined when at least one listing falls in the segment.
if high_rent_mask.any():
    high_rent_mae = mean_absolute_error(high_rent_actual, high_rent_pred)
    high_rent_r2 = r2_score(high_rent_actual, high_rent_pred)

    print(f"   High rent samples: {len(high_rent_actual)}")
    print(f"   High rent MAE: ‚Ç¨{high_rent_mae:.2f}")
    print(f"   High rent R¬≤: {high_rent_r2:.4f}")

In [None]:
# 5-fold cross-validated R² on the training split.
print("\n" + "=" * 70)
print("üîÑ CROSS-VALIDATION")
print("=" * 70)

cv_scores = cross_val_score(
    estimator=xgb_model,
    X=X_train,
    y=y_train.to_numpy().ravel(),
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2  # reported as +/- two standard deviations
print(f"   CV R¬≤ Scores: {np.round(cv_scores, 4)}")
print(f"   Mean: {cv_mean:.4f} (+/- {cv_spread:.4f})")

In [None]:
# Rank the features by the fitted model's importance scores.
print("\n" + "=" * 70)
print("üìä FEATURE IMPORTANCE")
print("=" * 70)

feature_importance = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': xgb_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)

print("\nTop 10 Features:")
top_ten = feature_importance.head(10)
for feature_name, score in zip(top_ten['feature'], top_ten['importance']):
    print(f"   {feature_name}: {score:.4f}")


In [None]:
# Persist the fitted model together with the feature column order.
print("\n" + "=" * 70)
print("üíæ SAVING MODEL")
print("=" * 70)

os.makedirs('rent_prediction_model', exist_ok=True)
joblib.dump(xgb_model, 'rent_prediction_model/rent_model_v2.pkl')

# Store the training column names alongside the model for reference.
feature_names = X_train.columns.tolist()
serialized_names = json.dumps(feature_names, indent=2)
with open('rent_prediction_model/feature_names.json', 'w') as f:
    f.write(serialized_names)

print("‚úÖ Model saved to rent_prediction_model/rent_model_v2.pkl")
print("‚úÖ Feature names saved to rent_prediction_model/feature_names.json")

In [None]:
# VISUALIZATIONS: 2x2 grid of diagnostic plots for the test set.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Actual vs Predicted
ax1 = axes[0, 0]
ax1.scatter(y_test_euro, xgb_test_pred_euro, alpha=0.3, s=10, color='orange')
# Draw the y = x reference line out to the largest plotted value rather than a
# hard-coded 8000: prices are filtered to below 4000 during preprocessing, so
# a fixed 8000 would squeeze the points into one corner of the axes.
axis_max = max(y_test_euro.max(), xgb_test_pred_euro.max())
ax1.plot([0, axis_max], [0, axis_max], 'r--', label='Perfect')
ax1.set_xlabel('Actual Rent (‚Ç¨)')
ax1.set_ylabel('Predicted Rent (‚Ç¨)')
ax1.set_title(f'Actual vs Predicted (R¬≤ = {xgb_test_score:.4f})')
ax1.legend()

# Residuals (predicted minus actual) against the prediction
ax2 = axes[0, 1]
residuals = xgb_test_pred_euro - y_test_euro
ax2.scatter(xgb_test_pred_euro, residuals, alpha=0.3, s=10, color='orange')
ax2.axhline(y=0, color='r', linestyle='--')
ax2.set_xlabel('Predicted Rent (‚Ç¨)')
ax2.set_ylabel('Residuals (‚Ç¨)')
ax2.set_title('Residuals vs Predicted')

# Residual Distribution
ax3 = axes[1, 0]
ax3.hist(residuals, bins=50, color='orange', edgecolor='black')
ax3.axvline(x=0, color='r', linestyle='--')
ax3.set_xlabel('Residual (‚Ç¨)')
ax3.set_ylabel('Frequency')
ax3.set_title(f'Residual Distribution (Mean: ‚Ç¨{residuals.mean():.2f})')

# Feature Importance (top 15, most important at the top)
ax4 = axes[1, 1]
top_features = feature_importance.head(15)
ax4.barh(top_features['feature'], top_features['importance'], color='orange')
ax4.set_xlabel('Importance')
ax4.set_title('Top 15 Feature Importance')
ax4.invert_yaxis()

plt.suptitle('XGBoost FIXED - Model Performance', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()