<a href="https://colab.research.google.com/github/amirmohammadkalateh/global_housing_market_extended/blob/main/global_housing_market_extended.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the housing-market dataset (path is relative to the current working directory)
df = pd.read_csv(
    'global_housing_market_extended.csv',
)

In [4]:
# Predictor columns used by the model; the target is the house price index
features = [
    'Rent Index',
    'Affordability Ratio',
    'Mortgage Rate (%)',
    'GDP Growth (%)',
    'Population Growth (%)',
    'Urbanization Rate (%)',
    'Construction Index',
]
X, y = df[features], df['House Price Index']

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Standardize the features: statistics are learned from the training split only
# and then re-applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled, X_test_scaled = (
    scaler.fit_transform(X_train),
    scaler.transform(X_test),
)

In [7]:
# Fit an ordinary least-squares model on the standardized training features.
# n_jobs is deliberately left at its default: per the scikit-learn documentation
# it only provides a speedup for multi-target problems (with sparse X or
# positive=True), so n_jobs=-1 had no effect on this single-target dense fit.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [8]:
# R² of the fitted model on the training split and on the held-out split
train_score, test_score = (
    model.score(features_scaled, target)
    for features_scaled, target in (
        (X_train_scaled, y_train),
        (X_test_scaled, y_test),
    )
)

In [9]:
# Report both R² scores, then each fitted coefficient next to its feature name.
# The coefficients refer to the standardized features the model was trained on.
print(f"Training R² score: {train_score:.4f}")
print(f"Testing R² score: {test_score:.4f}")
print("\nFeature importance:")
coefficient_by_feature = zip(features, model.coef_)
for feature, importance in coefficient_by_feature:
    print(f"{feature}: {importance:.4f}")


Training R² score: 0.0705
Testing R² score: -0.0038

Feature importance:
Rent Index: 2.2739
Affordability Ratio: -4.4330
Mortgage Rate (%): 0.9827
GDP Growth (%): -1.9063
Population Growth (%): 3.2055
Urbanization Rate (%): -1.2029
Construction Index: -4.1546


# ***Second attempt: feature engineering, outlier removal, and XGBoost***

In [22]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import learning_curve
import warnings
warnings.filterwarnings('ignore')

# Load the raw dataset and derive the engineered columns in one chained
# expression. assign() evaluates its keyword arguments in order, so a later
# column (Urban_Density) can build on an earlier one (Urban_Population).
df = pd.read_csv('global_housing_market_extended.csv').assign(
    GDP_per_capita=lambda d: d['GDP Growth (%)'] / (1 + d['Population Growth (%)']),
    Real_Interest=lambda d: d['Mortgage Rate (%)'] - d['Inflation Rate (%)'],
    Urban_Population=lambda d: d['Population Growth (%)'] * d['Urbanization Rate (%)'],
    Construction_GDP=lambda d: d['Construction Index'] * d['GDP Growth (%)'],
    # NOTE: this column is computed from the target (House Price Index)
    Price_to_Rent=lambda d: d['House Price Index'] / d['Rent Index'],
    Urban_Density=lambda d: d['Urban_Population'] * d['Population Growth (%)'],
)

# Candidate predictors: raw indicators plus the engineered columns
features = [
    'Rent Index',
    'Affordability Ratio',
    'Real_Interest',
    'GDP_per_capita',
    'Urban_Population',
    'Construction Index',
    'Construction_GDP',
    'Urban_Density',
    'Inflation Rate (%)',
    'Population Growth (%)',
    'Urbanization Rate (%)',
]

X, y = df[features], df['House Price Index']

# Remove outliers using IQR method with less aggressive threshold
def remove_outliers(df, threshold=2.0):
    """Drop every row that has at least one numeric value outside the IQR fences.

    For each numeric column the fences are ``Q1 - threshold * IQR`` and
    ``Q3 + threshold * IQR``, where Q1/Q3 are the 25th/75th percentiles of that
    column. Non-numeric columns are ignored when building the mask but are kept
    in the returned rows, so a frame carrying label/text columns no longer
    fails inside the quantile computation.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 2.0
        Multiplier applied to the IQR; larger values remove fewer rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all original columns, original index labels) whose
        numeric values all lie within the fences. Missing values never count
        as outliers because comparisons against NaN evaluate to False.
    """
    numeric = df.select_dtypes(include='number')
    if numeric.shape[1] == 0:
        # Nothing to measure against: every row is kept.
        return df.copy()

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - threshold * iqr
    upper_fence = q3 + threshold * iqr

    # A row is an outlier as soon as any one of its numeric cells is out of range.
    is_outlier = ((numeric < lower_fence) | (numeric > upper_fence)).any(axis=1)
    return df.loc[~is_outlier]

# Filter predictors and target together so both keep exactly the same rows
clean_data = remove_outliers(pd.concat([X, y], axis=1), threshold=2.0)
X, y = clean_data[features], clean_data['House Price Index']

# 80/20 split, stratified on the Year (cast to string) of each surviving row
year_labels = df.loc[clean_data.index, 'Year'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=year_labels,
)

# Fit a random forest inside SelectFromModel (training split only) and keep
# the columns that the selector flags
rf_selector = RandomForestRegressor(n_estimators=100, random_state=42)
selector = SelectFromModel(rf_selector, prefit=False).fit(X_train, y_train)
keep_mask = selector.get_support()
selected_features = X_train.columns[keep_mask].tolist()

# Narrow both splits to the selected columns
X_train, X_test = X_train[selected_features], X_test[selected_features]

# Standardize the selected features: fit on the training split, reuse on test
scaler = StandardScaler()
X_train_scaled, X_test_scaled = (
    scaler.fit_transform(X_train),
    scaler.transform(X_test),
)

# Hyper-parameters for the gradient-boosted regressor, collected in one place
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.05,
    'max_depth': 4,
    'min_child_weight': 2,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'random_state': 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split, then compute R² on both splits
xgb_model.fit(X_train_scaled, y_train)
train_score, test_score = (
    xgb_model.score(features_scaled, target)
    for features_scaled, target in (
        (X_train_scaled, y_train),
        (X_test_scaled, y_test),
    )
)

# 5-fold cross-validation on the training split, using the estimator's default scorer
cv_scores = cross_val_score(
    xgb_model,
    X_train_scaled,
    y_train,
    cv=5,
)

# Summary: the CV line reports the mean and twice the standard deviation
print("\nModel Performance Metrics:")
print(f"Training R² score: {train_score:.4f}")
print(f"Testing R² score: {test_score:.4f}")
print(f"Cross-validation R² scores: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")




Model Performance Metrics:
Training R² score: 0.9514
Testing R² score: -0.1030
Cross-validation R² scores: -0.5058 (+/- 0.2896)
