In [2]:
# IMPORT LIBRARIES
import pandas as pd
import os
import numpy as np
import pickle
import plotly.express as px
from matplotlib import pyplot as plt
from scipy import stats
import seaborn as sns
from scipy.stats import skew, kurtosis
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import optuna
import time

In [3]:
df_ml = pd.read_csv(r"C:\Users\faval\Desktop\Ironhack\DataAnalytics\final_project\data\clean\transformed_idealista_output_etl.csv", low_memory=False)

In [4]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [None]:
df_ml.info()

In [None]:
df_ml["ad_price_cap_log"].describe()

## MACHINE LEARNING MODELS

**CHECKING NORMAL DISTRIBUTION**

In [None]:
# Histogram
plt.figure(figsize=(10, 5))
sns.histplot(df_ml['ad_price_cap_log'], kde=True)
plt.title('Histogram of ad_price_cap_log')
plt.xlabel('ad_price_cap_log')
plt.ylabel('Frequency')
plt.show()

# Q-Q Plot
plt.figure(figsize=(6, 6))
stats.probplot(df_ml['ad_price_cap_log'], dist="norm", plot=plt)
plt.title('Q-Q Plot')
plt.show()

In [None]:
# Shapiro-Wilk Test
shapiro_test = stats.shapiro(df_ml['ad_price_cap_log'].dropna())
print(f"Shapiro-Wilk Test: Statistic={shapiro_test.statistic}, p-value={shapiro_test.pvalue}")

In [None]:
# Kolmogorov-Smirnov Test
ks_test = stats.kstest(df_ml['ad_price_cap_log'].dropna(), 'norm', args=(df_ml['ad_price_cap_log'].mean(), df_ml['ad_price_cap_log'].std()))
print(f"Kolmogorov-Smirnov Test: Statistic={ks_test.statistic}, p-value={ks_test.pvalue}")

In [None]:
# Anderson-Darling Test
ad_test = stats.anderson(df_ml['ad_price_cap_log'].dropna(), dist='norm')
print(f"Anderson-Darling Test: Statistic={ad_test.statistic}, Critical Values={ad_test.critical_values}, Significance Levels={ad_test.significance_level}")

NOT NORMAL DISTRIBUTION

**TRAIN/TEST SPLIT**

In [8]:
# Identify your target variable and features
X = df_ml.drop(columns=['ad_price_cap_log'])  # Features
y = df_ml['ad_price_cap_log']  # Target variable
# Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

**1. LINEAR REGRESSION**

In [10]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
y_pred_linear_real = np.e

In [12]:
# Evaluate Linear Regression
print("Linear Regression Metrics:")
print("MAE:", mean_absolute_error(y_test, y_pred_linear))
print("MSE:", mean_squared_error(y_test, y_pred_linear))
print("R²:", r2_score(y_test, y_pred_linear))

Linear Regression Metrics:
MAE: 0.45429428975351177
MSE: 0.4837013595097949
R²: 0.19695836079925344


The model's performance, as indicated by these metrics, shows that while it is able to make predictions, the accuracy and explanatory power are relatively weak.

**2. DECISION TREE**

In [14]:
decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, y_train)
y_pred_tree = decision_tree_model.predict(X_test)

In [15]:
#Evaluate model performance
print("Decision Tree Regression Metrics:")
print("MAE:", mean_absolute_error(y_test, y_pred_tree))
print("MSE:", mean_squared_error(y_test, y_pred_tree))
print("R²:", r2_score(y_test, y_pred_tree))

Decision Tree Regression Metrics:
MAE: 0.23890890652157504
MSE: 0.1404406781728445
R²: 0.7668401996540144


**3. RANDOM FOREST REGRESSION**

In [None]:
best_params = {
    'n_estimators': 148,
    'criterion': 'squared_error',  # Keep the criterion as is
    'max_depth': 32,
    'min_samples_split': 5,
    'min_samples_leaf': 1,
    'random_state': 42}
# Create and fit the RandomForestRegressor with the best parameters
rf_model = RandomForestRegressor(**best_params)
rf_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Evaluate Random Forest
print("\nRandom Forest Regression Metrics:")
print("MAE:", mean_absolute_error(y_test, y_pred_rf))
print("MSE:", mean_squared_error(y_test, y_pred_rf))
print("R²:", r2_score(y_test, y_pred_rf))

**Random Forest Regression Metrics (Default Parameters):**

MAE: 0.19312418477312485

MSE: 0.08086409575163846

R²: 0.8657493208811895

**Random Forest Regression Metrics (Best Parameters):**


MAE: 0.1948369893250351

5
MSE: 0.080270796991507

88
R²: 0.866734316270398

**4. XGBOOST MODEL**

In [18]:
xgboost_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgboost_model.fit(X_train, y_train)
y_pred_xgb = xgboost_model.predict(X_test)

In [20]:
#Evaluate model performance
print("XGBoost Regression Metrics:")
print("MAE:", mean_absolute_error(y_test, y_pred_xgb))
print("MSE:", mean_squared_error(y_test, y_pred_xgb))
print("R²:", r2_score(y_test, y_pred_xgb))

XGBoost Regression Metrics:
MAE: 0.2670721427959723
MSE: 0.12375955711987068
R²: 0.7945342189713509


Based on the metrics and interpretations, I will choose the Random Forest model for predicting apartment prices.

Its stronger performance metrics indicate that it is more capable of handling the complexities of the data and capturing non-linear relationships.

**CROSS VALIDATION**

In [None]:
rf_model = RandomForestRegressor(random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=kf, scoring='neg_mean_absolute_error')
mae_scores = -cv_scores

In [None]:
#Evaluate cross-validation results
print("Cross-Validation MAE Scores:", mae_scores)
print("Mean MAE:", mae_scores.mean())

Cross-Validation MAE Scores: [0.19300451 0.19281122 0.19070589 0.19217698 0.19240987]

Mean MAE: 0.19222169438252262

The consistency of these scores across different folds indicates that your model performs reliably on different subsets of your data, which is a good sign of robustness.

**HYPERPARAMETER TUNING FOR RANDOM FOREST**

In [None]:
def objective(trial):
    # Suggest values for the hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 1, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    
    # Create a Random Forest Regressor with suggested hyperparameters
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    
    # Fit the model
    rf_model.fit(X_train, y_train)
    
    # Predict and return the MSE
    y_pred_rf = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred_rf)
    return mse

# Create a study object
study = optuna.create_study(direction='minimize')

# Time monitoring
n_trials = 100
elapsed_times = []  # Store elapsed times for each trial

for i in range(n_trials):
    start_time = time.time()
    
    # Optimize the objective function
    study.optimize(objective, n_trials=1)
    
    elapsed_time = time.time() - start_time
    elapsed_times.append(elapsed_time)  # Store elapsed time
    
    print(f"Trial {i + 1}/{n_trials} took {elapsed_time:.2f} seconds")
    
    # Estimate total time using average elapsed time
    average_time_per_trial = sum(elapsed_times) / len(elapsed_times)
    estimated_total = average_time_per_trial * n_trials
    print(f"Estimated total time: {estimated_total / 60:.2f} minutes")

# Output the best hyperparameters
print("Best hyperparameters: ", study.best_params)
print("Best MSE: ", study.best_value)

Trial 100/100 took 202.77 seconds
Estimated total time: 276.37 minutes
Best hyperparameters:  {'n_estimators': 148, 'max_depth': 32, 'min_samples_split': 5, 'min_samples_leaf': 1}
Best MSE:  0.08027079699150788

**SAVE MODEL**

In [None]:
# Save the model with pickle
model_file_path = r"C:\Users\faval\Desktop\Ironhack\DataAnalytics\final_project\moDels\rf_model.pkl"
with open(model_file_path, "wb") as file:
    pickle.dump(rf_model, file)

**FEATURE IMPORTANCE**

In [None]:
# Step 1: Get feature importances from the trained model
feature_importances = rf_model.feature_importances_
# Step 2: Create a DataFrame to hold features along with their importance
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances})
# Step 3: Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=True)
# Step 4: Display the top features
importance_df

In [None]:
# Visualize feature importances using a bar chart
plt.figure(figsize=(12, 6))
base_color = np.array([135/255, 206/255, 235/255])  # Light blue in RGB
pastel_tones = [base_color * (0.1 * i) for i in range(10)]
# Create a horizontal bar chart with varying tones of the base pastel color
plt.barh(importance_df['Feature'][:10], importance_df['Importance'][:10], color=pastel_tones)
# Add labels and title with improved font size and style
plt.xlabel('Importance Score', fontsize=14, fontweight='bold')
plt.ylabel('Features', fontsize=14, fontweight='bold')
plt.title('Top 10 Feature Importances in Random Forest Model', fontsize=16, fontweight='bold')
# Add gridlines for better visual appeal
plt.grid(axis='x', linestyle='--', alpha=0.7)
# Show the plot
plt.show()