In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Treat the parent of the current working directory as the project root and
# register it on sys.path (only once), so packages located there can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
root_already_registered = project_root in sys.path
if not root_already_registered:
    sys.path.append(project_root)

# --- Import Custom Modules ---
# Project-local helpers: the dataset loader plus the preprocessing and
# splitting routines. The later cells call all three, so a failed import has to
# stop the run here; otherwise it resurfaces further down as a NameError that
# hides the real cause.
try:
    from src.utils import load_data
    from src.data_preprocessing import preprocess_data, split_data
except ImportError as e:
    # Keep the original hint for the reader, then propagate the real error.
    print(f"Import Error: Double check that src/utils.py has the function 'load_data' (not load_csv). {e}")
    raise
else:
    print("Project modules imported successfully.")

# Set plot style
# 'seaborn-v0_8' is only registered by matplotlib >= 3.6. Older releases ship
# the same sheet under the name 'seaborn', so fall back to that name there
# instead of letting plt.style.use raise OSError.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Load data
# Fetch the dataset through the project's loader (called with its defaults),
# report its dimensions, and leave a preview as the cell's displayed value.
df = load_data()
loaded_shape = df.shape
print(f"Data loaded successfully: {loaded_shape}")
df.head()

In [None]:
# Preprocessing
# Run the project's preprocessing step on the loaded frame. It yields three
# objects, unpacked as X, y and pipeline (presumably features, target and the
# fitted transformer - confirm against src/data_preprocessing).
print("Preprocessing data...")
preprocessed = preprocess_data(df)
X, y, pipeline = preprocessed
processed_shape = X.shape
print(f"Data shape after preprocessing: {processed_shape}")

In [None]:
# Split data
# Partition X and y with the project's split helper (using its default
# settings) and report the dimensions of each feature partition.
data_partitions = split_data(X, y)
X_train, X_test, y_train, y_test = data_partitions
train_shape = X_train.shape
test_shape = X_test.shape
print(f"Training set size: {train_shape}")
print(f"Test set size: {test_shape}")

In [None]:
# Baseline Model: Linear Regression
# Fit a linear regression baseline on the training split.
print("Training Linear Regression model...")
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# NOTE: this error is measured on the same rows the model was fitted on, so it
# is an in-sample (optimistic) figure. The printed label says so explicitly, so
# it is not read as comparable with the test-set RMSE reported at the end of
# the notebook.
train_preds = lin_reg.predict(X_train)
lin_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
print(f"Linear Regression RMSE (train): {lin_rmse:.2f}")

In [None]:
# Random Forest Model
# Fit a 100-tree random forest on the training split; random_state is fixed so
# repeated runs build the same forest.
print("Training Random Forest model...")
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X_train, y_train)

# NOTE: this is the error on the training rows themselves. A random forest can
# fit its training data very closely, so the number is labelled as a training
# figure and should not be taken as an estimate of generalisation error; the
# test-set RMSE is computed separately later in the notebook.
forest_train_preds = forest_reg.predict(X_train)
forest_rmse = np.sqrt(mean_squared_error(y_train, forest_train_preds))
print(f"Random Forest RMSE (train): {forest_rmse:.2f}")

In [None]:
# Final Evaluation on Test Set
# Score the fitted random forest on the test partition: predict, take the mean
# squared error against y_test, then its square root.
print("Final evaluation on test set...")

final_predictions = forest_reg.predict(X_test)

final_mse = mean_squared_error(
    y_test,
    final_predictions,
)
final_rmse = np.sqrt(final_mse)

print(f"Final RMSE on test set: {final_rmse:.2f}")

In [None]:
# Visualization
# Scatter of test-set targets against the forest's predictions. The dashed
# diagonal marks where predicted == actual over the range of y_test.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, final_predictions, alpha=0.1, color='blue')

actual_low, actual_high = y_test.min(), y_test.max()
ax.plot([actual_low, actual_high], [actual_low, actual_high], 'k--', lw=2)

ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted House Prices')
ax.grid(True)
plt.show()