# Housing Price Prediction - Model Training
This notebook trains a linear regression model to predict housing prices using the Boston Housing dataset.

## Workflow:
1. Load configuration from YAML
2. Load and prepare data
3. Train linear regression model
4. Evaluate model performance
5. Save model and metrics

# Install required packages
import subprocess
import sys

# Dependencies this notebook needs. Each one is installed with the pip that
# belongs to the running interpreter, so it lands in the active environment;
# check_call raises CalledProcessError if any single install fails.
packages = ['pyyaml', 'pandas', 'scikit-learn', 'numpy']
pip_install_prefix = [sys.executable, '-m', 'pip', 'install']
for package in packages:
    subprocess.check_call(pip_install_prefix + [package, '-q'])

In [None]:
# Import libraries
import sys
import os
import yaml
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so on a
# current install the plain import raises ImportError. Keep using the bundled
# loader when it exists; otherwise define a drop-in replacement with the same
# name so the rest of the notebook keeps working unchanged.
try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    def load_boston():
        """Fetch the Boston Housing data from its original public source.

        Fallback for scikit-learn >= 1.2, following the replacement snippet
        from scikit-learn's own deprecation notice. Requires network access.

        Returns:
            A Bunch exposing ``data`` (feature matrix), ``target`` (MEDV
            values) and ``feature_names``. Other attributes of the removed
            loader's return value (e.g. ``DESCR``) are not provided.
        """
        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        # The file starts with a 22-line description, and every record is
        # wrapped across two physical lines.
        raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
        # Even rows hold the first 11 features; odd rows hold the remaining
        # 2 features followed by the target.
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]
        feature_names = np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ])
        return Bunch(data=data, target=target, feature_names=feature_names)

# Add src to path for imports
sys.path.insert(0, '/Workspace/Repos/main/ml-regression-model')  # Adjust path for your workspace

print("Libraries imported successfully")

In [None]:
# Load configuration
# Reads the `model_config` section of the YAML file. If the file is missing,
# unreadable, malformed, or lacks the expected keys, a built-in default
# configuration is used instead so the notebook can still run end to end.
config_path = '/Workspace/Repos/main/ml-regression-model/config/model_config.yaml'

try:
    # Explicit encoding so the result does not depend on the platform locale.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    model_config = config['model_config']
    print("Configuration loaded successfully!")
    print(f"Model: {model_config['name']}")
    print(f"Version: {model_config['version']}")
except (OSError, UnicodeDecodeError, yaml.YAMLError, KeyError, TypeError) as e:
    # OSError: file missing or unreadable; UnicodeDecodeError: not UTF-8;
    # YAMLError: invalid YAML; KeyError: expected key absent;
    # TypeError: file parsed to a non-mapping (e.g. an empty file gives None).
    # Anything else is a genuine bug and is allowed to surface.
    print(f"Error loading config: {e}")
    print("Falling back to default configuration.")
    model_config = {
        'name': 'Housing Price Prediction',
        'version': '1.0.0',
        'data': {'train_test_split': 0.2, 'random_state': 42},
        'features': {'numerical_features': ['RM', 'LSTAT', 'AGE'], 'target_feature': 'MEDV'},
        'training': {'test_size': 0.2, 'random_state': 42}
    }

In [None]:
# Load Boston Housing Dataset
# Builds a DataFrame with one column per feature plus the target as 'MEDV'.
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['MEDV'] = boston.target

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()

In [None]:
# Prepare features and target
# Selects the configured feature columns that exist in `df` and extracts the
# feature matrix X and target vector y as NumPy arrays.
feature_cols = model_config['features']['numerical_features']
target_col = model_config['features']['target_feature']

# Check if all features exist in dataset
available_features = [f for f in feature_cols if f in df.columns]
missing_features = [f for f in feature_cols if f not in df.columns]

# Skipping unknown columns is kept as best-effort behaviour, but it is now
# reported so a typo in the config does not silently change the model.
if missing_features:
    print(f"Warning: configured features not found in dataset, skipping: {missing_features}")

# With no usable columns X would have zero features and training would fail
# later with a less helpful message, so stop here with a clear one.
if not available_features:
    raise ValueError(
        f"None of the configured features {feature_cols} exist in the dataset; "
        f"available columns: {list(df.columns)}"
    )

print(f"Using features: {available_features}")

X = df[available_features].values
y = df[target_col].values

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Split data into train and test sets
# The held-out fraction and the shuffle seed both come from the 'training'
# section of the configuration, so the split is reproducible.
test_size = model_config['training']['test_size']
random_state = model_config['training']['random_state']

split_parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
X_train, X_test, y_train, y_test = split_parts

n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
train_share_pct = n_train_rows / len(X) * 100

print(f"Training set size: {n_train_rows}")
print(f"Test set size: {n_test_rows}")
print(f"Training set percentage: {train_share_pct:.1f}%")

In [None]:
# Train Linear Regression Model
# Fits ordinary least squares (with an intercept term) on the training split,
# then lists the learned weight of every feature alongside the intercept.
print("Training model...")
model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)
print("Model trained successfully!")

# Model coefficients
# model.coef_ is paired with available_features position by position.
print("\nModel coefficients:")
for position, name in enumerate(available_features):
    weight = model.coef_[position]
    print(f"  {name}: {weight:.4f}")
print(f"  Intercept: {model.intercept_:.4f}")

In [None]:
# Make predictions and evaluate them
def _regression_scores(y_true, y_pred):
    """Return (mse, rmse, mae, r2) for one set of targets and predictions."""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, r2


def _print_scores(heading, mse, rmse, mae, r2):
    """Print one metrics section under the given heading line."""
    print(heading)
    print(f"  MSE:  {mse:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  MAE:  {mae:.4f}")
    print(f"  R²:   {r2:.4f}")


y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluate on training set
train_mse, train_rmse, train_mae, train_r2 = _regression_scores(y_train, y_train_pred)

# Evaluate on test set
test_mse, test_rmse, test_mae, test_r2 = _regression_scores(y_test, y_test_pred)

print("Model Evaluation Results")
print("=" * 50)
_print_scores("\nTraining Set Metrics:", train_mse, train_rmse, train_mae, train_r2)

_print_scores("\nTest Set Metrics:", test_mse, test_rmse, test_mae, test_r2)

In [None]:
# Save metrics
# Collects the model identity plus train/test scores into one dictionary.
# Scores are cast to plain floats so NumPy scalars serialize as JSON numbers.
metric_keys = ("mse", "rmse", "mae", "r2_score")
train_scores = (train_mse, train_rmse, train_mae, train_r2)
test_scores = (test_mse, test_rmse, test_mae, test_r2)

metrics = {
    "model_name": model_config['name'],
    "model_version": model_config['version'],
    "training_metrics": {key: float(score) for key, score in zip(metric_keys, train_scores)},
    "test_metrics": {key: float(score) for key, score in zip(metric_keys, test_scores)},
}

# Save metrics to file
metrics_output_path = '/tmp/model_metrics.json'
with open(metrics_output_path, 'w') as f:
    f.write(json.dumps(metrics, indent=2))

print(f"\nMetrics saved to: {metrics_output_path}")
print("\nMetrics Summary:")
print(json.dumps(metrics, indent=2))