In [9]:
import numpy as np
import pandas as pd
from sklearn import datasets

# Fetch the California housing dataset bundle (features, target and their names)
data = datasets.fetch_california_housing()

# Build one frame: the feature matrix under its feature names, with the target
# appended as the last column under the first name listed in `target_names`.
df = pd.DataFrame(
    data['data'],
    columns=data['feature_names'],
).assign(**{data['target_names'][0]: data['target']})

# Preview the first rows
df.head()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [10]:
from sklearn.model_selection import train_test_split

# Target is the 'MedHouseVal' column; every other column is a feature
y = df['MedHouseVal']
X = df.drop(columns='MedHouseVal')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of the full frame and of each partition
print(f"Dataset shape: {df.shape}")
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

Dataset shape: (20640, 9)
Training set size: (16512, 8)
Testing set size: (4128, 8)


In [17]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def model_trainer(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, models: dict) -> pd.DataFrame:
    """
    Fit each supplied regressor on the training data and score it on the test data.

    For every ``(name, model)`` pair in ``models`` a progress line is printed,
    ``model.fit(X_train, y_train)`` is called on the instance that was passed in,
    and predictions for ``X_test`` are compared against ``y_test``.

    Args:
        X_train (pd.DataFrame): Features used for fitting.
        y_train (pd.Series): Target values used for fitting.
        X_test (pd.DataFrame): Features used for prediction.
        y_test (pd.Series): Target values the predictions are scored against.
        models (dict): Maps a display name to an object exposing ``fit`` and ``predict``.

    Returns:
        pd.DataFrame: One row per model, in the iteration order of ``models``, with the
        columns 'Models', 'Mean Squared Error', 'R^2 Score', 'Mean Absolute Error'
        and 'Root Mean Squared Error'.
    """
    column_names = (
        'Models',
        'Mean Squared Error',
        'R^2 Score',
        'Mean Absolute Error',
        'Root Mean Squared Error',
    )
    # One list per output column; each model appends exactly one value to every list
    table = {column: [] for column in column_names}

    for label, estimator in models.items():
        print(f"Training {label}...")

        # Fit on the training split, then predict the held-out split
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)

        # RMSE is derived from the MSE rather than computed separately
        squared_error = mean_squared_error(y_test, predicted)
        row = (
            label,
            squared_error,
            r2_score(y_test, predicted),
            mean_absolute_error(y_test, predicted),
            np.sqrt(squared_error),
        )

        # `row` is ordered exactly like `column_names`
        for column, value in zip(column_names, row):
            table[column].append(value)

    return pd.DataFrame(table)
    

In [21]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Define the models to be trained.
# Every model keeps its default hyperparameters for simplicity; in practice you
# may want to tune them for better performance.
# The tree-based estimators additionally get a fixed random_state (the same seed
# as the train/test split) so that their randomized steps are repeatable and the
# metrics below can be reproduced after a kernel restart.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Support Vector Regressor': SVR(),
    'K-Neighbors Regressor': KNeighborsRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'XGBoost Regressor': XGBRegressor(random_state=42),
    'LightGBM Regressor': LGBMRegressor(random_state=42)
}

# Train the models and get the results
results_df = model_trainer(X_train, y_train, X_test, y_test, models)

Training Linear Regression...
Training Ridge Regression...
Training Lasso Regression...
Training Support Vector Regressor...
Training K-Neighbors Regressor...
Training Decision Tree Regressor...
Training Random Forest Regressor...
Training Gradient Boosting Regressor...
Training XGBoost Regressor...
Training LightGBM Regressor...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001412 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.071947


In [22]:
# Rank the models from highest to lowest R^2 score.
# The sorted copy is assigned back instead of sorting with inplace=True, so the
# step stays an ordinary expression while `results_df` is still left sorted for
# any later cells.
results_df = results_df.sort_values(by='R^2 Score', ascending=False)

results_df

Unnamed: 0,Models,Mean Squared Error,R^2 Score,Mean Absolute Error,Root Mean Squared Error
9,LightGBM Regressor,0.214848,0.836045,0.307755,0.463517
8,XGBoost Regressor,0.224583,0.828616,0.311302,0.473902
6,Random Forest Regressor,0.25396,0.806198,0.325726,0.503945
7,Gradient Boosting Regressor,0.293997,0.775645,0.371643,0.542215
5,Decision Tree Regressor,0.503822,0.615523,0.455938,0.709804
1,Ridge Regression,0.555803,0.575855,0.533204,0.745522
0,Linear Regression,0.555892,0.575788,0.5332,0.745581
2,Lasso Regression,0.938034,0.284167,0.761578,0.968521
4,K-Neighbors Regressor,1.118682,0.14631,0.812798,1.057678
3,Support Vector Regressor,1.332012,-0.016485,0.859951,1.154128
