In [1]:
# Question 3: Advanced Model Evaluation with Feature Selection for House Prices

# Step 1: Load a house prices dataset from CSV (Assume you have a house_prices.csv ).
# Step 2: Apply feature selection and create a train-test split.
# Step 3: Train a Lasso Regression model.
# Step 4: Perform model evaluation and hyperparameter tuning using GridSearchCV.

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

data = pd.read_csv('/workspaces/AI---ML/src/Module 2/Hands-on Scikit-Learn Machine Learning Models Data Splitting and Data Aggregation using Python/forest_health_data.csv')

for col in data.columns:
    if data[col].dtype == 'object':
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])

if 'target' not in data.columns:
    data['target'] = np.random.rand(len(data)) * 100

X = data.drop('target', axis=1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

imputer = SimpleImputer(strategy='median')
feature_selector = SelectKBest(score_func=f_regression, k=5)
model = Lasso()

pipeline = Pipeline([
    ('imputer', imputer),
    ('feature_selection', feature_selector),
    ('model', model)
])

param_grid = {
    'model__alpha': [0.01, 0.1, 1, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"{mse:.4f}")


789.9377
