In [147]:
import os
# Walk the Kaggle input mount and echo the full path of every file found there.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [148]:
import numpy as np
import pandas as pd

import dagshub
import mlflow
import mlflow.sklearn
import xgboost as xgb
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the training set, discard the "Id" column, and keep only rows
# that actually have a SalePrice value.
train_df = (
    pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
    .drop(columns=["Id"])
    .dropna(subset=["SalePrice"])
)

# Split the frame into the target series and the remaining predictor columns.
y = train_df["SalePrice"]
X = train_df.drop(columns=["SalePrice"])

In [149]:
# ========== Train/Test Split ==========
# test_size=0.2 reserves a fifth of the rows for evaluation; the fixed
# random_state keeps the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Cleaning:

In [150]:
# Cleaning
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class DropHighNaNColumns(BaseEstimator, TransformerMixin):
    """Drop columns whose fraction of missing values exceeds a threshold.

    The set of columns to keep is learned in ``fit`` and re-applied verbatim in
    ``transform``, so data transformed later is reduced to the same columns
    regardless of its own missing-value pattern.

    Parameters
    ----------
    threshold : float, default=0.3
        Largest fraction of null entries (as measured on the data given to
        ``fit``) a column may have and still be kept. The comparison is
        inclusive: a column sitting exactly at the threshold is retained.

    Attributes
    ----------
    columns_to_keep_ : pandas.Index
        Labels of the retained columns. Only exists after ``fit``.
    """

    def __init__(self, threshold=0.3):
        # Store hyper-parameters only. Trailing-underscore attributes are
        # reserved for state learned in fit(); creating one here would make
        # check_is_fitted() treat a brand-new instance as already fitted.
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record which columns of ``X`` have a null ratio <= ``threshold``.

        ``X`` is used as a pandas DataFrame (the per-column null ratios are
        read back through ``.index``). ``y`` is accepted for pipeline
        compatibility and ignored. Returns ``self``.
        """
        nan_ratio = pd.isnull(X).mean()
        self.columns_to_keep_ = nan_ratio[nan_ratio <= self.threshold].index
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected during ``fit``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        # Local import keeps this class self-contained within its cell.
        from sklearn.utils.validation import check_is_fitted

        # Fail with a clear "not fitted" error instead of an opaque indexing error.
        check_is_fitted(self, "columns_to_keep_")
        return X[self.columns_to_keep_]


# Feature Engineering

In [151]:
# ========== Preprocessing ==========
# Column selectors are resolved at fit time from the dtypes of the incoming frame.
numeric_columns = make_column_selector(dtype_include=['int64', 'float64'])
categorical_columns = make_column_selector(dtype_include=['object'])

# Numeric branch: median-impute missing values, then standardise.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' tolerates categories not seen during fit.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, numeric_columns),
    ('cat', cat_pipe, categorical_columns),
])

# ========== Pipeline & Param Grid ==========
# Ridge() is only a placeholder: the 'model' step is swapped out by the
# grid search through the 'model' key of the parameter grid.
model_pipeline = Pipeline(steps=[
    ('clean', DropHighNaNColumns()),
    ('feature_engineering', preprocessor),
    ('model', Ridge()),
])


In [153]:
# One dict per candidate model family. The 'model' key replaces the final
# pipeline step, 'model__*' keys tune that estimator, and 'clean__threshold'
# sets the null-ratio cut-off of the 'clean' step.
param_grid = [
    {
        'clean__threshold': [0.1],
        'model': [LinearRegression()],
    },
    {
        'clean__threshold': [0.1, 0.2],
        'model': [Ridge()],
        'model__alpha': [0.1, 1.0],
    },
    {
        'clean__threshold': [0.1],
        # Seeded so the bootstrap sampling, and therefore the model ranking,
        # is repeatable; 42 matches the seed used for the train/test split.
        'model': [RandomForestRegressor(random_state=42)],
        'model__n_estimators': [100],
        'model__max_depth': [5],
    },
    {
        'clean__threshold': [0.1],
        'model': [xgb.XGBRegressor(eval_metric='rmse', verbosity=0)],
        'model__n_estimators': [100],
        'model__learning_rate': [0.1],
        'model__max_depth': [3],
    },
]


# Training and Logging

In [154]:
# ========== MLflow Setup ==========
# Point MLflow tracking at the DagsHub-hosted server of this repository.
dagshub.init(repo_owner='ashar-22', repo_name='hw01ml', mlflow=True)
# Close any run left open by a previous execution of this cell so that
# start_run() below begins a fresh one.
if mlflow.active_run():
    mlflow.end_run()

# ========== Train Models with Logging ==========
# Maps model class name -> hold-out metrics and chosen hyper-parameters.
# NOTE: only the overall grid-search winner is stored, so this holds one entry.
results = {}

with mlflow.start_run(run_name="Model Comparison"):
    # 3-fold cross-validated search over every candidate in param_grid,
    # ranked by RMSE on the raw SalePrice scale.
    grid = GridSearchCV(model_pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    preds = best_model.predict(X_test)
    # Hold-out metrics are computed on log1p(price). Unconstrained regressors
    # can predict negative prices and log1p is NaN below -1, which would make
    # the metric functions raise; floor the predictions at 0 for scoring only.
    log_preds = np.log1p(np.clip(preds, 0, None))
    log_true = np.log1p(y_test)

    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(log_true, log_preds)))
    mae = mean_absolute_error(log_true, log_preds)
    r2 = r2_score(log_true, log_preds)

    model_name = type(best_model.named_steps['model']).__name__

    results[model_name] = {
        "rmse": rmse,
        "mae": mae,
        "r2": r2,
        "params": grid.best_params_
    }

    # Record the winning configuration and its hold-out metrics on the run.
    mlflow.log_param("best_model", model_name)
    for param, val in grid.best_params_.items():
        mlflow.log_param(param, val)

    mlflow.log_metric("rmse_log", rmse)
    mlflow.log_metric("mae_log", mae)
    mlflow.log_metric("r2_log", r2)

    # Persist the fitted pipeline; the input example is five training rows.
    mlflow.sklearn.log_model(best_model, artifact_path="model", input_example=X_train.iloc[:5])

# ========== Display Results ==========
# Prints whatever is in `results` (the single best model from the search above).
print("\n📊 Best Results Per Model:")
for model_name, result in results.items():
    print(f"{model_name}:")
    print(f"  RMSE (log1p): {result['rmse']:.4f}")
    print(f"  MAE (log1p): {result['mae']:.4f}")
    print(f"  R² (log1p): {result['r2']:.4f}")


🏃 View run Model Comparison at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/1/runs/88f3b79132804060afe6808a98ec7514
🧪 View experiment at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/1

📊 Best Results Per Model:
XGBRegressor:
  RMSE (log1p): 0.1396
  MAE (log1p): 0.0955
  R² (log1p): 0.8955


# Submission

In [155]:
# ========== Final Prediction for Submission ==========
test_df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# Detach the identifier column: it is needed for the output file but must
# not be passed to the model as a feature.
ids = test_df.pop("Id")
preds = best_model.predict(test_df)

# Two-column frame (Id, SalePrice), written without the index.
submission = pd.DataFrame({"Id": ids, "SalePrice": preds})
submission.to_csv("/kaggle/working/submission.csv", index=False)

print("\n✅ submission.csv saved!")



✅ submission.csv saved!
