In [147]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [148]:
import pandas as pd
import numpy as np

train_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

train_df.drop(columns=["Id"], inplace=True)

train_df.dropna(subset=["SalePrice"], inplace=True)

X = train_df.drop(columns=["SalePrice"])
y = train_df["SalePrice"]

In [149]:
# ========== Train/Test Split ==========
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cleaning:

In [150]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class DropHighNaNColumns(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.3):
        self.threshold = threshold
        self.columns_to_keep_ = None

    def fit(self, X, y=None):
        nan_ratio = pd.isnull(X).mean()
        self.columns_to_keep_ = nan_ratio[nan_ratio <= self.threshold].index
        return self

    def transform(self, X):
        return X[self.columns_to_keep_]


# Feature Engineering

In [151]:
# ========== Preprocessing ==========
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, make_column_selector(dtype_include=['int64', 'float64'])),
    ('cat', cat_pipe, make_column_selector(dtype_include=['object']))
])

# ========== Pipeline & Param Grid ==========
model_pipeline = Pipeline([
    ('clean', DropHighNaNColumns()),
    ('feature_engineering', preprocessor),
    ('model', Ridge())
])


In [158]:
# Define model names, pipelines, and grid settings
model_names = ['Linear', 'Ridge', 'RandomForest', 'XGBoost']
pipelines = [
    model_pipeline.set_params(model=LinearRegression()),
    model_pipeline.set_params(model=Ridge()),
    model_pipeline.set_params(model=RandomForestRegressor()),
    model_pipeline.set_params(model=xgb.XGBRegressor(eval_metric='rmse', verbosity=0))
]
grids = [
    {
        'clean__threshold': [0.1, 0.2],
        'model__alpha': [0.01, 0.1, 1.0, 10.0]
    },
    {
        'clean__threshold': [0.1, 0.2],
        'model__alpha': [0.01, 0.1, 1.0, 10.0]
    },
    {
        'clean__threshold': [0.1, 0.2],
        'model__n_estimators': [100, 200],
        'model__max_depth': [5, 10]
    },
    {
        'clean__threshold': [0.1],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1],
        'model__max_depth': [3, 6]
    }
]


# Training and Logging

In [159]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import dagshub

dagshub.init(repo_owner='ashar-22', repo_name='hw01ml', mlflow=True)

model_results = []  # Store results per model

for model_name, pipeline, grid in zip(model_names, pipelines, grids):
    print(f"\n🔍 Training {model_name}...")

    search = GridSearchCV(pipeline, [grid], cv=3, scoring='neg_root_mean_squared_error', error_score='raise')
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    y_pred = best_model.predict(X_test)
    y_test_log, y_pred_log = np.log1p(y_test), np.log1p(y_pred)

    rmse = np.sqrt(mean_squared_error(y_test_log, y_pred_log))
    mae = mean_absolute_error(y_test_log, y_pred_log)
    r2 = r2_score(y_test_log, y_pred_log)

    model_results.append({
        "model": model_name,
        "rmse": rmse,
        "mae": mae,
        "r2": r2,
        "search": search
    })

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_type", model_name)
        for param, value in search.best_params_.items():
            mlflow.log_param(param, value)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        mlflow.sklearn.log_model(best_model, artifact_path="model")

# Display Results per Model
print("\n📊 Best Results Per Model:")
for result in model_results:
    print(f"{result['model']}:\n  RMSE (log1p): {result['rmse']:.4f}\n  MAE (log1p): {result['mae']:.4f}\n  R² (log1p): {result['r2']:.4f}")

# Display Best Model Overall
best_result = min(model_results, key=lambda x: x['rmse'])
print(f"\n🏆 Best overall model: {best_result['model']} with RMSE (log1p): {best_result['rmse']:.4f}")



🔍 Training Linear...




🏃 View run Linear at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/1/runs/76649263b6c443d8bf48d931e6dd8966
🧪 View experiment at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/1

🔍 Training Ridge...




🏃 View run Ridge at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/1/runs/6ac42a92c084479f8bc6a6f05afb7b83
🧪 View experiment at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/1

🔍 Training RandomForest...




🏃 View run RandomForest at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/1/runs/2f6d8a93f124427c80cd8615d9e02d8c
🧪 View experiment at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/1

🔍 Training XGBoost...




🏃 View run XGBoost at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/1/runs/0f92d785a0ad49ff9dc04825f97565b0
🧪 View experiment at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/1

📊 Best Results Per Model:
Linear:
  RMSE (log1p): 0.1465
  MAE (log1p): 0.0982
  R² (log1p): 0.8849
Ridge:
  RMSE (log1p): 0.1465
  MAE (log1p): 0.0982
  R² (log1p): 0.8849
RandomForest:
  RMSE (log1p): 0.1446
  MAE (log1p): 0.0993
  R² (log1p): 0.8879
XGBoost:
  RMSE (log1p): 0.1363
  MAE (log1p): 0.0923
  R² (log1p): 0.9005

🏆 Best overall model: XGBoost with RMSE (log1p): 0.1363


# Submission

In [160]:
test_df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

ids = test_df["Id"]
test_df.drop(columns=["Id"], inplace=True)
preds = best_model.predict(test_df)

submission = pd.DataFrame({"Id": ids, "SalePrice": preds})
submission.to_csv("/kaggle/working/submission.csv", index=False)

print("\n✅ submission.csv saved!")



✅ submission.csv saved!
