In [56]:
import pandas as pd
import numpy as np

train_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')

train_df.drop(columns=["Id"], inplace=True)

train_df.dropna(subset=["SalePrice"], inplace=True)

X = train_df.drop(columns=["SalePrice"])
y = train_df["SalePrice"]


# Cleaning:

In [57]:
# Cleaning
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class DropHighNaNColumns(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.3):
        self.threshold = threshold
        self.columns_to_keep_ = None

    def fit(self, X, y=None):
        nan_ratio = pd.isnull(X).mean()
        self.columns_to_keep_ = nan_ratio[nan_ratio <= self.threshold].index
        return self

    def transform(self, X):
        return X[self.columns_to_keep_]



# Feature Engineering:

In [58]:


# Feature Engineering
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_pipe = Pipeline([
  ('imputer', SimpleImputer(strategy='median')),
  ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
  ('imputer', SimpleImputer(strategy='most_frequent')),
  ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, make_column_selector(dtype_include=['int64', 'float64'])),
    ('cat', cat_pipe, make_column_selector(dtype_include=['object']))
])


# Training

In [60]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Define the model pipeline with options for different models
model_pipeline = Pipeline([
    ('clean', DropHighNaNColumns()),
    ('feature_engineering', preprocessor),
    # ('feature_selection', SelectKBest(score_func=mutual_info_regression, k=20)),
    ('model', Ridge())  # Default model, will change with grid search
])

param_grid = [
    {
        'clean__threshold': [0.1, 0.2, 0.5],
        'model': [Ridge()],
        'model__alpha': [0.01, 0.1, 1.0, 10.0]
    },
    {
        'clean__threshold': [0.1, 0.2],
        'model': [RandomForestRegressor()],
        'model__n_estimators': [100, 200],
        'model__max_depth': [5, 10]
    },
    {
        'clean__threshold': [0.1],
        'model': [xgb.XGBRegressor(eval_metric='rmse', verbosity=0)],
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.01, 0.1],
        'model__max_depth': [3, 6]
    }
]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
val_preds = best_model.predict(X_test)
# Calculate RMSE
val_rmse = np.sqrt(mean_squared_error(np.log1p(y_test), np.log1p(val_preds)))
print("Validation RMSE (log scale):", val_rmse)


60 fits failed out of a total of 140.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/linear_model/_ridge.py", line 129, in _solve_sparse_cg
    coefs[i], info = sp_linalg.cg(
                     ^^^^^^^^^^^^^
TypeError: cg() got an unexpected keyword argument 'tol'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 

Validation RMSE (log scale): 0.13625557340876357


In [62]:
# Logging
import mlflow
import dagshub

dagshub.init(repo_owner='ashar-22', repo_name='hw01ml', mlflow=True)
# End any previous run if it's still active
if mlflow.active_run():
    mlflow.end_run()

with mlflow.start_run():

  mlflow.log_param("model_type", type(grid.best_estimator_.named_steps['model']).__name__)
  mlflow.log_param("best_alpha", grid.best_params_.get('model__alpha', 'N/A'))
  mlflow.log_param("best_nan_threshold", grid.best_params_['clean__threshold'])
  mlflow.log_param("best_n_estimators", grid.best_params_.get('model__n_estimators', 'N/A'))
  mlflow.log_param("best_max_depth", grid.best_params_.get('model__max_depth', 'N/A'))
  mlflow.log_param("best_learning_rate", grid.best_params_.get('model__learning_rate', 'N/A'))
  mlflow.log_metric("val_rmse_log", val_rmse)

input_example = X_train.iloc[:5]  # or use .head() — a small sample input

mlflow.sklearn.log_model(
    best_model,
    artifact_path="model",
    input_example=input_example
)


🏃 View run shivering-fly-325 at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/0/runs/847996015e4545f3873b5cdb5a03d074
🧪 View experiment at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/0
🏃 View run welcoming-hound-3 at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/0/runs/50c94704c2874675ab043143343acf0e
🧪 View experiment at: https://dagshub.com/ashar-22/hw01ml.mlflow/#/experiments/0




<mlflow.models.model.ModelInfo at 0x7c3f5a9b2550>

In [72]:
df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv") 

ids = df["Id"]

model = best_model

predictions = model.predict(df)

submission = pd.DataFrame({
    "Id": ids,
    "SalePrice": predictions
})

submission.to_csv("/kaggle/working/submission.csv", index=False)

print("✅ submission.csv saved!")

✅ submission.csv saved!
