In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
import mlflow
import mlflow.sklearn
from sklearn.compose import make_column_selector as selector

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of
# every file found beneath it, one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [7]:
# Read the training set and discard any row that has no target value;
# chaining dropna() avoids mutating the frame in place.
train_df = (
    pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
    .dropna(subset=['SalePrice'])
)

# Target is the sale price; the features are every other column except the
# row identifier.
y = train_df['SalePrice']
X = train_df.drop(['SalePrice', 'Id'], axis=1)

# Cleaning:

In [10]:
class DropHighNaN(BaseEstimator, TransformerMixin):
    """Transformer that removes columns with too many missing values.

    Parameters
    ----------
    threshold : float, default=0.2
        A column is dropped when its fraction of null entries, measured on
        the data passed to ``fit``, is strictly greater than this value.

    Attributes
    ----------
    to_drop : list
        Names of the columns selected for removal by the last ``fit`` call
        (an empty list until ``fit`` has been called).
    """

    def __init__(self, threshold=0.2):
        self.threshold = threshold
        self.to_drop = []

    def fit(self, X, y=None):
        """Record which columns of ``X`` exceed the missing-value threshold.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns ``self``.
        """
        # Per-column share of null cells (mean of the boolean null mask).
        missing_share = X.isna().mean()
        too_sparse = missing_share > self.threshold
        self.to_drop = X.columns[too_sparse].tolist()
        return self

    def transform(self, X):
        """Return ``X`` without the columns recorded during ``fit``."""
        return X.drop(self.to_drop, axis=1)

# Feature Engineering:

In [11]:
# Numeric features: fill missing values with the column median, then
# standardise with StandardScaler.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# The dense-output switch of OneHotEncoder was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4.
# Prefer the new keyword and fall back to the old one so the cell runs on
# both older and current releases.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)
])

# Route columns to the matching sub-pipeline by dtype.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, selector(dtype_include=['int64', 'float64'])),
    ('cat', categorical_pipeline, selector(dtype_include=['object']))
])

# Feature Selection:

# Training:

In [18]:
# Hold out 20% of the labelled rows for validation; the seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# End-to-end model: drop sparse columns -> impute/scale/encode -> ridge regression.
full_pipeline = Pipeline([
    ('drop_high_nan', DropHighNaN()),
    ('preprocessing', preprocessor),
    ('regressor', Ridge()),
])

# Hyperparameters searched jointly: the NaN-fraction cut-off used by
# DropHighNaN and the ridge regularisation strength.
param_grid = dict(
    drop_high_nan__threshold=[0.1, 0.2, 0.5, 0.8],
    regressor__alpha=[0.01, 0.1, 1.0, 10.0, 100.0],
)

# 5-fold cross-validated search scored by (negated) RMSE, fitted on the
# training split only.
grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Estimator selected by the search; used for validation and prediction below.
best_model = grid_search.best_estimator_



# Logging:

In [None]:
import dagshub
import mlflow

# Initialise the DagsHub integration for this repository with MLflow enabled.
dagshub.init(repo_owner='ashar-22', repo_name='hw01ml', mlflow=True)

with mlflow.start_run():
    y_pred = best_model.predict(X_valid)

    # Ridge output is unbounded, so a prediction can be negative; log1p of a
    # value below -1 is NaN, which would make the metric call raise.
    # Clip at zero for the metric only (y_pred itself is left untouched).
    y_pred_nonneg = np.clip(y_pred, 0, None)

    # RMSE on the log1p scale. The square root is taken explicitly because
    # `mean_squared_error(..., squared=False)` was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(
        mean_squared_error(np.log1p(y_valid), np.log1p(y_pred_nonneg))
    ))

    # Log both tuned hyperparameters so the run can be reproduced; the grid
    # search selects a NaN threshold as well as alpha.
    best_params = grid_search.best_params_
    mlflow.log_param("best_alpha", best_params['regressor__alpha'])
    mlflow.log_param("best_nan_threshold", best_params['drop_high_nan__threshold'])
    mlflow.log_metric("rmse", rmse)
    mlflow.sklearn.log_model(best_model, "ridge_regressor_model")

    print(f"Best alpha: {best_params['regressor__alpha']}")
    print(f"Validation RMSE (log1p): {rmse:.4f}")



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=fd0db4eb-3c3c-448a-9458-e012f339b7de&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=c9090b8895aaad8e0e1f44feaeb92dde7c6d62ce1e2dbfea4b1dd127d944cef8




Output()

In [19]:
# Build the Kaggle submission file from the held-out test set.
df = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# Keep the row identifiers for the submission file.
ids = df["Id"]

model = best_model

# The model was fitted on features with 'Id' removed, so remove it here too
# rather than relying on the pipeline to ignore an unexpected extra column.
predictions = model.predict(df.drop(columns=["Id"]))

submission = pd.DataFrame({
    "Id": ids,
    "SalePrice": predictions
})

submission.to_csv("/kaggle/working/submission.csv", index=False)

print("✅ submission.csv saved!")

✅ submission.csv saved!
