# 01. Load Dataset

In [None]:
import pandas as pd

# Read the competition's training and test CSVs into DataFrames in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/house-prices-advanced-regression-techniques/train.csv",
        "/kaggle/input/house-prices-advanced-regression-techniques/test.csv",
    )
)

In [None]:
# Preview the first rows to sanity-check that the training CSV parsed as expected.
train.head()

In [None]:
# Summarise column dtypes and non-null counts to see which columns need imputation.
train.info()

# 02. Preprocessing

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Numeric columns that are imputed in the next cells.
num_cols_with_nan = [
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
]

# Categorical columns that are imputed in the next cells (order preserved).
cat_cols_with_nan = [
    'Alley',
    'MasVnrType',
    # Basement descriptors
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    # Garage descriptors
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'Electrical',
]

In [None]:
# Learn each listed numeric column's median, then fill its missing values in place.
# NOTE(review): the medians are learned from every row of `train`, i.e. before
# the hold-out split made later in the notebook.
num_imputer = SimpleImputer(strategy='median').fit(train[num_cols_with_nan])
train[num_cols_with_nan] = num_imputer.transform(train[num_cols_with_nan])

In [None]:
# Learn each listed categorical column's most frequent value, then fill its gaps in place.
# NOTE(review): the modes are learned from every row of `train`, i.e. before
# the hold-out split made later in the notebook.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(train[cat_cols_with_nan])
train[cat_cols_with_nan] = cat_imputer.transform(train[cat_cols_with_nan])

In [None]:
# Confirm the imputed columns no longer contain missing values (every count should be 0).
print(train[[*num_cols_with_nan, *cat_cols_with_nan]].isna().sum())

# 03. Scaling & Encoding

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Split the feature columns by dtype so each group can get its own transformer.
# 'SalePrice' is the regression target, and 'Id' is the row-identifier column
# of the Kaggle House Prices CSV; neither should be scaled and fed to the
# model as a feature. Filtering (rather than list.remove) also avoids a
# ValueError when one of these columns is not present in the frame.
numeric_cols = [
    col
    for col in train.select_dtypes(include=['int64', 'float64']).columns
    if col not in ('SalePrice', 'Id')
]

# Text-valued columns, to be one-hot encoded.
categorical_cols = train.select_dtypes(include=['object']).columns.tolist()

In [None]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old spelling (passing it raises
# TypeError). Try the current keyword first and fall back for older installs.
# Either way the encoder yields a dense one-hot block, and categories unseen
# at fit time are encoded as all zeros instead of raising.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Standardise the numeric columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_cols),
    ('cat', one_hot_encoder, categorical_cols),
])

In [None]:
# Separate the regression target from the feature frame.
y = train['SalePrice']
X = train.drop(columns='SalePrice')

In [None]:
# Fit the scaler/encoder on the full feature frame and transform it in one step.
X_processed = preprocessor.fit_transform(X)

In [None]:
# Report (rows, columns) of the scaled and one-hot-encoded feature matrix.
print("Shape after Scaling & Encoding:", X_processed.shape)

# 04. PCA

In [None]:
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 asks PCA to keep as many components as
# are needed to explain that fraction of the variance (here 95%).
pca = PCA(n_components=0.95, random_state=42)

In [None]:
# Fit PCA on the preprocessed matrix and project it onto the retained components.
X_pca = pca.fit_transform(X_processed)

In [None]:
# The second dimension is the number of principal components that were kept.
print("Shape after PCA:", X_pca.shape)

# 05. Train / Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the PCA-transformed rows for evaluation; the fixed seed keeps
# the split reproducible.
# NOTE(review): the imputers, scaler/encoder and PCA were all fitted on every
# row before this split, so information from the hold-out rows has already
# leaked into the features and the scores below are likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [None]:
# Show the sizes of the two partitions produced by the split.
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# 06. Linear Regression + Ridge + Lasso

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [None]:
# Linear baselines keyed by the label used when printing their scores:
# ordinary least squares, plus L2- (Ridge) and L1- (Lasso) regularised variants.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0, random_state=42),
    Lasso=Lasso(alpha=0.01, random_state=42),
)

In [None]:
# Fit every linear baseline on the training partition and report its
# hold-out RMSE and R^2.
for name, model in models.items():
    # fit() hands back the fitted estimator, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    rmse, r2 = np.sqrt(mean_squared_error(y_test, y_pred)), r2_score(y_test, y_pred)
    print(f"{name} -> RMSE: {rmse:.2f}, R2: {r2:.4f}")

# 07. Random Forest + XGBoost

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Tree-ensemble candidates keyed by the label used when printing their scores.
adv_models = dict(
    RandomForest=RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42),
    XGBoost=XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.1, random_state=42),
)

In [None]:
# Fit every ensemble model on the training partition and report its
# hold-out RMSE and R^2, using the same format as the linear baselines.
for name, model in adv_models.items():
    # fit() hands back the fitted estimator, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    rmse, r2 = np.sqrt(mean_squared_error(y_test, y_pred)), r2_score(y_test, y_pred)
    print(f"{name} -> RMSE: {rmse:.2f}, R2: {r2:.4f}")

# 08. Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Base estimator for the search; the hyperparameters under study are supplied
# by the search itself, so only the seed is fixed here.
xgb_model = XGBRegressor(random_state=42)

In [None]:
# Candidate values for each XGBoost hyperparameter explored by the search.
param_grid = dict(
    n_estimators=[100, 200, 300, 400],      # number of boosting rounds
    max_depth=[3, 5, 6, 8, 10],             # maximum depth of each tree
    learning_rate=[0.01, 0.05, 0.1, 0.2],   # shrinkage applied per round
    subsample=[0.6, 0.8, 1.0],              # fraction of rows sampled per tree
    colsample_bytree=[0.6, 0.8, 1.0],       # fraction of columns sampled per tree
    gamma=[0, 0.1, 0.2, 0.3],               # minimum loss reduction to split
)

In [None]:
# Randomised search: sample 50 parameter combinations from `param_grid`, score
# each with 3-fold cross-validation on (negated) RMSE, and use all CPU cores.
random_search = RandomizedSearchCV(
    xgb_model,
    param_grid,
    n_iter=50,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42,  # makes the sampled combinations reproducible
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the search on the training partition only; the hold-out rows stay unseen
# by the tuning procedure itself.
random_search.fit(X_train, y_train)

In [None]:
# Take the estimator refitted with the winning parameters and score it on the
# hold-out partition.
best_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

y_pred_best = best_model.predict(X_test)
rmse_best, r2_best = (
    np.sqrt(mean_squared_error(y_test, y_pred_best)),
    r2_score(y_test, y_pred_best),
)
print(f"Tuned XGBoost -> RMSE: {rmse_best:.2f}, R2: {r2_best:.4f}")

# 09. Feature Importance

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Per-input importance scores from the tuned model. The inputs are PCA
# components, so each score refers to a component rather than an original column.
importances = best_model.feature_importances_

# Positions of the components ordered from most to least important.
indices = np.flip(np.argsort(importances))

In [None]:
# Bar chart of component importances, sorted from most to least important.
plt.figure(figsize=(12, 6))
plt.bar(range(len(importances)), importances[indices], align='center')
# Label each bar with its 1-based principal-component number.
plt.xticks(
    range(len(importances)),
    [f'PC{i+1}' for i in indices],
    rotation=90,
)
plt.ylabel("Importance")
plt.title("Feature Importance of PCA Components (XGBoost)")
plt.tight_layout()
plt.show()

# 10. Save the Model

In [None]:
import joblib

# Persist the tuned estimator to disk.
# NOTE(review): only the model is saved here; the fitted imputers, `preprocessor`
# and `pca` that new data must pass through are not persisted in the visible code.
joblib.dump(best_model, "xgboost_house_prices_pca_model.joblib")