In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [46]:
warnings.filterwarnings('ignore')

In [47]:
def load_data(train_path='train.csv', test_path='test.csv'):
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)
    return df_train, df_test

In [48]:
def handle_missing_values(df, threshold=600):
    initial_shape = df.shape
    cols_to_drop = [col for col in df.columns if df[col].isnull().sum() > threshold]
    if cols_to_drop:
        df.drop(columns=cols_to_drop, inplace=True)
        print(f"Dropped columns with more than {threshold} null values: {cols_to_drop}")
    print(f"Shape after dropping high-null columns: {df.shape}")
    
    numeric_cols = df.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        if df[col].isnull().any():
            skewness = df[col].skew()
            fill_value = df[col].median() if abs(skewness) >= 0.5 else df[col].mean()
            df[col] = df[col].fillna(fill_value)


    categorical_cols = df.select_dtypes(include='object').columns
    for col in categorical_cols:
        if df[col].isnull().any():
            mode_value = df[col].mode()[0] if not df[col].mode().empty else "Unknown"
            df[col] = df[col].fillna(mode_value)

    print("Missing values in numerical and categorical columns handled.")
    return df

In [49]:
def feature_engineering(df):
    if 'Id' in df.columns:
        df = df.drop(columns=['Id'])
        print("Dropped 'Id' column.")

    numerical_features = df.select_dtypes(include=np.number).columns.tolist()
    if 'SalePrice' in numerical_features:
        numerical_features.remove('SalePrice')

    corr_matrix = df[numerical_features].corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop_high_corr = [column for column in upper_tri.columns if any(upper_tri[column] > 0.8)]

    if to_drop_high_corr:
        df.drop(columns=to_drop_high_corr, inplace=True)
        print(f"Dropped highly correlated numerical features (corr > 0.8): {to_drop_high_corr}")

    print(f"Shape after feature engineering: {df.shape}")
    return df

In [50]:
def preprocess_features(X_train_df, X_test_df):
    numerical_features = X_train_df.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X_train_df.select_dtypes(include='object').columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features),
        ],
        remainder='drop'
    )

    X_train_transformed = preprocessor.fit_transform(X_train_df)
    X_test_transformed = preprocessor.transform(X_test_df)

    print(f"Transformed training data shape: {X_train_transformed.shape}")
    print(f"Transformed test data shape: {X_test_transformed.shape}")
    return X_train_transformed, X_test_transformed, preprocessor

In [51]:
def evaluate_model(y_true, y_pred, set_name=""):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"{set_name} evaluation")
    print(f"  Mean Absolute Error (MAE): {mae:.4f}")
    print(f"  Mean Squared Error (MSE): {mse:.4f}")
    print(f"  Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"  R2 Score: {r2:.4f}")

In [52]:
def train_and_evaluate_models(X_train, y_train, X_test, y_test, models):
    print("\n" + "="*50)
    print("Starting initial model training and evaluation...")
    print("="*50)
    for name, model in models.items():
        print(f"\nModel: {name}")
        model.fit(X_train, y_train)

        y_test_pred = model.predict(X_test)
        evaluate_model(y_test, y_test_pred, "Test set")

        y_train_pred = model.predict(X_train)
        evaluate_model(y_train, y_train_pred, "Train set")
        print("-" * 50)

In [53]:
def tune_hyperparameters(X_train, y_train, tunable_models_params):
    print("\n" + "="*50)
    print("Starting hyperparameter tuning with RandomizedSearchCV...")
    print("="*50)
    model_best_params = {}
    for name, model, params in tunable_models_params:
        print(f"\nTuning {name}...")
        random_search = RandomizedSearchCV(model, params, cv=5, n_iter=10, n_jobs=-1, random_state=42, verbose=1)
        random_search.fit(X_train, y_train)
        best_params = random_search.best_params_
        model_best_params[name] = best_params
        print(f"Best parameters for {name}: {best_params}")
    return model_best_params

In [54]:
def train_best_models_and_predict(X_train, y_train, X_test_processed, test_ids, best_params_dict):
    print("\n" + "="*50)
    print("Training best models and generating predictions...")
    print("="*50)

    best_xgboost_model = XGBRegressor(
        max_depth=best_params_dict.get('XG-boost', {}).get('max_depth', 5),
        learning_rate=best_params_dict.get('XG-boost', {}).get('learning_rate', 0.01),
        n_estimators=best_params_dict.get('XG-boost', {}).get('n_estimators', 300),
        gamma=best_params_dict.get('XG-boost', {}).get('gamma', 0.1),
        subsample=best_params_dict.get('XG-boost', {}).get('subsample', 0.6),
        colsample_bytree=best_params_dict.get('XG-boost', {}).get('colsample_bytree', 0.8),
        reg_alpha=best_params_dict.get('XG-boost', {}).get('reg_alpha', 0.1),
        reg_lambda=best_params_dict.get('XG-boost', {}).get('reg_lambda', 1.5),
        n_jobs=-1,
        min_child_weight=best_params_dict.get('XG-boost', {}).get('min_child_weight', 5)
    )

    models_to_predict = {
        'XG-boost': best_xgboost_model,
    }

    for name, model in models_to_predict.items():
        print(f"Fitting {name} model for final prediction...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test_processed)

        output = pd.DataFrame({
            'Id': test_ids,
            'SalePrice': y_pred
        })
        output_filename = f'{name.replace(" ", "_").lower()}_submission.csv'
        output.to_csv(output_filename, index=False)
        print(f"✅ Predictions for {name} saved as '{output_filename}'")

In [None]:
def main():
    df_train, df_final_test_raw = load_data()

    final_test_ids = df_final_test_raw['Id']

    y_train_target = df_train['SalePrice']
    X_train_features = df_train.drop(['SalePrice'], axis=1)

    X_final_test_features = df_final_test_raw.drop(['Id'], axis=1)

    X_train_features = handle_missing_values(X_train_features.copy())
    X_final_test_features = handle_missing_values(X_final_test_features.copy())

    train_cols = X_train_features.columns
    test_cols = X_final_test_features.columns

    unique_to_train = set(train_cols) - set(test_cols)
    unique_to_test = set(test_cols) - set(train_cols)

    if unique_to_train:
        X_train_features = X_train_features.drop(columns=list(unique_to_train))
        print(f"Dropped columns unique to train set: {list(unique_to_train)}")
    if unique_to_test:
        X_final_test_features = X_final_test_features.drop(columns=list(unique_to_test))
        print(f"Dropped columns unique to test set: {list(unique_to_test)}")

    X_final_test_features = X_final_test_features[X_train_features.columns]

    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train_features, y_train_target, test_size=0.25, random_state=42
    )
    print(f"Train-validation split done. X_train_split shape: {X_train_split.shape}")

    X_train_processed, X_val_processed, preprocessor_fitted = preprocess_features(X_train_split, X_val_split)
    X_final_test_processed = preprocessor_fitted.transform(X_final_test_features)

    initial_models = {
        'Random Forest': RandomForestRegressor(random_state=42),
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(random_state=42),
        'Lasso Regression': Lasso(random_state=42),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Support Vector Regression': SVR(),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'XG-boost': XGBRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'Ada Boosting': AdaBoostRegressor(random_state=42)
    }

    train_and_evaluate_models(X_train_processed, y_train_split, X_val_processed, y_val_split, initial_models)

    rf_params = {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False]
    }
    gb_params = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 4, 5, 6],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.7, 0.8, 1.0],
        'max_features': ['sqrt', 'log2']
    }
    xgb_params = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.2],
        'reg_alpha': [0, 0.1, 1],
        'reg_lambda': [1, 1.5, 2]
    }

    tunable_models_params = [
        ('Gradient Boosting', GradientBoostingRegressor(random_state=42), gb_params),
        ('Random Forest', RandomForestRegressor(random_state=42), rf_params),
        ('XG-boost', XGBRegressor(random_state=42), xgb_params)
    ]

    best_params = tune_hyperparameters(X_train_processed, y_train_split, tunable_models_params)

    train_best_models_and_predict(X_train_processed, y_train_split, X_final_test_processed, final_test_ids, best_params)

if __name__ == "__main__":
    main()

Dropped columns with more than 600 null values: ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
Shape after dropping high-null columns: (1460, 74)
Missing values in numerical and categorical columns handled.
Dropped columns with more than 600 null values: ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
Shape after dropping high-null columns: (1459, 73)
Missing values in numerical and categorical columns handled.
Dropped columns unique to train set: ['Id']
Train-validation split done. X_train_split shape: (1095, 73)
Transformed training data shape: (1095, 225)
Transformed test data shape: (365, 225)

Starting initial model training and evaluation...

Model: Random Forest
Test set evaluation
  Mean Absolute Error (MAE): 17094.4650
  Mean Squared Error (MSE): 766765762.8639
  Root Mean Squared Error (RMSE): 27690.5356
  R2 Score: 0.8905
Train set evaluation
  Mean Absolute Error (MAE): 6700.0333
  Mean Squared Error (MSE): 144282577.9208
 