In [None]:
#libraries
import numpy as np
import pandas as pd
import os

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

In [None]:
# Data paths: every competition file lives in the same input directory, so the
# directory is spelled out once and the individual paths are derived from it.
DATA_DIR = "/kaggle/input/playground-series-s5e2"

train_file_path = f"{DATA_DIR}/train.csv"
extra_train_file_path = f"{DATA_DIR}/training_extra.csv"
test_file_path = f"{DATA_DIR}/test.csv"
sample_submission_file_path = f"{DATA_DIR}/sample_submission.csv"

In [None]:
# Load the base training table, the extra training table and the test table,
# in that order, each into its own DataFrame.
train_df, extra_train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_file_path, extra_train_file_path, test_file_path)
)

In [None]:
#visualize data
# A notebook cell only renders its LAST bare expression, so everything that is
# not last is printed explicitly; otherwise it is evaluated and thrown away.
print(f"train_df shape: {train_df.shape}")
print(f"extra_train_df shape: {extra_train_df.shape}")
print(f"test_df shape: {test_df.shape}")

# Column dtypes and non-null counts (info() writes to stdout on its own)
train_df.info()

print(extra_train_df.head(5))

# Last expression of the cell: rendered as the cell output
train_df.head(5)

In [None]:
def one_hot_encoding(df: pd.DataFrame, encoder: "OneHotEncoder | None" = None) -> pd.DataFrame:
    """
    Perform one-hot encoding on the object-dtype columns of a DataFrame.

    Missing values in those columns are replaced by the literal category
    'missing' before encoding. The input frame itself is never modified.

    Args:
        df (pd.DataFrame): Input DataFrame with categorical columns
        encoder (OneHotEncoder, optional): Encoder to share between calls.
            It must produce dense output. An encoder that has not been
            fitted yet is fitted on ``df`` and keeps that fit, so the same
            object can be passed again; an already fitted encoder is only
            used to transform, which gives every dataset exactly the
            columns learned from the first one. When omitted, a fresh
            encoder is fitted on ``df`` alone.

    Returns:
        pd.DataFrame: The non-categorical columns followed by the encoded
        columns, on the same index as ``df``. If ``df`` has no object-dtype
        columns, ``df`` itself is returned unchanged.
    """
    # Identify categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    if categorical_cols.empty:
        return df  # Return original if no categorical columns

    print(f"Processing categorical columns:\n{list(categorical_cols)}")

    # Work on a copy so the caller's frame keeps its missing values
    df_processed = df.copy()
    df_processed[categorical_cols] = df_processed[categorical_cols].fillna('missing')
    categorical_frame = df_processed[categorical_cols]

    if encoder is None:
        encoder = OneHotEncoder(
            sparse_output=False,
            handle_unknown='ignore',
            drop=None
        )

    # A fitted OneHotEncoder exposes `categories_`. Refitting a fitted encoder
    # would throw away the categories (and therefore the columns) it learned.
    if not hasattr(encoder, 'categories_'):
        encoder.fit(categorical_frame)

    # Build the encoded frame on the original index so the concat below is a
    # pure column-wise join
    encoded_df = pd.DataFrame(
        encoder.transform(categorical_frame),
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df_processed.index
    )

    # Combine with original numeric columns
    result_df = pd.concat(
        [df_processed.drop(categorical_cols, axis=1), encoded_df],
        axis=1
    )

    # Verify data integrity
    assert len(result_df) == len(df), "Row count changed after encoding"
    return result_df

In [None]:
def imputation(X_train: pd.DataFrame, X_valid: pd.DataFrame, test: pd.DataFrame) -> tuple:
    """
    Perform missing value imputation on training, validation and test datasets.

    The imputer (default strategy: column mean) is fitted on ``X_train`` only
    and then applied to all three datasets, so no statistics are taken from
    the validation or test data.

    Args:
        X_train (pd.DataFrame): Training dataset
        X_valid (pd.DataFrame): Validation dataset
        test (pd.DataFrame): Test dataset

    Returns:
        tuple: Imputed training, validation and test DataFrames, in that
        order. Each keeps the column names and the row index of its input.
    """
    # keep_empty_features=True: a column that is entirely missing in X_train
    # would otherwise be dropped from the output, after which the column
    # names below could no longer be re-attached.
    my_imputer = SimpleImputer(keep_empty_features=True)
    my_imputer.fit(X_train)

    # transform() returns a bare array; re-attach column names and the
    # original index so rows still line up by label with their targets.
    return tuple(
        pd.DataFrame(
            my_imputer.transform(dataset),
            columns=dataset.columns,
            index=dataset.index,
        )
        for dataset in (X_train, X_valid, test)
    )

In [None]:
# Stack the base and the extra training tables row-wise; the combined frame
# gets a fresh consecutive index instead of the two overlapping ones.
new_train = pd.concat((train_df, extra_train_df), ignore_index=True)
print(new_train.shape)
new_train.head(5)

In [None]:
# Target is the Price column; every other column is a feature.
y = new_train["Price"]
X = new_train.drop(columns=["Price"])

for part in (X, y):
    print(part.shape)

# Fixed-seed 80/20 hold-out split
split_options = {"train_size": 0.8, "test_size": 0.2, "random_state": 0}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [None]:
print(X_train.shape)
new_X_train = one_hot_encoding(X_train)

# Each split is encoded on its own, so a category (or the 'missing' marker)
# that is absent from one split would give that split a different set of
# columns. Align validation and test to the training schema: indicator columns
# they lack are added as all-zero, columns unknown to training are dropped,
# and the column order is made identical.
new_X_valid = one_hot_encoding(X_valid).reindex(columns=new_X_train.columns, fill_value=0)
new_test_df = one_hot_encoding(test_df).reindex(columns=new_X_train.columns, fill_value=0)
print(new_X_train.shape)

# Imputer fitted on the training features, applied to all three sets
final_X_train, final_X_valid, final_test_df = imputation(new_X_train, new_X_valid, new_test_df)

In [None]:
# Gradient-boosted regressor: many small steps (low learning rate) with a high
# cap on the number of trees; training stops once the validation metric has
# not improved for 100 consecutive rounds.
model = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.01,
    early_stopping_rounds=100
)

# verbose=100 logs the validation metric every 100th round; verbose=True would
# emit one line per round (up to 5000) and bury the rest of the notebook.
model.fit(
    final_X_train, y_train,
    eval_set=[(final_X_valid, y_valid)],
    verbose=100
)

In [None]:
# Hold-out check: mean absolute error of the fitted model on the validation split
pred = model.predict(final_X_valid)
mae = mean_absolute_error(y_true=y_valid, y_pred=pred)
print(f"MAE: {mae}")

In [None]:
# Predictions of the fitted model for the encoded and imputed test features
final_prediction = model.predict(final_test_df)