In [3]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Every object-dtype column of `train` is treated as categorical and replaced
# by integer codes. Numeric columns are left untouched.
for col in train.columns:
    if train[col].dtype != 'object':
        continue

    # Missing values are mapped to '' so they form their own category
    # instead of making the encoder fail on NaN.
    train_filled = train[col].fillna('')
    test_filled = test[col].fillna('')

    # Fit on the union of both tables so that a category appearing only in
    # `test` still has a code and transform() does not raise on it.
    le = LabelEncoder().fit(pd.concat([train_filled, test_filled]))
    train[col] = le.transform(train_filled)
    test[col] = le.transform(test_filled)

# Separate the target ('SalePrice') from the feature columns, then hold out
# 20% of the rows for validation. The fixed seed makes the split reproducible.
y = train['SalePrice']
X = train.drop(columns='SalePrice')
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize XGBoost model
# Up to 1000 depth-3 trees with a learning rate of 0.05. Training stops early
# once the validation metric has not improved for 5 consecutive rounds.
# NOTE: `early_stopping_rounds` is an estimator parameter. Passing it to
# fit() was deprecated in XGBoost 1.6 and removed in 2.0 (TypeError), so it
# is configured here on the constructor instead (requires XGBoost >= 1.6).
model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=3,
    early_stopping_rounds=5,
)

# Fit the model
# The validation split is supplied as the evaluation set that early stopping
# monitors; per-round evaluation logging is silenced.
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

# Predict on the validation set
y_pred = model.predict(X_valid)

# Compute and print the metrics
# NOTE: the same validation split is also used as the early-stopping
# eval_set during fitting, so these scores may be somewhat optimistic
# compared with a truly unseen hold-out set.
mse = mean_squared_error(y_valid, y_pred)
mae = mean_absolute_error(y_valid, y_pred)
r2 = r2_score(y_valid, y_pred)

print(f'Mean Squared Error: {mse:.4f}')
print(f'Mean Absolute Error: {mae:.4f}')
print(f'R^2: {r2:.4f}')

# Predict on the test set
# Select exactly the feature columns the model was trained on, in the same
# order, so a differently ordered or wider test.csv cannot misalign features.
# A column missing from `test` fails here with a clear KeyError.
test_predictions = model.predict(test[X.columns])




Mean Squared Error: 768550701.1355
Mean Absolute Error: 17347.5456
R^2: 0.8998
