# House price prediction with XGBoost (Colab notebook)

In [1]:
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, RobustScaler  # good for outliers
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

# Plot configuration: render matplotlib figures inline in the notebook output
# and switch seaborn to its "whitegrid" style.
%matplotlib inline
sns.set_style("whitegrid")

In [3]:
from google.colab import files

# Two upload prompts, in this order: the TRAINING csv first, then the TEST csv.
# The first key of each returned mapping is used as the file name below, so an
# empty result (presumably what a cancelled dialog returns — confirm against the
# Colab docs) would otherwise surface as an opaque IndexError on `[0]`.
uploaded1 = files.upload()
if not uploaded1:
    raise ValueError("No training CSV was uploaded; re-run this cell and select a file.")

uploaded2 = files.upload()
if not uploaded2:
    raise ValueError("No test CSV was uploaded; re-run this cell and select a file.")

file_name1 = list(uploaded1.keys())[0]
file_name2 = list(uploaded2.keys())[0]

# Read the uploaded files by name (the preprocessing cell re-reads them from
# the same names so it always starts from raw data).
train = pd.read_csv(file_name1)
test = pd.read_csv(file_name2)

# Catch swapped uploads early: the preprocessing cell needs 'SalePrice' in the
# training frame and 'Id' in the test frame.
if 'SalePrice' not in train.columns:
    raise ValueError(f"'{file_name1}' has no 'SalePrice' column; upload the training CSV first.")
if 'Id' not in test.columns:
    raise ValueError(f"'{file_name2}' has no 'Id' column; upload the test CSV second.")


Saving train (2).csv to train (2).csv


Saving test (2).csv to test (2).csv


In [4]:
# Re-read the raw CSVs so this cell starts from untouched data on every run
# (the target column is overwritten in place a few lines below).
train = pd.read_csv(file_name1)
test = pd.read_csv(file_name2)

# Keep the test Ids for the submission file; 'Id' is dropped from the features later.
test_ids = test['Id'].copy()

# Train on log(1 + price) instead of the raw price.
train['SalePrice'] = np.log1p(train['SalePrice'])

# Stack train (without the target) on top of test so both get identical preprocessing.
n_train = len(train)
all_data = pd.concat([train.drop(columns='SalePrice'), test], axis=0, ignore_index=True)

# --- Missing values ---------------------------------------------------------
# LotFrontage: fill with the median of the same Neighborhood.
frontage_by_hood = all_data.groupby('Neighborhood')['LotFrontage']
all_data['LotFrontage'] = frontage_by_hood.transform(lambda s: s.fillna(s.median()))

# Per-column fill values: 'None' for the listed categoricals, 0 for the listed
# numerics, and 'SBrkr' for Electrical.
none_fill_cols = ['MasVnrType', 'BsmtQual', 'BsmtCond', 'GarageType',
                  'GarageFinish', 'GarageQual', 'GarageCond', 'FireplaceQu']
zero_fill_cols = ['MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                  'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt', 'GarageCars', 'GarageArea']
fill_values = {
    **dict.fromkeys(none_fill_cols, 'None'),
    **dict.fromkeys(zero_fill_cols, 0),
    'Electrical': 'SBrkr',
}
for column, filler in fill_values.items():
    all_data[column] = all_data[column].fillna(filler)

# --- Feature engineering ----------------------------------------------------
# Note: despite its name, 'YrSoldRemod' is YearRemodAdd minus YearBuilt.
all_data = all_data.assign(
    TotalSF=all_data['1stFlrSF'] + all_data['2ndFlrSF'] + all_data['TotalBsmtSF'],
    YrSoldRemod=all_data['YearRemodAdd'] - all_data['YearBuilt'],
    HasPool=all_data['PoolArea'].gt(0).astype(int),
    Has2ndFloor=all_data['2ndFlrSF'].gt(0).astype(int),
)

# --- Drop the identifier and the columns treated as useless / mostly missing ---
all_data = all_data.drop(columns=['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence'])

# --- Ordinal-encode every object-dtype column --------------------------------
cat_cols = all_data.select_dtypes(include='object').columns
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
all_data[cat_cols] = encoder.fit_transform(all_data[cat_cols])

# --- Split back into train / test --------------------------------------------
# The first n_train rows came from train; the remainder came from test.
train_proc = all_data.iloc[:n_train].copy()
train_proc['SalePrice'] = train['SalePrice']
test_proc = all_data.iloc[n_train:].copy()

X = train_proc.drop(columns='SalePrice')
y = train_proc['SalePrice']

In [7]:
# Cross-validation model: a fixed number of trees and deliberately NO
# early_stopping_rounds.
cv_params = dict(
    objective='reg:squarederror',
    n_estimators=1000,  # set high because nothing stops training early
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
)
cv_model = xgb.XGBRegressor(**cv_params)

# The scorer returns negated RMSE per fold; flip the sign to get positive RMSE values.
neg_rmse_per_fold = cross_val_score(
    cv_model, X, y, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1
)
cv_scores = -neg_rmse_per_fold

print(f"CV RMSE: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

CV RMSE: 0.1269 ± 0.0105


In [9]:
# Fit the final model on the full training set before predicting.
# No cell in this notebook defines `model`, so on a fresh kernel
# (Restart & Run All) the bare `model.predict(...)` raised NameError. Build the
# final model from the hyperparameters of the cross-validated `cv_model` so the
# submission corresponds to the CV score reported above.
model = xgb.XGBRegressor(**cv_model.get_params())
model.fit(X, y)

# Predictions are on the log1p scale used for the target; expm1 inverts it.
test_preds_log = model.predict(test_proc)
test_preds = np.expm1(test_preds_log)

# Guard against a row mismatch between the processed test frame and the saved Ids.
assert len(test_preds) == len(test_ids), "expected exactly one prediction per test Id"

submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_preds,
})
submission.to_csv('submission_xgb_basic.csv', index=False)
print("Submission ready!")

Submission ready!
