In [26]:
!pip install scikit-learn xgboost





[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [27]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

In [28]:

# 1️⃣ Load data
# Both CSV files are read from the current working directory, train first.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train(1).csv', 'test(1).csv')
)

# Check data: print the training frame's (rows, columns) and its column index.
for summary in (train_df.shape, train_df.columns):
    print(summary)

(1460, 81)
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 

In [29]:
# 2️⃣ Separate features and target
# The target is the 'SalePrice' column; the feature matrix is everything
# else in train_df except the 'Id' column.
y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice', 'Id'])

In [30]:
# 3️⃣ Handle categorical data (simple encoding for demo)
# One-hot encode each frame independently; the resulting column sets can
# differ between the two frames.
X = X.pipe(pd.get_dummies)
test_df = test_df.pipe(pd.get_dummies)

In [31]:
# Align train and test columns
# Left join on the column axis: X keeps its own columns, and test_df is
# reshaped to exactly those columns, with 0 filled into any column that
# test_df did not already have.
X, test_df = X.align(
    test_df,
    axis=1,
    join='left',
    fill_value=0,
)

In [32]:
# 4️⃣ Split train/test
# Hold out 20% of the rows as a validation split; the fixed random_state
# makes the shuffle reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [33]:
# 5️⃣ Train multiple models and evaluate
# Candidate regressors keyed by display name. Insertion order is the order
# in which later cells iterate over them. All share the same seed.
models = dict([
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('XGBoost', XGBRegressor(random_state=42, verbosity=0)),
])


In [34]:
# Check shapes and the target dtype
for label, value in (
    ("X_train shape:", X_train.shape),
    ("y_train shape:", y_train.shape),
    ("y_train dtype:", y_train.dtype),
):
    print(label, value)

# Convert the targets if needed: values that cannot be parsed as numbers
# become NaN (errors='coerce') instead of raising.
y_train = y_train.pipe(pd.to_numeric, errors='coerce')
y_valid = y_valid.pipe(pd.to_numeric, errors='coerce')


X_train shape: (1168, 287)
y_train shape: (1168,)
y_train dtype: int64


In [35]:
# Report how many target values are missing in each split.
for label, target in (
    ("Number of NaNs in y_train:", y_train),
    ("Number of NaNs in y_valid:", y_valid),
):
    print(label, target.isna().sum())


Number of NaNs in y_train: 0
Number of NaNs in y_valid: 0


In [36]:
# Impute missing feature values using statistics computed on the training
# split only, then apply those same statistics to the validation split.

# X_train / X_valid are produced by train_test_split from X. Assigning
# columns into such derived frames can trigger pandas' SettingWithCopyWarning,
# so work on explicit, independent copies.
X_train = X_train.copy()
X_valid = X_valid.copy()

# Fill numeric NaNs
num_cols = X_train.select_dtypes(include=[np.number]).columns
# Compute the medians once, before X_train is modified, and reuse the same
# values for both splits.
train_medians = X_train[num_cols].median()
X_train[num_cols] = X_train[num_cols].fillna(train_medians)
X_valid[num_cols] = X_valid[num_cols].fillna(train_medians)

# Fill categorical NaNs (only if exist)
obj_cols = X_train.select_dtypes(include=['object']).columns
if len(obj_cols) > 0:
    # First row of .mode() holds one most-frequent value per column.
    mode_values = X_train[obj_cols].mode().iloc[0]
    X_train[obj_cols] = X_train[obj_cols].fillna(mode_values)
    X_valid[obj_cols] = X_valid[obj_cols].fillna(mode_values)


In [37]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fit every candidate on the training split, predict the validation split,
# and print RMSE / MAE / R^2 for each one.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    mae = mean_absolute_error(y_valid, preds)
    r2 = r2_score(y_valid, preds)

    print(f"----- {name} -----")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R^2: {r2:.4f}\n")


----- Random Forest -----
RMSE: 29215.10
MAE: 17622.33
R^2: 0.8887

----- Gradient Boosting -----
RMSE: 27251.56
MAE: 17117.23
R^2: 0.9032

----- XGBoost -----
RMSE: 26173.45
MAE: 16685.09
R^2: 0.9107



In [38]:
## Hyperparameter tuning (GridSearchCV / RandomizedSearchCV)
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each XGBoost hyperparameter; the search samples
# combinations from these lists.
param_dist = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[3, 4, 5, 6, 7],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

xgb = XGBRegressor(random_state=42)

# 20 sampled configurations, each scored with 3-fold CV on the training
# split. The scorer is *negative* RMSE (higher is better).
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Best params:", random_search.best_params_)
# Flip the sign back so the printed value is a plain RMSE.
print("Best RMSE on CV:", -random_search.best_score_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params: {'subsample': 0.6, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Best RMSE on CV: 26643.048828125


In [39]:
## Evaluate the tuned XGBoost on the validation split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# random_search was created with the default refit=True, so after
# random_search.fit(X_train, y_train) its best_estimator_ has already been
# refit on the full training split with the best parameters. Calling
# .fit(X_train, y_train) on it again would only repeat that work.
best_xgb = random_search.best_estimator_
final_preds = best_xgb.predict(X_valid)

rmse = np.sqrt(mean_squared_error(y_valid, final_preds))
mae = mean_absolute_error(y_valid, final_preds)
r2 = r2_score(y_valid, final_preds)

print(f"Final tuned model RMSE: {rmse:.2f}")
print(f"Final tuned model MAE: {mae:.2f}")
print(f"Final tuned model R^2: {r2:.4f}")


Final tuned model RMSE: 25692.91
Final tuned model MAE: 15391.57
Final tuned model R^2: 0.9139


In [40]:
import joblib

# Persist the tuned estimator to the working directory. The call's return
# value is left as the cell's last expression so it is displayed.
joblib.dump(value=best_xgb, filename='best_xgboost_model.pkl')


['best_xgboost_model.pkl']

In [41]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Use the median of the validation targets as the class boundary.
threshold = y_valid.median()

# Binarize: 1 when strictly above the threshold, otherwise 0.
y_valid_class = y_valid.gt(threshold).astype(int)
preds_class = (final_preds > threshold).astype(int)

# Score the tuned model's thresholded predictions against the thresholded truth.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, metric(y_valid_class, preds_class))


Accuracy: 0.928082191780822
Precision: 0.9370629370629371
Recall: 0.9178082191780822
F1-score: 0.9273356401384083


In [42]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Use median price in validation set as threshold
threshold = y_valid.median()

# True classes: 1 if strictly above the threshold, else 0
y_valid_class = y_valid.gt(threshold).astype(int)

print(f"Classification metrics (using threshold = {threshold}):")
print()

# Score each already-fitted model in `models` on the same binarized task.
for name in models:
    model = models[name]

    # Continuous price predictions, then the same thresholding as the truth
    preds = model.predict(X_valid)
    preds_class = (preds > threshold).astype(int)

    acc, prec, rec, f1 = (
        metric(y_valid_class, preds_class)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    print(f"----- {name} -----")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print()


Classification metrics (using threshold = 154150.0):

----- Random Forest -----
Accuracy : 0.9212
Precision: 0.9241
Recall   : 0.9178
F1-score : 0.9210

----- Gradient Boosting -----
Accuracy : 0.9315
Precision: 0.9375
Recall   : 0.9247
F1-score : 0.9310

----- XGBoost -----
Accuracy : 0.9315
Precision: 0.9375
Recall   : 0.9247
F1-score : 0.9310

