In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   -------------------------------------- 124.9/124.9 MB 867.9 kB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.1.1



[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb
import warnings

In [4]:
warnings.filterwarnings('ignore')


In [5]:
train = pd.read_csv(r"C:\Users\YAHYA TABBASSI\Desktop\house pricing\train.csv")
test = pd.read_csv(r"C:\Users\YAHYA TABBASSI\Desktop\house pricing\test.csv")

In [6]:
# Target variable
y = train['SalePrice']

In [7]:
# Dropping the 'Id' column from train and test data
train.drop(['Id', 'SalePrice'], axis=1, inplace=True)
test_ids = test['Id']
test.drop('Id', axis=1, inplace=True)

In [8]:
# Combine train and test data for preprocessing
data = pd.concat([train, test], sort=False)

In [9]:
# Handling missing values
# Fill numeric columns with the median
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())

In [10]:
# Fill categorical columns with the mode
categorical_cols = data.select_dtypes(include=[object]).columns
data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])


In [11]:
# Label Encoding for categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [12]:
# Splitting the data back into train and test sets
train_data = data[:len(y)]
test_data = data[len(y):]


In [13]:
# Feature Scaling (Standardization)
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

In [14]:
# Split the training data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(train_data, y, test_size=0.2, random_state=42)


In [15]:
# XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror')


In [16]:
# Hyperparameter tuning using Grid Search
param_grid = {
    'n_estimators': [500, 1000],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, device=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, m...
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None,
                                    multi_strategy=None, n_estimators=None,
                                    n_jobs=None, num_parallel_tree=None,
        

In [18]:
# Best model
best_model = grid_search.best_estimator_

In [19]:
# Validation predictions
y_pred = best_model.predict(X_valid)

In [20]:
# Evaluation: RMSE
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f'Validation RMSE: {rmse}')

Validation RMSE: 25396.503961609033


In [21]:
# Training on full training data
best_model.fit(train_data, y)


XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.7, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.05, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=3, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=1000, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)

In [22]:
# Predictions on test data
test_predictions = best_model.predict(test_data)

In [23]:
# Preparing the submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})
submission.to_csv(r"C:\Users\YAHYA TABBASSI\Desktop\house pricing\submission.csv", index=False)
print("Submission file created successfully.")

Submission file created successfully.
