In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
import xgboost as xgbregressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [37]:
# Read the training and test CSV files from the working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [38]:
# Restrict both frames to their numeric columns; everything else is discarded.
train_data, test_data = (
    frame.select_dtypes(include=['number']) for frame in (train_data, test_data)
)

In [39]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [40]:
# Summarise the training frame: column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf

In [6]:
# (rows, columns) of the training frame.
train_data.shape

(1460, 38)

In [7]:
# (rows, columns) of the test frame.
test_data.shape

(1459, 37)

In [8]:
# Count missing values per column once, then list every column that has at
# least one. (A `> 1` threshold would hide a column with exactly one gap.)
missing_counts = train_data.isnull().sum()
print(missing_counts[missing_counts > 0])

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64


In [14]:
# Impute missing values with per-column medians learned from the training features.

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')

# Features are every remaining column except the target.
# NOTE(review): `Id` is still part of the feature set here; it looks like a row
# identifier rather than a predictor — confirm whether it should be dropped.
X_train = train_data.drop(columns=['SalePrice'])

# Fit the medians on the training features only. The original index is carried
# over so the imputed frame stays aligned with `train_data` for index-based use.
X_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

# Select the test columns in training order so the fitted imputer always
# receives the features in the layout it was fitted on.
test_features = test_data[X_train.columns]
test_data_imputed = pd.DataFrame(
    imputer.transform(test_features),
    columns=test_features.columns,
    index=test_features.index,
)




In [18]:
# (rows, columns) of the imputed training features.
X_train_imputed.shape

(1460, 37)

In [19]:
# (rows, columns) of the imputed test features.
test_data_imputed.shape

(1459, 37)

In [20]:
# Target vector: the sale price column of the training frame.
y = train_data.loc[:, 'SalePrice']

In [21]:
# Hold out 20% of the imputed training rows for validation, with a fixed seed.
# NOTE: `X_train_imputed` is rebound to the 80% training portion here, so this
# cell must be re-run together with the imputation cell, not on its own.
split_parts = train_test_split(X_train_imputed, y, test_size=0.2, random_state=42)
X_train_imputed, X_val, y_train, y_val = split_parts



In [28]:
# Model and GridSearchCV
# `eval_metric` is 'rmse' to match the final model trained from these results.
# XGBoost only reports it when an `eval_set` is passed to `fit`, so it does not
# influence which hyperparameters the search selects.
regressor = xgb.XGBRegressor(eval_metric='rmse')
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# With no `scoring` argument, GridSearchCV ranks candidates by the estimator's
# own `score` method (R^2 for regressors), averaged over the 5 folds.
search = GridSearchCV(regressor, param_grid, cv=5)
search.fit(X_train_imputed, y_train)
print("The best hyperparameters are ", search.best_params_)


The best hyperparameters are  {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}


In [29]:
# Train the final model with the hyperparameters selected by the grid search.
# `best_params_` holds exactly the keys of `param_grid`
# (learning_rate, n_estimators, max_depth), so it can be unpacked directly.
regressor = xgb.XGBRegressor(eval_metric='rmse', **search.best_params_)
regressor.fit(X_train_imputed, y_train)

# Score the model on the hold-out split, which is otherwise never used, so the
# notebook reports an error estimate on rows the model has not seen.
val_predictions = regressor.predict(X_val)
val_rmse = np.sqrt(np.mean((y_val - val_predictions) ** 2))
print("Validation RMSE:", val_rmse)

In [30]:
# Predict sale prices for the imputed test features with the final model.
predictions = regressor.predict(test_data_imputed)

In [35]:
# Sanity check: one prediction per test row is expected.
predictions.shape

(1459,)

In [32]:
# Pair each test Id with its predicted price and write the submission file.
submission_columns = {'Id': test_data['Id'], 'SalePrice': predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)

In [33]:
# Show the first five rows of the submission as plain text.
print(submission.head(n=5))

     Id      SalePrice
0  1461  127331.804688
1  1462  163812.031250
2  1463  182618.406250
3  1464  191492.218750
4  1465  195904.640625
