In [None]:
from xgboost import XGBRegressor
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('display.max_columns',None)
import re
from sklearn.model_selection import train_test_split, cross_val_score
import math
from sklearn.model_selection import KFold,GridSearchCV

Data loading

In [None]:
# Read the training and Kaggle test feature tables from the local data folder.
train_data, test_data = (
    pd.read_csv("./data/train_superset.csv"),
    pd.read_csv("./data/test_superset.csv"),
)

In [None]:

# Separate the regression target ('monthly_rent') from the remaining feature columns.
y = train_data['monthly_rent']
X = train_data.drop(columns=['monthly_rent'])


In [None]:
# Preview the first few target values.
y.head()

0    1600
1    2400
2    1800
3    2100
4    2750
Name: monthly_rent, dtype: int64

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)


In [None]:
# Baseline: fit an XGBoost regressor on every available feature.
xg = XGBRegressor(
    objective='reg:squarederror',
    eval_metric="rmse",
    n_estimators=100,
    max_depth=5,
    learning_rate=0.01,
    seed=42,
)

xg.fit(X_train, y_train)

In [None]:
# Training-set RMSE of the baseline model.
# Predictions are rounded to whole numbers before scoring.
y_pred_train = xg.predict(X_train).round().astype(int)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = math.sqrt(mse_train)

In [None]:
# Validation-set RMSE of the baseline model.
# Predictions are rounded to whole numbers before scoring.
y_pred_val = xg.predict(X_val).round().astype(int)

mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = math.sqrt(mse_val)


In [None]:
# Print the RMSE values (not MSE) for the training and validation splits
print("RMSE on Training Data:", rmse_train)
print("RMSE on Validation Data:", rmse_val)

RMSE on Training Data: 542.2223612967128
RMSE on Validation Data: 539.5601096137003


In [None]:

# Predict on the Kaggle test set, selecting the same columns (in the same
# order) that the model was trained on.
y_pred_test_data = xg.predict(test_data[list(X_train.columns)])

# Show the raw (unrounded) predictions.
y_pred_test_data

array([3067.1106, 2783.1025, 2920.165 , ..., 2629.837 , 3051.412 ,
       3126.3467], dtype=float32)

In [None]:

# Round the predictions to the nearest whole number and store them as integers.
y_pred_test_data = np.round(y_pred_test_data).astype(int)




In [None]:
import os

# Destination of the submission CSV, built relative to the current working directory.
file = os.getcwd() + "/drive/MyDrive/NUS_Semester1_project_4_courses/CS5228_Project/SreeLakshmi/Submission_original_xg.csv"

# One sequential Id (starting at 0) per prediction.
num_predictions = len(y_pred_test_data)
ids = [*range(num_predictions)]

# Assemble the 'Id' / 'Predicted' table and write it without the index column.
submission_df = pd.DataFrame({'Id': ids, 'Predicted': y_pred_test_data})
submission_df.to_csv(file, index=False)

In [None]:
# Display the submission table that was just written to disk.
submission_df

Unnamed: 0,Id,Predicted
0,0,3355
1,1,2825
2,2,3636
3,3,2075
4,4,2419
...,...,...
29995,29995,2954
29996,29996,2726
29997,29997,2677
29998,29998,3272


Feature Importance

In [None]:
# Pair every training column with its importance in the fitted model, then
# show only the features with non-zero importance, most important first.
fea_imp_ = pd.DataFrame({'cols': X_train.columns, 'fea_imp': xg.feature_importances_})
fea_imp_[fea_imp_['fea_imp'] > 0].sort_values('fea_imp', ascending=False)

Unnamed: 0,cols,fea_imp
13,mean_coe,0.382264
0,flat_type,0.165707
15,min_coe,0.140427
43,planning_area_encoded,0.047591
2,property_age,0.044141
42,subzone_encoded,0.037149
16,last_month_mean,0.032851
10,school_within_2_km,0.015226
1,floor_area_sqm,0.014963
22,region_North Region,0.014182


### Cross Validation

In [None]:
# Number of feature columns in the training matrix.
X_train.shape[1]

44

In [None]:
# 10-fold cross-validation of the baseline configuration (all features), scored by RMSE.
xg = XGBRegressor(objective='reg:squarederror', n_estimators = 100, seed = 42, eval_metric = "rmse", max_depth = 5, learning_rate = 0.01)

# scikit-learn's 'neg_root_mean_squared_error' scorer returns the RMSE with its
# sign flipped (so that higher is better); cv_scores therefore holds negative values.
cv_scores = cross_val_score(xg, X, y, cv=10, scoring='neg_root_mean_squared_error')

# Negate the mean to report a real (positive) RMSE; the standard deviation is
# unaffected by the sign flip.
print(" 10-Fold Cross-Validation RMSE %0.2f with a standard deviation of %0.2f"% (-cv_scores.mean(),cv_scores.std()))


 10-Fold Cross-Validation RMSE -546.40 with a standard deviation of 76.51


In [None]:
# 10-fold cross-validation scored by R^2.
# Reuses the `xg` estimator already defined in the notebook.
cv_scores = cross_val_score(xg, X, y, scoring='r2', cv=10)

print(" 10-Fold Cross-Validation R2 %0.2f with a standard deviation of %0.2f"% (cv_scores.mean(),cv_scores.std()))

 10-Fold Cross-Validation R2 0.35 with a standard deviation of 0.07


### Optimal Feature Number Tuning

In [None]:
# Shuffled 5-fold split shared by every candidate so their scores are comparable.
folds = KFold(n_splits=5, shuffle=True, random_state=100)

# Candidate numbers of features for RFE to keep.
param_grid = {
    'n_features_to_select': [44,42, 40, 35, 38, 30, 32, 25,20,15]
}

# The estimator is passed unfitted: RFE and GridSearchCV both clone it before
# fitting, so fitting it here first would only waste time.
xg = XGBRegressor(objective='reg:squarederror', n_estimators = 100, seed = 42, eval_metric = "rmse", max_depth = 5, learning_rate = 0.01)

# n_features_to_select is left unset here; the grid search supplies it.
rfe = RFE(estimator = xg)

# Grid search over the number of kept features, scored by negated RMSE
# (higher, i.e. closer to zero, is better).
model_cv_mse = GridSearchCV(estimator=rfe,
                            param_grid=param_grid,
                            scoring='neg_root_mean_squared_error',
                            cv=folds,
                            verbose=1,
                            return_train_score=True)
model_cv_mse.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
# Read the winning feature count and its mean cross-validated score from the grid search.
# NOTE: despite the `_mse` suffix in these names, the message below reports the
# score as negated RMSE, so values closer to zero are better.

best_features_mse = model_cv_mse.best_params_['n_features_to_select']
best_neg_mse_score = model_cv_mse.best_score_

# Print the results
print(f"Best number of features for Neg RMSE: {best_features_mse}, Best Neg RMSE Score: {best_neg_mse_score}")

Best number of features for Neg RMSE: 20, Best Neg RMSE Score: -544.4572524010119


In [None]:
# Final feature selection: rerun RFE on the training split with a fixed feature count.
# NOTE: 20 is hard-coded here; keep it in sync with the feature count chosen by
# the tuning step if that step is rerun and picks a different value.
n_features_optimal = 20

# RFE clones the estimator before fitting, so it does not need to be fitted beforehand.
xg =  XGBRegressor(objective='reg:squarederror', n_estimators = 100, seed = 42, eval_metric = "rmse", max_depth = 5, learning_rate = 0.01)

rfe = RFE(xg, n_features_to_select=n_features_optimal)
rfe = rfe.fit(X_train, y_train)


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [None]:
# Names of the columns that RFE decided to keep (boolean support mask applied to the column index).
selected_features_20 = X_train.columns[rfe.get_support()]

In [None]:
# Row/column counts of the training and validation feature matrices.
# NOTE(review): the saved output reports 20 columns, while the earlier column-count
# cell reports 44 for X_train. This looks like output from an out-of-order run;
# confirm with Restart Kernel -> Run All.
X_train.shape, X_val.shape

((38064, 20), (9517, 20))

In [None]:
# Restrict the training and validation matrices to the RFE-selected columns.
X_train_selected_20 = X_train.loc[:, selected_features_20]
X_val_selected_20 = X_val.loc[:, selected_features_20]

# Kaggle test data: first aligned to the training columns, then reduced to the selected ones.
ktest_data = test_data.loc[:, X_train.columns.tolist()]
ktest_data_20 = ktest_data.loc[:, selected_features_20]

# Report which features survived the selection.
print("Selected Features:")
print(selected_features_20)

Selected Features:
Index(['flat_type', 'floor_area_sqm', 'property_age', 'nearest_mrt_planned',
       'mrt_within_0.5_km', 'mrt_within_1_km', 'school_within_1_km',
       'school_within_2_km', 'mean_coe', 'max_coe', 'min_coe',
       'last_month_mean', 'last_month_max', 'region_East Region',
       'region_North Region', 'flat_model_dbss', 'flat_model_model_a',
       'flat_model_type_s1_s2', 'subzone_encoded', 'planning_area_encoded'],
      dtype='object')


Hyper-Parameter Tuning

In [None]:
# Randomized hyper-parameter search for XGBoost on the RFE-selected features.
from sklearn.model_selection import RandomizedSearchCV

# Discrete candidate values for each hyper-parameter; eval_metric and objective
# are single-valued, so they are effectively fixed.
params = { 'max_depth': [3,4,5],
          'min_child_weight': [1, 5, 10],
           'learning_rate': [ 0.01,0.1,0.2,0.5],
           'subsample': [0.6, 0.8, 1.0],
           'n_estimators': [100,200,300],
           'colsample_bytree': [0.5,0.6,0.8],
           'eval_metric': ['rmse'],
           'objective': ['reg:squarederror']}

# Base XGBoost regressor whose hyper-parameters are overridden by each sampled candidate.
xgbr = XGBRegressor(seed = 20)

# Randomized search (not an exhaustive grid): 100 candidates sampled from `params`.
# random_state pins which candidates are sampled, so a rerun evaluates the same
# ones. verbose=0 replaces the original -1, which is outside the documented
# non-negative range (NOTE(review): expected to be rejected by the parameter
# validation in scikit-learn >= 1.2 - confirm against the installed version).
clf = RandomizedSearchCV(xgbr,
                    param_distributions=params,n_iter = 100,
                   scoring='neg_mean_squared_error',
                   random_state=20,
                   verbose=0)
# Fit the search on the selected training features.
clf.fit(X_train_selected_20, y_train)
print("Best parameters:", clf.best_params_)
# best_score_ is the mean negated MSE across folds; negate and take the square
# root to express it on the RMSE scale.
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

In [None]:
# Build the tuned model and fit it on the selected features only.
xg = XGBRegressor(
    objective='reg:squarederror',
    eval_metric="rmse",
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    min_child_weight=10,
    colsample_bytree=0.6,
    subsample=1.0,
    seed=42,
)
xg.fit(X_train_selected_20, y_train)


In [None]:
# Training-set RMSE of the tuned model.
# Predictions are rounded to whole numbers before scoring.
y_pred_train_20 = xg.predict(X_train_selected_20).round().astype(int)
mse_train = mean_squared_error(y_train, y_pred_train_20)
rmse_train_20 = math.sqrt(mse_train)

In [None]:
# Validation-set RMSE of the tuned model.
# Predictions are rounded to whole numbers before scoring.
# Note: `mse_test` holds the validation MSE despite its name.
y_pred_val_20 = xg.predict(X_val_selected_20).round().astype(int)
mse_test = mean_squared_error(y_val, y_pred_val_20)
rmse_val_20 = math.sqrt(mse_test)

In [None]:
# Print the RMSE values (not MSE) of the tuned model on the training and validation splits
print("RMSE on Training Data:", rmse_train_20)
print("RMSE on Validation Data:", rmse_val_20)

RMSE on Training Data: 465.7887886912369
RMSE on Validation Data: 485.2374893142952


In [None]:
from sklearn.metrics import r2_score

# R-squared (R2) of the tuned model on the training split (rounded predictions)
r2_train_20 = r2_score(y_train, y_pred_train_20)

# R-squared (R2) on the validation split (the hold-out from train_test_split, not the Kaggle test set)
r2_val_20 = r2_score(y_val, y_pred_val_20)

# Print the R2 scores
print("R-squared (R2) for training data:", r2_train_20)
print("R-squared (R2) for validation data:", r2_val_20)

R-squared (R2) for training data: 0.5774566587107781
R-squared (R2) for validation data: 0.54034843890878


In [None]:

# Predict on the Kaggle test set with the tuned model, selecting the same
# columns (in the same order) it was trained on.
y_pred_test_data_20 = xg.predict(test_data[list(X_train_selected_20.columns)])

# Show the raw (unrounded) predictions.
y_pred_test_data_20

array([3280.4746, 2747.0435, 3414.8613, ..., 2693.2627, 3286.19  ,
       3491.1501], dtype=float32)

In [None]:

# Round the predictions to the nearest whole number and store them as integers.
y_pred_test_data_20 = np.round(y_pred_test_data_20).astype(int)




Kaggle Test RMSE: 483.97

In [None]:
# Write the tuned model's Kaggle predictions to a submission CSV.
import os
file = os.getcwd() + "/drive/MyDrive/NUS_Semester1_project_4_courses/CS5228_Project/SreeLakshmi/Submission_original_xg_20_final.csv"

# Count the predictions actually being written (the tuned model's array), not the
# baseline model's array, so the Id column always matches 'Predicted' in length.
num_predictions = len(y_pred_test_data_20)

# Generate unique IDs starting from 0
ids = list(range(num_predictions))

# Create a DataFrame with 'Id' and 'Predicted' columns
submission_df = pd.DataFrame({'Id': ids, 'Predicted':y_pred_test_data_20 })

# Save the DataFrame to a CSV file (without the index column)
submission_df.to_csv(file, index=False)

Feature Importance

In [None]:
# Pair every selected column with its importance in the tuned model, then
# show only the features with non-zero importance, most important first.
fea_imp_ = pd.DataFrame({'cols': X_train_selected_20.columns, 'fea_imp': xg.feature_importances_})
fea_imp_[fea_imp_['fea_imp'] > 0].sort_values('fea_imp', ascending=False)

Unnamed: 0,cols,fea_imp
8,mean_coe,0.239359
0,flat_type,0.167179
10,min_coe,0.159807
17,flat_model_type_s1_s2,0.082008
2,property_age,0.055274
9,max_coe,0.048522
18,subzone_encoded,0.038733
19,planning_area_encoded,0.034736
1,floor_area_sqm,0.031745
15,flat_model_dbss,0.03005


In [None]:
# Fresh, unfitted copy of the tuned configuration, to be cross-validated on the selected features.
xg = XGBRegressor(
    objective='reg:squarederror',
    eval_metric="rmse",
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    min_child_weight=10,
    colsample_bytree=0.6,
    subsample=1.0,
    seed=42,
)


In [None]:
# 10-fold cross-validation on the selected training features, scored by RMSE.
# scikit-learn's 'neg_root_mean_squared_error' scorer returns the RMSE with its
# sign flipped (so that higher is better); cv_scores therefore holds negative values.
cv_scores = cross_val_score(xg, X_train_selected_20, y_train, cv=10, scoring='neg_root_mean_squared_error')

# Negate the mean to report a real (positive) RMSE; the standard deviation is
# unaffected by the sign flip.
print(" 10-Fold Cross-Validation RMSE %0.2f with a standard deviation of %0.2f"% (-cv_scores.mean(),cv_scores.std()))

 10-Fold Cross-Validation RMSE -490.58 with a standard deviation of 5.47


In [None]:
# 10-fold cross-validation on the selected training features, scored by R^2.
cv_scores = cross_val_score(xg, X_train_selected_20, y_train, scoring='r2', cv=10)

print(" 10-Fold Cross-Validation R2 score %0.2f with a standard deviation of %0.2f"% (cv_scores.mean(),cv_scores.std()))

 10-Fold Cross-Validation R2 score 0.53 with a standard deviation of 0.01
