## Importing libraries

In [1]:
import numpy as np 
import pandas as pd 
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from warnings import filterwarnings
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score 
filterwarnings("ignore", category=DeprecationWarning) 
filterwarnings("ignore", category=FutureWarning) 
filterwarnings("ignore", category=UserWarning)

## Loading the dataset

In [2]:
train = pd.read_csv("../input/eda-concrete-strength/Filtered_dataset.csv")
train.drop(["Unnamed: 0"],axis=1,inplace=True)
train

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
1,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90,47.03
2,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28,36.45
3,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
4,475.0,0.0,0.0,228.0,0.0,932.0,594.0,28,39.29
...,...,...,...,...,...,...,...,...,...
906,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
907,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
908,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
909,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cement              911 non-null    float64
 1   Blast Furnace Slag  911 non-null    float64
 2   Fly Ash             911 non-null    float64
 3   Water               911 non-null    float64
 4   Superplasticizer    911 non-null    float64
 5   Coarse Aggregate    911 non-null    float64
 6   Fine Aggregate      911 non-null    float64
 7   Age                 911 non-null    int64  
 8   Strength            911 non-null    float64
dtypes: float64(8), int64(1)
memory usage: 64.2 KB


In [4]:
x_train = train.drop(['Strength'],axis=1)
y_train = train['Strength']

In [5]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

In [6]:
def check_rmse(model,x_val,y_val):
    pred = model.predict(x_val)
    return np.sqrt(mean_squared_error(y_val,pred))

In [7]:
def check_r2(model,x_val,y_val):
    pred = model.predict(x_val)
    return (r2_score(y_val,pred))

In [8]:
X_train,X_val,Y_train,Y_val = train_test_split(x_train,y_train,random_state=26)

## Parameter tuning

In [9]:
def hyperParameterTuning(X_train, Y_train):
    param_tuning = {
        'objective': ['reg:squarederror'],
        'colsample_bytree': [0.5, 0.7],
        'subsample': [0.5, 0.99],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [2, 5,7],
        'min_child_weight': [0, 1, 2],
        'n_estimators' : [700, 1000]
    }

    xgb_model = XGBRegressor(tree_method='gpu_hist')

    gsearch = GridSearchCV(estimator = xgb_model,
                           param_grid = param_tuning,                        
                           scoring = 'neg_mean_absolute_error', #MAE
                           #scoring = 'neg_mean_squared_error',  #MSE
                           cv = 4,
                           n_jobs = -1,
                           verbose = 0)

    gsearch.fit(X_train,Y_train)

    return gsearch.best_params_


In [10]:
%%time
params = hyperParameterTuning(x_train, y_train)
params

CPU times: user 4.91 s, sys: 533 ms, total: 5.44 s
Wall time: 37min 25s


{'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'max_depth': 2,
 'min_child_weight': 2,
 'n_estimators': 700,
 'objective': 'reg:squarederror',
 'subsample': 0.5}

In [11]:
# You can modify add or delete the paramameters and their range in the function for getting best values for the dataset
# I have also added a model with parameter tuning done before hand as the more parameter you add, the more time it takes
model1 =  XGBRegressor(**params,tree_method='gpu_hist')

model2 =  XGBRegressor(learning_rate=0.01,
                           n_estimators=3000,
                           max_depth=5, 
                           min_child_weight=0,
                           gamma=0, 
                           subsample=0.7,                                
                           colsample_bytree=0.7,                                     
                           objective='reg:squarederror',                                
                           scale_pos_weight=1, 
                           seed=27,                                     
                           reg_alpha=0.00006,
                           tree_method='gpu_hist')

In [12]:
model1.fit(X_train, Y_train, early_stopping_rounds=50, eval_set=[(X_val, Y_val)], verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, enable_categorical=False,
             gamma=0, gpu_id=0, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=2, min_child_weight=2, missing=nan,
             monotone_constraints='()', n_estimators=700, n_jobs=2,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.5,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [13]:
model2.fit(X_train, Y_train, early_stopping_rounds=50, eval_set=[(X_val, Y_val)], verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
             gamma=0, gpu_id=0, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=5, min_child_weight=0, missing=nan,
             monotone_constraints='()', n_estimators=3000, n_jobs=2,
             num_parallel_tree=1, predictor='auto', random_state=27,
             reg_alpha=6e-05, reg_lambda=1, scale_pos_weight=1, seed=27,
             subsample=0.7, tree_method='gpu_hist', validate_parameters=1,
             verbosity=None)

In [14]:
print(f"Rmse for model 1 : {check_rmse(model1,X_val,Y_val)}")
print(f"Rmse for model 2 : {check_rmse(model2,X_val,Y_val)}")

Rmse for model 1 : 4.755855160235109
Rmse for model 2 : 4.551235108831921


In [15]:
print(f"R2 Score for model 1 : {check_r2(model1,X_val,Y_val)}")
print(f"R2 Score for model 2 : {check_r2(model2,X_val,Y_val)}")

R2 Score for model 1 : 0.9120688566767767
R2 Score for model 2 : 0.9194725359397369
