In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score,mean_absolute_error , mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


In [3]:
df=pd.read_csv(r'data/cement_data_cleaned.csv')
df.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


In [4]:
df.columns

Index(['Cement (component 1)(kg in a m^3 mixture)',
       'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
       'Fly Ash (component 3)(kg in a m^3 mixture)',
       'Water  (component 4)(kg in a m^3 mixture)',
       'Superplasticizer (component 5)(kg in a m^3 mixture)',
       'Coarse Aggregate  (component 6)(kg in a m^3 mixture)',
       'Fine Aggregate (component 7)(kg in a m^3 mixture)', 'Age (day)',
       'Concrete compressive strength(MPa, megapascals) '],
      dtype='object')

In [5]:
df.describe()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
count,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0
mean,278.629055,72.043134,55.535075,182.074378,6.031647,974.376468,772.686617,45.856716,35.250273
std,104.345003,86.170555,64.207448,21.34074,5.919559,77.579534,80.339851,63.734692,16.284808
min,102.0,0.0,0.0,121.75,0.0,801.0,594.0,1.0,2.331808
25%,190.68,0.0,0.0,166.61,0.0,932.0,724.3,7.0,23.523542
50%,265.0,20.0,0.0,185.7,6.1,968.0,780.0,28.0,33.798114
75%,349.0,142.5,118.27,192.94,10.0,1031.0,822.2,56.0,44.86834
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.599225


In [6]:
df.rename(columns={'Cement (component 1)(kg in a m^3 mixture)':"cement",
                   'Blast Furnace Slag (component 2)(kg in a m^3 mixture)': "blast_furance_slag",
                    'Fly Ash (component 3)(kg in a m^3 mixture)': "fly_ash",
                    'Water  (component 4)(kg in a m^3 mixture)': "water",
                    'Superplasticizer (component 5)(kg in a m^3 mixture)':"superplasticizer",
                    'Coarse Aggregate  (component 6)(kg in a m^3 mixture)' : "coarse_aggregate",
                    'Fine Aggregate (component 7)(kg in a m^3 mixture)':"fine_aggregate",
                    'Age (day)':"age_in_day",
                    'Concrete compressive strength(MPa, megapascals) ':"concrete_compressive_strenth"},inplace=True)


In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
df.to_csv(r"data\renamed_cement_data.csv",index=False)

In [9]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [10]:
X.shape,y.shape

((1005, 8), (1005,))

In [11]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)

In [12]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1005 entries, 0 to 1004
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   cement              1005 non-null   float64
 1   blast_furance_slag  1005 non-null   float64
 2   fly_ash             1005 non-null   float64
 3   water               1005 non-null   float64
 4   superplasticizer    1005 non-null   float64
 5   coarse_aggregate    1005 non-null   float64
 6   fine_aggregate      1005 non-null   float64
 7   age_in_day          1005 non-null   int64  
dtypes: float64(7), int64(1)
memory usage: 62.9 KB


In [13]:
list(df.columns)

['cement',
 'blast_furance_slag',
 'fly_ash',
 'water',
 'superplasticizer',
 'coarse_aggregate',
 'fine_aggregate',
 'age_in_day',
 'concrete_compressive_strenth']

In [14]:
target_column = df.columns[-1]
print(target_column)


concrete_compressive_strenth


Here all feature is numerical so only numerical pipeline create here


In [15]:
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

In [16]:
num_col=X.select_dtypes(exclude='object').columns
num_col.to_list

<bound method IndexOpsMixin.tolist of Index(['cement', 'blast_furance_slag', 'fly_ash', 'water', 'superplasticizer',
       'coarse_aggregate', 'fine_aggregate', 'age_in_day'],
      dtype='object')>

In [17]:
num_pipeline=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)

cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [18]:
preprocessor=ColumnTransformer([
    ('num_pipeline',num_pipeline,num_cols),
    ('cat_pipeline',cat_pipeline,cat_cols)
])


In [19]:
import warnings
warnings.filterwarnings('ignore')

X_train= pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())

X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [20]:
X_train.head()

Unnamed: 0,num_pipeline__cement,num_pipeline__blast_furance_slag,num_pipeline__fly_ash,num_pipeline__water,num_pipeline__superplasticizer,num_pipeline__coarse_aggregate,num_pipeline__fine_aggregate,num_pipeline__age_in_day
0,-1.078725,2.136608,-0.861406,0.021226,1.154937,-1.446165,0.696974,-0.277746
1,-0.848293,-0.855616,1.097172,-0.957018,0.635153,1.480293,0.396097,-0.277746
2,-1.220733,-0.855616,2.033122,-0.718816,1.654731,1.300184,-1.139592,-0.277746
3,-0.578383,-0.855616,1.080744,-1.112426,0.876719,1.36022,0.301434,-0.671365
4,0.5023,-0.855616,-0.861406,0.4375,-1.010833,-0.523903,0.889258,0.698429


In [21]:
models={
                "Linear Regression":LinearRegression(),
                "Lasso":Lasso(),
                "Ridge":Ridge(),
                "Decision Tree":DecisionTreeRegressor(),
                "Random Forest Regressor":RandomForestRegressor(),
                "Adaboost Regressor":AdaBoostRegressor(),
                "GradiantBoost Regressor":GradientBoostingRegressor(),
                'XGBRegressor':XGBRegressor(),
                'CatBoostRegressor':CatBoostRegressor()

            }


In [22]:
def evalute_models(true,predicted):
    r2 = r2_score(true, predicted)
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    return r2, mae, rmse

## here we can see that Random forest regressor is best performance so we can use that for model
### and define parameters for hyper parameter tuning



In [23]:
param = {

                "Linear Regression": {
                #"fit_intercept": [True, False]
                },

                "Lasso": {
                    "alpha": [0.001, 0.01, 0.1, 1, 10],
                    #"max_iter": [1000, 5000]
                },

                "Ridge": {
                    "alpha": [0.1, 1, 10, 50],
                    "solver": ["auto", "svd", "cholesky"]
                },

                "Decision Tree": {
                    "criterion": ["squared_error", "friedman_mse"],
                    "max_depth": [None, 5,3, 10, 20],
                   # "min_samples_split": [2, 5, 10],
                    #"min_samples_leaf": [1, 2, 4]
                },

                "Random Forest Regressor": {
                    "n_estimators": [100, 200, 300],
                    "max_depth": [None, 10, 20],
                    #"min_samples_split": [2, 5],
                    #"min_samples_leaf": [1, 2],
                    "max_features": ["sqrt", "log2"]
                },

                "Adaboost Regressor": {
                    "n_estimators": [50, 100, 200],
                    "learning_rate": [0.01, 0.1, 1]
                },

                "GradiantBoost Regressor": {
                    "n_estimators": [100, 200],
                    "learning_rate": [0.01, 0.1],
                    "max_depth": [3, 5],
                    #"subsample": [0.8, 1.0]
                },

            
                'XGBRegressor' : {
                    "n_estimators": [200, 400],
                    #"learning_rate": [0.03, 0.05, 0.1],
                    "max_depth": [4, 6, 8],
                    #"subsample": [0.7, 0.8, 1.0],
                    #"colsample_bytree": [0.7, 0.8, 1.0],
                    "reg_lambda": [1, 5, 10]    
                },
                'CatBoostRegressor':{
                    #"iterations": [300, 500],
                    #"learning_rate": [0.03, 0.05, 0.1],
                    "depth": [4, 6, 8],
                    "l2_leaf_reg": [1, 3, 5, 7]
                }
                
            }

In [24]:

def evaluate_model(X_train,y_train,X_test,y_test,models,param):
        report={}
        for i in range(len(models)):
            model=list(models.values())[i]
            para=param[list(models.keys())[i]]
            #train model
            gs=GridSearchCV(model,para,cv=3)
            gs.fit(X_train,y_train)

            best_model = gs.best_estimator_

            y_test_pred = best_model.predict(X_test)

            

            y_train_pred=best_model.predict(X_train)
            

            train_model_score=r2_score(y_train,y_train_pred)
            test_model_score=r2_score(y_test,y_test_pred)

            report[list(models.keys())[i]] = test_model_score,train_model_score
        
        return report

In [25]:
model_report:dict=evaluate_model(X_train,y_train,X_test,y_test,models,param) 

0:	learn: 16.6008541	total: 160ms	remaining: 2m 39s
1:	learn: 16.2872317	total: 164ms	remaining: 1m 21s
2:	learn: 16.0416633	total: 165ms	remaining: 55s
3:	learn: 15.7991729	total: 167ms	remaining: 41.5s
4:	learn: 15.5376672	total: 168ms	remaining: 33.4s
5:	learn: 15.2860093	total: 169ms	remaining: 27.9s
6:	learn: 15.0545903	total: 170ms	remaining: 24.1s
7:	learn: 14.8270319	total: 171ms	remaining: 21.2s
8:	learn: 14.5858619	total: 172ms	remaining: 18.9s
9:	learn: 14.3740057	total: 173ms	remaining: 17.1s
10:	learn: 14.1439591	total: 174ms	remaining: 15.6s
11:	learn: 13.9561540	total: 175ms	remaining: 14.4s
12:	learn: 13.7587866	total: 177ms	remaining: 13.4s
13:	learn: 13.5497184	total: 178ms	remaining: 12.6s
14:	learn: 13.3646835	total: 180ms	remaining: 11.8s
15:	learn: 13.1887902	total: 181ms	remaining: 11.1s
16:	learn: 13.0059236	total: 182ms	remaining: 10.5s
17:	learn: 12.8255197	total: 183ms	remaining: 9.98s
18:	learn: 12.6614556	total: 185ms	remaining: 9.53s
19:	learn: 12.4978329	

In [26]:
print(model_report)

{'Linear Regression': (0.6422265413538712, 0.5944481911450792), 'Lasso': (0.6422284485537224, 0.5944477940457993), 'Ridge': (0.6422267859610482, 0.5944479109772125), 'Decision Tree': (0.8100073073094999, 0.9974603105935858), 'Random Forest Regressor': (0.9136606545960964, 0.9856596853642272), 'Adaboost Regressor': (0.7685227974930195, 0.8135266882300627), 'GradiantBoost Regressor': (0.899768878306472, 0.9938581375471723), 'XGBRegressor': (0.9262877497037159, 0.9949325224722657), 'CatBoostRegressor': (0.9402698890088388, 0.9888636094169138)}


In [28]:
best_model_score=max(sorted(model_report.values()))
best_model_name=list(model_report.keys())[list(model_report.values()).index(best_model_score)]
best_model=models[best_model_name]

print(f"best model is :{best_model_name},R2 score : {best_model_score}")
print("="*92)

best model is :CatBoostRegressor,R2 score : (0.9402698890088388, 0.9888636094169138)
