In [1]:
import pandas as pd

In [2]:
df= pd.read_csv('./data/concrete_data.csv')
df.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
df.columns = df.columns.str.strip()

In [4]:
df.drop_duplicates(inplace=True)
df.reset_index()

Unnamed: 0,index,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...,...
1000,1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1001,1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1002,1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1003,1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [5]:
# Dividing in Independent and Dependent Features
X= df.drop(labels=['concrete_compressive_strength'], axis=1)
y= df[['concrete_compressive_strength']]

In [6]:
num_cols= ['cement', 'blast_furnace_slag', 'fly_ash', 'water', 'superplasticizer',
       'coarse_aggregate', 'fine_aggregate', 'age']

In [7]:
from sklearn.impute import SimpleImputer # Handlimg Missing Values
from sklearn.preprocessing import StandardScaler # Handling feature scaling

# Pipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
# Pipeline
pipeline= Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

preprocessor= ColumnTransformer([
    ('pipeline', pipeline, num_cols)
])

In [9]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=30)

In [10]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [11]:
X_train.head()

Unnamed: 0,pipeline__cement,pipeline__blast_furnace_slag,pipeline__fly_ash,pipeline__water,pipeline__superplasticizer,pipeline__coarse_aggregate,pipeline__fine_aggregate,pipeline__age
0,0.22201,-0.811359,-0.874463,0.957626,-1.029902,-0.025451,0.570809,-0.284792
1,-1.408323,1.631978,-0.874463,0.620847,-0.501194,-0.344202,0.296987,-0.284792
2,0.327224,1.054272,-0.874463,-0.214179,0.607385,-0.769204,-0.154756,-0.284792
3,2.37455,-0.811359,-0.874463,0.311749,-1.029902,1.931114,-2.00337,3.703365
4,-0.895767,0.631163,-0.874463,0.159507,-1.029902,1.392087,-0.094187,-0.284792


In [12]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


In [13]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [23]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [27]:
## Train multiple models
## Model Evaluation
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'RandomForestRegressor':RandomForestRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
RMSE: 11.794428661838294
MAE: 8.92858363828157
R2 score 53.06317314026151


Lasso
Model Training Performance
RMSE: 12.367006203849241
MAE: 9.826869207913221
R2 score 48.39532259919429


Ridge
Model Training Performance
RMSE: 11.802850342465044
MAE: 8.938731150605383
R2 score 52.99611977092687


Elasticnet
Model Training Performance
RMSE: 12.767459368070684
MAE: 10.301843032014784
R2 score 44.99921614496645




  return fit_method(estimator, *args, **kwargs)


RandomForestRegressor
Model Training Performance
RMSE: 5.418145868709413
MAE: 3.6009784683724217
R2 score 90.09484453700084


DecisionTreeRegressor
Model Training Performance
RMSE: 5.672964655722317
MAE: 3.88589552238806
R2 score 89.14124421874757




In [16]:
from sklearn.model_selection import GridSearchCV
reg= RandomForestRegressor()

In [17]:
parms= {
    'criterion':[ 'squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features':['sqrt', 'log2', None],
    'max_depth': [1,2,3,4,5]
}

In [18]:
cv = GridSearchCV(reg, param_grid=parms, cv=5, scoring='r2')
cv.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

In [19]:
y_pred=cv.predict(X_test)

In [20]:
cv.best_score_

0.8083770798801734

In [21]:
r2_score(y_test,y_pred)

0.8344791546663489

In [22]:
cv.best_params_

{'criterion': 'friedman_mse', 'max_depth': 5, 'max_features': None}

In [28]:
dtreg= DecisionTreeRegressor()

In [29]:
parms= {
    'criterion':[ 'squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'max_features':['sqrt', 'log2', None],
    'max_depth': [1,2,3,4,5],
    'splitter': ['best', 'random']
}

In [30]:
cv = GridSearchCV(dtreg, param_grid=parms, cv=5, scoring='r2')
cv.fit(X_train, y_train)

In [31]:
y_pred=cv.predict(X_test)

In [32]:
cv.best_score_

0.6757207378944232

In [33]:
r2_score(y_test,y_pred)

0.7026128298951284

In [34]:
cv.best_params_

{'criterion': 'poisson',
 'max_depth': 5,
 'max_features': None,
 'splitter': 'best'}