# IMPORTING BASIC LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
import pickle

# LOADING THE DATA AND PEEKING INTO IT

In [2]:
con_data=pd.read_csv("concrete_data.csv")

In [3]:
con_data.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [4]:
con_data.shape

(1030, 9)

In [5]:
con_data.dtypes
# No need to use data formatting/ data encoding

cement                           float64
blast_furnace_slag               float64
fly_ash                          float64
water                            float64
superplasticizer                 float64
coarse_aggregate                 float64
fine_aggregate                   float64
age                                int64
concrete_compressive_strength    float64
dtype: object

In [6]:
con_data.columns

Index(['cement', 'blast_furnace_slag', 'fly_ash', 'water', 'superplasticizer',
       'coarse_aggregate', 'fine_aggregate ', 'age',
       'concrete_compressive_strength'],
      dtype='object')

In [7]:
con_data.describe()
# gives a statistical outlook of the data

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


# DATA PREPROCESSING

In [8]:
con_data.isnull().sum()
# checking for null values if any

cement                           0
blast_furnace_slag               0
fly_ash                          0
water                            0
superplasticizer                 0
coarse_aggregate                 0
fine_aggregate                   0
age                              0
concrete_compressive_strength    0
dtype: int64

In [9]:
q1=con_data.quantile(0.25)
q3=con_data.quantile(0.75)
IQR=q3-q1
print(IQR)
# checking for outliers using IQR for each column in the dataset

cement                           157.625
blast_furnace_slag               142.950
fly_ash                          118.300
water                             27.100
superplasticizer                  10.200
coarse_aggregate                  97.400
fine_aggregate                    93.050
age                               49.000
concrete_compressive_strength     22.425
dtype: float64


In [10]:
a = (con_data < (q1 - 1.5 * IQR)) |(con_data > (q3 + 1.5 * IQR))
print(a)
# The values which indicates True are the outliers

      cement  blast_furnace_slag  fly_ash  water  superplasticizer  \
0      False               False    False  False             False   
1      False               False    False  False             False   
2      False               False    False  False             False   
3      False               False    False  False             False   
4      False               False    False  False             False   
...      ...                 ...      ...    ...               ...   
1025   False               False    False  False             False   
1026   False               False    False  False             False   
1027   False               False    False  False             False   
1028   False               False    False  False             False   
1029   False               False    False  False             False   

      coarse_aggregate  fine_aggregate     age  concrete_compressive_strength  
0                False            False  False                           True  

In [11]:
con_data_IQR = con_data[~((con_data < (q1 - 1.5 * IQR)) |(con_data > (q3 + 1.5 * IQR))).any(axis=1)]
#removed the outliers (~ is used as bitwise NOT and | is used as bitwise OR)
con_data_IQR.shape
# shape of the dataset after removing the outliers

(941, 9)

# SEPERATING FEATURES AND LABEL

In [12]:
X=con_data_IQR.drop(["concrete_compressive_strength"],axis=1)
# preparing a seperate group for independant variables or features by dropping the label column

In [13]:
X.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90
7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28
8,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28
9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,28


In [14]:
y=con_data_IQR[["concrete_compressive_strength"]]
# don't forget to use two brackets to gather the data into root dataset

In [15]:
y.head()

Unnamed: 0,concrete_compressive_strength
1,61.89
5,47.03
7,36.45
8,45.85
9,39.29


In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33, random_state=42)
#using train_test split with test size of one third to the train data

# HYPERPARAMETER TUNING

In [17]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


# MODEL CREATION 

In [18]:
#create a base algorithm model
reg_model=RandomForestRegressor()
#create a final model using the base model as estimator
rf_random = RandomizedSearchCV(estimator = reg_model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
#fitting the data to final model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [19]:
# This will generate the best values for the parameters
rf_random.best_params_

{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [20]:
# checking the values of predictions made by the model and storing it into y_predictions
y_predictions=rf_random.predict(X_test)

In [21]:
# decide the performance criteria for the model to check it's accuracy
mean_absolute_error(y_predictions,y_test)

3.9838710467568554

# CHECKING FOR NEW INPUTS

In [22]:
# Testing the model for new inputs
# cement=float(input("Enter the quantity of cement in Kg: "))
# blast_furnace_slag=float(input("Enter the quantity of BFS in Kg: "))
# fly_ash=float(input("Enter the quantity of Fly Ash in Kg: "))
# water=float(input("Enter the quantity of water in Kg: "))
# superplasticizer=float(input("Enter the quantity of SP in Kg: "))
# coarse_aggregate=float(input("Enter the quantity of CA in Kg: "))
# fine_aggregate=float(input("Enter the quantity of FA in Kg: "))
# age=int(input("Curing period in days: "))
# variables_list=[cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age]
# z=[variables_list,]
# y_pred=rf_random.predict(z)
# print("The value of compressive strength is: ",y_pred)


# SAVING THE MODEL IN A PICKLE FILE

In [23]:
model1_rf = 'finalized_model_RF.pkl'
pickle.dump(rf_random, open(model1_rf, 'wb'))