In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt

## Importing Dataset

In [2]:
train = pd.read_csv("../input/eda-concrete-strength/Filtered_dataset.csv")
train

Unnamed: 0.1,Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
1,5,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90,47.03
2,7,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28,36.45
3,8,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
4,9,475.0,0.0,0.0,228.0,0.0,932.0,594.0,28,39.29
...,...,...,...,...,...,...,...,...,...,...
906,1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
907,1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
908,1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
909,1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [3]:
train.drop(['Unnamed: 0'],axis=1,inplace=True)
train

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
1,266.0,114.0,0.0,228.0,0.0,932.0,670.0,90,47.03
2,380.0,95.0,0.0,228.0,0.0,932.0,594.0,28,36.45
3,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
4,475.0,0.0,0.0,228.0,0.0,932.0,594.0,28,39.29
...,...,...,...,...,...,...,...,...,...
906,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
907,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
908,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
909,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [4]:
x = train.drop(['Strength'],axis=1)
y = train['Strength']

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x = scaler.fit_transform(x)

In [6]:
# splitting the dataset into train and test dataset with 4:1 ratio (80%-20%)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .2, random_state = 26)

## Training on different algorithms

### Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression

# Create instance of model
lreg = LinearRegression()
# Pass training data into model
lreg.fit(x_train, y_train)

LinearRegression()

In [8]:
# Getting prediciton on x_test
y_pred_lreg = lreg.predict(x_test)

In [9]:
def rmse(y,y_pred):
    return (np.sqrt(mse(y,y_pred)))

In [10]:
# Scoring our model
print('Linear Regression')
# Root mean square error of our model
print('--'*50)
linreg_error = rmse(y_test, y_pred_lreg)
print('RMSE = ', linreg_error)


Linear Regression
----------------------------------------------------------------------------------------------------
RMSE =  8.453318904436872


### RBF SUPPORT VECTOR REGRESSOR

In [11]:
%%time
from sklearn.svm import SVR

svr = SVR(kernel = 'rbf')

# Fit the model on training data
svr.fit(x_train, y_train)

CPU times: user 36.2 ms, sys: 1 ms, total: 37.2 ms
Wall time: 36.5 ms


SVR()

In [12]:
# Getting the predictions for x_test
y_pred_svr = svr.predict(x_test)

In [13]:
print('Support Vector Classifier')
# Root mean square error of our model
print('--'*50)
svr_error = rmse(y_test, y_pred_svr)
print('RMSE = ', svr_error)

Support Vector Classifier
----------------------------------------------------------------------------------------------------
RMSE =  8.749276052031043


### Decision Tree - Regression

In [14]:
%%time
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
# Fit new DT on training data
dtr.fit(x_train, y_train)

CPU times: user 24.1 ms, sys: 10.6 ms, total: 34.7 ms
Wall time: 84.9 ms


DecisionTreeRegressor()

In [15]:
# Predict DTR
y_pred_dtr = dtr.predict(x_test)

In [16]:
print('Decision Tree')
# Root mean square error of our model
print('--'*50)
dtr_error = rmse(y_test, y_pred_dtr)
print('RMSE = ', dtr_error)

Decision Tree
----------------------------------------------------------------------------------------------------
RMSE =  7.02070229077816


### RANDOM FOREST

In [17]:
from sklearn.ensemble import RandomForestRegressor

# Create model object
rfr = RandomForestRegressor(n_estimators = 250,n_jobs=-1)
# Fit model to training data
rfr.fit(x_train,y_train)
y_pred_rfr = rfr.predict(x_test)

In [18]:
print('Random Forest')
# Root mean square error of our model
print('--'*50)
rfr_error = rmse(y_test, y_pred_rfr)
print('RMSE = ', rfr_error)

Random Forest
----------------------------------------------------------------------------------------------------
RMSE =  5.659124247827314


### XGBoost Regressor

In [19]:
from xgboost import XGBRegressor

# Create model object
xgb = XGBRegressor(n_jobs=-1)

# Fit model to training data
xgb.fit(x_train, y_train)
y_pred_xgb = xgb.predict(x_test)

In [20]:
print('XGBoost Classifer')
print('--'*50)
xgb_error = rmse(y_test, y_pred_xgb)
print('RMSE = ', xgb_error)

XGBoost Classifer
----------------------------------------------------------------------------------------------------
RMSE =  5.294385434689728


In [21]:
models = pd.DataFrame({
     'Model': ['Linear Regression', 'RBF SVC', 
               'Decision Tree', 'Random Forest','XGBoost Regressor'],
    'Score': [linreg_error, svr_error, 
               dtr_error, rfr_error,xgb_error]})
models.sort_values(by='Score', ascending=True)

Unnamed: 0,Model,Score
4,XGBoost Regressor,5.294385
3,Random Forest,5.659124
2,Decision Tree,7.020702
0,Linear Regression,8.453319
1,RBF SVC,8.749276
