<a href="https://www.kaggle.com/code/ahmadsoliman94/concrete-strength-predction-92?scriptVersionId=95958752" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Make the dataset folder the working directory; the later `!ls` and the
# relative-name `pd.read_csv('Concrete_Data_Yeh.csv')` both rely on this.
# NOTE(review): the input tree is described above as read-only, so any file
# written with a relative path after this call would target a read-only folder.
os.chdir('/kaggle/input/yeh-concret-data/')


In [None]:
# Shell escape: list the contents of the current working directory.
!ls

## Import Data

In [None]:
# Read the dataset by relative file name (resolved against the current working directory).
df = pd.read_csv('Concrete_Data_Yeh.csv')
# Bare last expression so the notebook renders the frame.
df

In [None]:
# Show the column labels of the loaded frame.
df.columns

## Data Visualization


In [None]:
# Pairwise scatter plots / marginal distributions for every column of df.
# sns.pairplot is a figure-level function that builds its own figure, so the
# former plt.figure(figsize=(3,2)) call only produced a stray empty figure
# and has been removed.
sns.pairplot(df)
plt.show()


In [None]:
# One histogram per column of df (50 bins each) on a 20x15 inch canvas.
df.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations of df.
plt.figure(figsize=(10, 10))
corr_matrix = df.corr()
sns_plot = sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Line plot of the 'cement' column against the 'age' column of df.
sns.lineplot(data=df,x='age', y='cement')

In [None]:
# Box plot of 'cement', with one box per distinct 'age' value.
sns.boxplot(data=df,x='age', y='cement')

## Data Cleaning


### Remove unnecessary columns

In [None]:
# Column labels before the unneeded columns are dropped.
df.columns


In [None]:
# Remove the columns that are not used as features.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the columns are already gone.
df = df.drop(
    columns=['flyash', 'water', 'coarseaggregate', 'fineaggregate'],
    errors='ignore',
)

In [None]:
# Pairwise correlations of the columns that remain after the drop.
df.corr()


In [None]:
# Summary statistics for the remaining columns.
df.describe()


### Check for Missing Data

In [None]:
# Number of missing values per column, largest count first.
missing_per_column = df.isnull().sum().sort_values(ascending=False)
print(missing_per_column)

In [None]:
# Print the frame's column dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

## Over-Sampling Technique for Regression

In [None]:
# Build an oversampled copy of df with smogn.smoter, using 'csMPa' as the
# regression target; the result is kept separately as df_smogn.
# NOTE(review): this runs on the full frame, and the train/test split is only
# made later from df_smogn-derived arrays, so rows produced here can end up in
# the "oversampled" test split. Consider splitting first and oversampling the
# training part only — confirm intent.
import smogn
df_smogn = smogn.smoter(
    data=df,
    y='csMPa',
    k=9,
    samp_method = 'extreme',
    rel_thres = 0.80,         ## positive real number (0 < R < 1)
    rel_method = 'auto',      ## string ('auto' or 'manual')
    rel_xtrm_type = 'high',   ## string ('low' or 'both' or 'high')
    rel_coef = 1
)

In [None]:
# (rows, columns) of the oversampled frame, for comparison with df.shape.
df_smogn.shape

In [None]:
# Separate target and features of the oversampled frame.
y_res = df_smogn['csMPa'].values
X_res = df_smogn.drop(columns='csMPa')

print(f' X_shape: {X_res.shape} \n y_shape: {y_res.shape}')

In [None]:
# Separate target and features of the original (non-oversampled) frame.
y = df['csMPa'].values
X = df.drop(columns='csMPa')

print(f' X_shape: {X.shape} \n y_shape: {y.shape}')

## Data Scaling


In [None]:
# Standardization of the original features (fit the scaler, then apply it).
from sklearn.preprocessing import StandardScaler
scl = StandardScaler().fit(X)
X = scl.transform(X)

In [None]:
# Standardization of the oversampled features.
# A dedicated name (scl_res) is used so that this cell no longer rebinds `scl`
# and throws away the scaler fitted on the original features.
from sklearn.preprocessing import StandardScaler
scl_res = StandardScaler()
X_res = scl_res.fit_transform(X_res)

In [None]:
# Shape of the scaled original feature matrix.
X.shape

In [None]:
# Shape of the scaled oversampled feature matrix.
X_res.shape

## Split data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 split of the original data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
    shuffle=True,
)
print(f' X_train: {X_train.shape} & X_test: {X_test.shape}')
print(f' y_train: {y_train.shape} & y_test: {y_test.shape}')

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 split of the oversampled data with the same fixed seed.
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(
    X_res,
    y_res,
    test_size=0.20,
    random_state=0,
    shuffle=True,
)
print(f' X_train: {X_train_res.shape} & X_test: {X_test_res.shape}')
print(f' y_train: {y_train_res.shape} & y_test: {y_test_res.shape}')

## Model Selection


### RandomForest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline forest on the original data. A fixed random_state makes the
# reported scores repeatable across runs.
rf_reg = RandomForestRegressor(random_state=0)
rf_reg.fit(X_train, y_train)
y_pred_test = rf_reg.predict(X_test)
y_pred_train = rf_reg.predict(X_train)

# Baseline forest on the oversampled data. It gets its own estimator so that
# rf_reg remains the model fitted on the original data; previously rf_reg was
# refitted here and silently became the oversampled model.
rf_reg_res = RandomForestRegressor(random_state=0)
rf_reg_res.fit(X_train_res, y_train_res)
y_pred_test_res = rf_reg_res.predict(X_test_res)
y_pred_train_res = rf_reg_res.predict(X_train_res)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# R^2 (as a percentage) of the baseline predictions, original data first.
r2_train_pct = r2_score(y_train, y_pred_train) * 100
r2_test_pct = r2_score(y_test, y_pred_test) * 100
print('R Squared of Train befor Oversampling: {}'.format(r2_train_pct))
print('R Squared of Test  befor Oversampling: {}'.format(r2_test_pct))
print('********************************************************')
# Same metric for the predictions made on the oversampled splits.
r2_train_res_pct = r2_score(y_train_res, y_pred_train_res) * 100
r2_test_res_pct = r2_score(y_test_res, y_pred_test_res) * 100
print('R Squared of Train after Oversampling: {}'.format(r2_train_res_pct))
print('R Squared of Test after Oversampling: {}'.format(r2_test_res_pct))

### Use Grid Search CV

In [None]:
# Names of the hyper-parameters exposed by the forest (candidates for tuning).
rf_reg.get_params().keys()

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the forest's hyper-parameters.
parameters = dict(
    n_estimators=[30, 40, 50, 100],
    criterion=['squared_error', 'absolute_error'],
    max_depth=[*range(3, 10), 11],
    max_features=['sqrt', 'log2'],
)

# Exhaustive 5-fold search scored by R^2, for the original data.
grid_search = GridSearchCV(
    rf_reg,
    parameters,
    scoring='r2',
    cv=5,
    n_jobs=1,
    verbose=0,
)

In [None]:
# Fresh forest used as the template estimator for the oversampled-data search.
rf_reg_res = RandomForestRegressor()

# Candidate values for the forest's hyper-parameters.
parameters = dict(
    n_estimators=[30, 40, 50, 100],
    criterion=['squared_error', 'absolute_error'],
    max_depth=[*range(3, 10), 11],
    max_features=['sqrt', 'log2'],
)

# Exhaustive 5-fold search scored by R^2, for the oversampled data.
grid_search_res = GridSearchCV(
    rf_reg_res,
    parameters,
    scoring='r2',
    cv=5,
    n_jobs=1,
    verbose=0,
)

In [None]:
# Run the grid search on the original training split and report the best
# mean cross-validated R^2 as a percentage.
grid_search.fit(X_train, y_train)
best_r2_pct = grid_search.best_score_ * 100
print("best r2 score befor Oversampleing:" , best_r2_pct)
grid_search.best_params_

In [None]:
# Run the grid search on the oversampled training split and report the best
# mean cross-validated R^2 as a percentage.
grid_search_res.fit(X_train_res, y_train_res)
best_r2_res_pct = grid_search_res.best_score_ * 100
print("best r2 score after Oversampleing:" , best_r2_res_pct)
grid_search_res.best_params_

### Applying k-Fold Cross Validation


In [None]:
# Replace rf_reg with the best forest found by the grid search on the original
# data and fit it on the original training split.
# NOTE(review): if the search ran with GridSearchCV's default refit behaviour,
# best_estimator_ is presumably already fitted on X_train, which would make
# this second fit redundant — confirm.
rf_reg = grid_search.best_estimator_
rf_reg.fit(X_train, y_train)

In [None]:
# Replace rf_reg_res with the best forest found by the grid search on the
# oversampled data and fit it on the oversampled training split.
# NOTE(review): if the search ran with GridSearchCV's default refit behaviour,
# best_estimator_ is presumably already fitted on X_train_res, which would
# make this second fit redundant — confirm.
rf_reg_res = grid_search_res.best_estimator_
rf_reg_res.fit(X_train_res, y_train_res)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of each tuned forest on its own training split.
rf_reg_results = cross_val_score(rf_reg, X_train, y_train, cv=5)
rf_reg_results_res = cross_val_score(rf_reg_res, X_train_res, y_train_res, cv=5)

# Held-out predictions of each tuned forest on its own test split.
rf_reg_pred = rf_reg.predict(X_test)
rf_reg_res_pred = rf_reg_res.predict(X_test_res)

# Mean and spread of the fold scores, as percentages.
cv_summary = (rf_reg_results.mean()*100.0, rf_reg_results.std()*100.0)
cv_summary_res = (rf_reg_results_res.mean()*100.0, rf_reg_results_res.std()*100.0)
print("Validation score befor Oversampling: %.5f%% (%.5f%%)" % cv_summary)
print("Validation score after Oversampling: %.5f%% (%.5f%%)" % cv_summary_res)
print('********************************************************')

# Test-set R^2 of each model, as a percentage.
test_r2_pct = r2_score(y_test, rf_reg_pred) * 100
test_r2_res_pct = r2_score(y_test_res, rf_reg_res_pred) * 100
print('R Squared of Test  befor Oversampling: {}'.format(test_r2_pct))
print('R Squared of Test after Oversampling: {}'.format(test_r2_res_pct))

### Results Visualization

In [None]:
# True vs. predicted strength for the tuned forest on the original test split.
# plt.figure (lowercase) opens a new current figure with the requested size;
# the previous plt.Figure(...) call only constructed an unattached Figure
# object, so the figsize never applied to the plot.
plt.figure(figsize=(8,10))
ax = sns.regplot(x=y_test, y=rf_reg_pred,
                 scatter_kws={"color": "blue"}, line_kws={"color": "red"})
plt.title('RandomForest Regressor (Befor Oversampling)')
plt.xlabel('True')
plt.ylabel('Prediction')
plt.show()

In [None]:
# True vs. predicted strength for the tuned forest on the oversampled test split.
# plt.figure (lowercase) opens a new current figure with the requested size;
# the previous plt.Figure(...) call only constructed an unattached Figure
# object, so the figsize never applied to the plot.
plt.figure(figsize=(8,10))
ax = sns.regplot(x=y_test_res, y=rf_reg_res_pred,
                 scatter_kws={"color": "blue"}, line_kws={"color": "red"})
plt.title('RandomForest Regressor (After Oversampling)')
plt.xlabel('True')
# Axis label spelled the same way as in the "before oversampling" plot.
plt.ylabel('Prediction')
plt.show()

### XGB Regressor


In [None]:
import xgboost as xgb
# Default-configured XGBoost regressor intended for the original data.
xgb_reg = xgb.XGBRegressor()

# Second default-configured regressor intended for the oversampled data.
xgb_reg_res = xgb.XGBRegressor()

In [None]:
# Baseline XGBoost fit and predictions on the original data.
xgb_reg.fit(X_train, y_train)
y_pred_test = xgb_reg.predict(X_test)
y_pred_train = xgb_reg.predict(X_train)

# Baseline fit and predictions on the oversampled data. This uses xgb_reg_res,
# the estimator created for that purpose, instead of refitting xgb_reg, so that
# xgb_reg remains the model trained on the original data.
xgb_reg_res.fit(X_train_res, y_train_res)
y_pred_test_res = xgb_reg_res.predict(X_test_res)
y_pred_train_res = xgb_reg_res.predict(X_train_res)

In [None]:
# R^2 (as a percentage) of the baseline XGBoost predictions, original data first.
r2_train_pct = r2_score(y_train, y_pred_train) * 100
r2_test_pct = r2_score(y_test, y_pred_test) * 100
print('R Squared of Train befor Oversampling: {}'.format(r2_train_pct))
print('R Squared of Test  befor Oversampling: {}'.format(r2_test_pct))
print('********************************************************')
# Same metric for the predictions made on the oversampled splits.
r2_train_res_pct = r2_score(y_train_res, y_pred_train_res) * 100
r2_test_res_pct = r2_score(y_test_res, y_pred_test_res) * 100
print('R Squared of Train after Oversampling: {}'.format(r2_train_res_pct))
print('R Squared of Test after Oversampling: {}'.format(r2_test_res_pct))

In [None]:
# Names of the hyper-parameters exposed by the XGBoost regressor.
xgb_reg.get_params().keys()


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
# NOTE(review): 'gblinear' is a linear booster, so tree-specific settings such
# as max_depth presumably have no effect for those candidates — confirm
# whether it should stay in the search space.
parameters= {'booster':['gbtree','gblinear'],
             'base_score' : [0.5],
             'objective':['reg:squarederror'],
             'validate_parameters':[True],
             'max_depth':[6,7,8,9,11,13,15],
             'learning_rate':[0.01,0.03,0.1,0.3],
             'n_estimators':[100,200,300],
             'reg_alpha':[0,1,2,3],
             'reg_lambda':[0,1,2,3],
             'random_state': [42]
            }

# Randomized 5-fold search scored by R^2, for the original data.
# random_state fixes which candidates are drawn, so the selected parameters
# and every downstream score are repeatable; without it each run samples a
# different subset of the space.
rand = RandomizedSearchCV(estimator = xgb_reg,
                           param_distributions = parameters,
                           cv = 5,
                           scoring='r2',
                           n_jobs = 1,
                           random_state = 42,
                           verbose=0)


# Same search configuration for the oversampled data.
rand_res = RandomizedSearchCV(estimator = xgb_reg_res,
                           param_distributions = parameters,
                           cv = 5,
                           scoring='r2',
                           n_jobs = 1,
                           random_state = 42,
                           verbose=0)

In [None]:
# Run the randomized search on the original training split and report the
# best mean cross-validated R^2 as a percentage.
rand.fit(X_train, y_train)
best_r2_pct = rand.best_score_ * 100
print("best r2 score befor Oversampleing:" , best_r2_pct)
rand.best_params_

In [None]:
# Run the randomized search on the oversampled training split and report the
# best mean cross-validated R^2 as a percentage.
rand_res.fit(X_train_res, y_train_res)
best_r2_res_pct = rand_res.best_score_ * 100
print("best r2 score after Oversampleing:" , best_r2_res_pct)
rand_res.best_params_

In [None]:
# Replace xgb_reg with the best model found by the randomized search on the
# original data and fit it on the original training split.
# NOTE(review): if the search ran with its default refit behaviour,
# best_estimator_ is presumably already fitted on X_train — confirm whether
# this second fit is needed.
xgb_reg = rand.best_estimator_
xgb_reg.fit(X_train, y_train)

In [None]:
# Replace xgb_reg_res with the best model found by the randomized search on
# the oversampled data and fit it on the oversampled training split.
# NOTE(review): if the search ran with its default refit behaviour,
# best_estimator_ is presumably already fitted on X_train_res — confirm
# whether this second fit is needed.
xgb_reg_res = rand_res.best_estimator_
xgb_reg_res.fit(X_train_res, y_train_res)

In [None]:
# Held-out predictions and 5-fold CV scores for the tuned model (original data).
xgb_reg_pred = xgb_reg.predict(X_test)
xgb_reg_results = cross_val_score(xgb_reg, X_train, y_train, cv=5)

In [None]:
# Held-out predictions and 5-fold CV scores for the tuned model (oversampled data).
xgb_reg_res_pred = xgb_reg_res.predict(X_test_res)
xgb_reg_results_res = cross_val_score(xgb_reg_res, X_train_res, y_train_res, cv=5)

In [None]:
# Mean and spread of the fold scores, as percentages.
cv_summary = (xgb_reg_results.mean()*100.0, xgb_reg_results.std()*100.0)
cv_summary_res = (xgb_reg_results_res.mean()*100.0, xgb_reg_results_res.std()*100.0)
print("Validation score befor Oversampling: %.5f%% (%.5f%%)" % cv_summary)
print("Validation score after Oversampling: %.5f%% (%.5f%%)" % cv_summary_res)
print('********************************************************')
# Test-set R^2 of each tuned model, as a percentage.
test_r2_pct = r2_score(y_test, xgb_reg_pred) * 100
test_r2_res_pct = r2_score(y_test_res, xgb_reg_res_pred) * 100
print('R Squared of Test  befor Oversampling: {}'.format(test_r2_pct))
print('R Squared of Test after Oversampling: {}'.format(test_r2_res_pct))

In [None]:
# True vs. predicted strength for the tuned XGBoost model on the original test split.
# plt.figure (lowercase) opens a new current figure with the requested size;
# the previous plt.Figure(...) call only constructed an unattached Figure
# object, so the figsize never applied to the plot.
plt.figure(figsize=(8,10))
ax = sns.regplot(x=y_test, y=xgb_reg_pred,
                 scatter_kws={"color": "blue"}, line_kws={"color": "red"})
plt.title('XGB Regressor (Befor Oversampling)')
plt.xlabel('True')
plt.ylabel('Prediction')
plt.show()


In [None]:
# True vs. predicted strength for the tuned XGBoost model on the oversampled test split.
# plt.figure (lowercase) opens a new current figure with the requested size;
# the previous plt.Figure(...) call only constructed an unattached Figure
# object, so the figsize never applied to the plot.
plt.figure(figsize=(8,10))
ax = sns.regplot(x=y_test_res, y=xgb_reg_res_pred,
                 scatter_kws={"color": "blue"}, line_kws={"color": "red"})
plt.title('XGB Regressor (After Oversampling)')
plt.xlabel('True')
# Axis label spelled the same way as in the "before oversampling" plot.
plt.ylabel('Prediction')
plt.show()

### Voting Regressor

In [None]:
from sklearn.ensemble import VotingRegressor

# Ensemble of the tuned forest and tuned XGBoost model for the original data.
base_models = [('rf', rf_reg), ('xgb', xgb_reg)]
voting_reg = VotingRegressor(base_models, verbose=True)
voting_reg.fit(X_train, y_train)

# 5-fold CV on the training split, then predictions on the held-out split.
voting_reg_results = cross_val_score(voting_reg, X_train, y_train, cv=5)
voting_reg_pred = voting_reg.predict(X_test)

In [None]:
# CV mean/spread and test-set R^2 of the voting ensemble (original data), as percentages.
cv_summary = (voting_reg_results.mean()*100.0, voting_reg_results.std()*100.0)
print("Validation score befor Oversampling: %.5f%% (%.5f%%)" % cv_summary)
test_r2_pct = r2_score(y_test, voting_reg_pred) * 100
print('R Squared of Test befor Oversampling: {}'.format(test_r2_pct))

In [None]:
# Ensemble for the oversampled data. It must be built from the estimators
# tuned on the oversampled data (xgb_reg_res / rf_reg_res); the previous
# version passed xgb_reg / rf_reg, i.e. the configurations tuned on the
# original data, so the "after oversampling" ensemble used the wrong
# hyper-parameters.
voting_reg_res = VotingRegressor([('xgb',xgb_reg_res),('rf',rf_reg_res)])
voting_reg_res.fit(X_train_res,y_train_res)
# 5-fold CV on the oversampled training split, then held-out predictions.
voting_reg_results_res = cross_val_score(estimator = voting_reg_res, X = X_train_res, y = y_train_res, cv = 5)
voting_reg_pred_res = voting_reg_res.predict(X_test_res)

In [None]:
# CV mean/spread and test-set R^2 of the voting ensemble trained on the
# oversampled data. The first message previously read "befor Oversampling",
# mislabelling these oversampled-data results.
print("Validation score after Oversampling: %.5f%% (%.5f%%)" % (voting_reg_results_res.mean()*100.0, voting_reg_results_res.std()*100.0))
print('R Squared of Test after Oversampling: {}'.format(r2_score(y_test_res,voting_reg_pred_res)*100))

In [None]:
# Summary table of test-set R^2 (percent) for the three models trained on the
# oversampled data.
# Dedicated names are used on purpose: the previous version assigned to `xgb`
# (shadowing the `xgboost` module alias, so re-running the XGB cells failed)
# and to `df` (overwriting the dataset frame used by the earlier cells).
rf_r2 = r2_score(y_test_res,rf_reg_res_pred)*100
xgb_r2 = r2_score(y_test_res,xgb_reg_res_pred)*100
voting_r2 = r2_score(y_test_res,voting_reg_pred_res)*100
scores = [rf_r2,xgb_r2,voting_r2]
labels = ['Random Forest Regressor', 'XGB Regressor','Voting Regressor']
results_df = pd.DataFrame({'R2 Score': scores},index=labels)
results_df