In [3]:
import pandas as pd
import numpy as np
from itertools import combinations
from statistics import mean,stdev

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import KFold

import pickle

In [45]:
df = pd.read_csv('Concrete_Data_Yeh.csv')

In [46]:
pd.set_option('display.max_columns',None)

In [47]:
df.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [48]:
target = df['csMPa'].copy()
df.drop('csMPa',axis=1,inplace=True)

In [49]:
for cols in df.columns:
    print(str(df[cols].min()) + '  ' + str(df[cols].max()))

102.0  540.0
0.0  359.4
0.0  200.1
121.8  247.0
0.0  32.2
801.0  1145.0
594.0  992.6
1  365


In [50]:
col_pairwise_combos = list(combinations(df.columns[:-1],2))

In [51]:
for a,b in col_pairwise_combos:
    if df[a].min() != 0.0 and df[b].min() != 0:
        df[str(a) + '/' + str(b)] = df[a]/df[b]
    elif df[a].min() == 0 and df[b].min() != 0:
        df[str(a) + '/' + str(b)] = df[a]/df[b]
    elif df[a].min() != 0 and df[b].min() == 0:
        df[str(b) + '/' + str(a)] = df[b]/df[a]

In [52]:
df

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,slag/cement,flyash/cement,cement/water,superplasticizer/cement,cement/coarseaggregate,cement/fineaggregate,slag/water,slag/coarseaggregate,slag/fineaggregate,flyash/water,flyash/coarseaggregate,flyash/fineaggregate,superplasticizer/water,water/coarseaggregate,water/fineaggregate,superplasticizer/coarseaggregate,superplasticizer/fineaggregate,coarseaggregate/fineaggregate
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,0.000000,0.000000,3.333333,0.004630,0.519231,0.798817,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.015432,0.155769,0.239645,0.002404,0.003698,1.538462
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,0.000000,0.000000,3.333333,0.004630,0.511848,0.798817,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.015432,0.153555,0.239645,0.002370,0.003698,1.560651
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,0.428571,0.000000,1.458333,0.000000,0.356760,0.559764,0.625000,0.152897,0.239899,0.000000,0.000000,0.000000,0.000000,0.244635,0.383838,0.000000,0.000000,1.569024
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,0.428571,0.000000,1.458333,0.000000,0.356760,0.559764,0.625000,0.152897,0.239899,0.000000,0.000000,0.000000,0.000000,0.244635,0.383838,0.000000,0.000000,1.569024
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,0.666667,0.000000,1.034375,0.000000,0.202984,0.240581,0.689583,0.135323,0.160388,0.000000,0.000000,0.000000,0.000000,0.196239,0.232586,0.000000,0.000000,1.185221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,0.419682,0.326700,1.538976,0.032200,0.317665,0.359755,0.645880,0.133318,0.150983,0.502784,0.103781,0.117532,0.049555,0.206413,0.233763,0.010229,0.011584,1.132500
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,0.000000,0.358783,1.643878,0.032278,0.393936,0.396115,0.000000,0.000000,0.000000,0.589796,0.141338,0.142119,0.053061,0.239638,0.240964,0.012715,0.012786,1.005532
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,0.938721,0.731313,0.770628,0.041077,0.166405,0.190385,0.723404,0.156208,0.178718,0.563570,0.121694,0.139231,0.031655,0.215935,0.247051,0.006835,0.007821,1.144103
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,1.173476,0.000000,0.906036,0.071025,0.160772,0.201673,1.063212,0.188662,0.236659,0.000000,0.000000,0.000000,0.064351,0.177445,0.222588,0.011419,0.014324,1.254405


In [53]:
for cols in df.columns:
    if df[cols].min() != 0:
        df['1/('+str(cols) + ')'] = 1/df[cols]
        df['ln ' + str(cols)] = np.log(df[cols])
df

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,slag/cement,flyash/cement,cement/water,superplasticizer/cement,cement/coarseaggregate,cement/fineaggregate,slag/water,slag/coarseaggregate,slag/fineaggregate,flyash/water,flyash/coarseaggregate,flyash/fineaggregate,superplasticizer/water,water/coarseaggregate,water/fineaggregate,superplasticizer/coarseaggregate,superplasticizer/fineaggregate,coarseaggregate/fineaggregate,1/(cement),ln cement,1/(water),ln water,1/(coarseaggregate),ln coarseaggregate,1/(fineaggregate),ln fineaggregate,1/(age),ln age,1/(cement/water),ln cement/water,1/(cement/coarseaggregate),ln cement/coarseaggregate,1/(cement/fineaggregate),ln cement/fineaggregate,1/(water/coarseaggregate),ln water/coarseaggregate,1/(water/fineaggregate),ln water/fineaggregate,1/(coarseaggregate/fineaggregate),ln coarseaggregate/fineaggregate
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,0.000000,0.000000,3.333333,0.004630,0.519231,0.798817,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.015432,0.155769,0.239645,0.002404,0.003698,1.538462,0.001852,6.291569,0.006173,5.087596,0.000962,6.946976,0.001479,6.516193,0.035714,3.332205,0.300000,1.203973,1.925926,-0.655407,1.251852,-0.224624,6.419753,-1.859380,4.172840,-1.428597,0.650000,0.430783
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,0.000000,0.000000,3.333333,0.004630,0.511848,0.798817,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.015432,0.153555,0.239645,0.002370,0.003698,1.560651,0.001852,6.291569,0.006173,5.087596,0.000948,6.961296,0.001479,6.516193,0.035714,3.332205,0.300000,1.203973,1.953704,-0.669727,1.251852,-0.224624,6.512346,-1.873700,4.172840,-1.428597,0.640758,0.445103
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,0.428571,0.000000,1.458333,0.000000,0.356760,0.559764,0.625000,0.152897,0.239899,0.000000,0.000000,0.000000,0.000000,0.244635,0.383838,0.000000,0.000000,1.569024,0.003008,5.806640,0.004386,5.429346,0.001073,6.837333,0.001684,6.386879,0.003704,5.598422,0.685714,0.377294,2.803008,-1.030693,1.786466,-0.580239,4.087719,-1.407987,2.605263,-0.957534,0.637339,0.450453
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,0.428571,0.000000,1.458333,0.000000,0.356760,0.559764,0.625000,0.152897,0.239899,0.000000,0.000000,0.000000,0.000000,0.244635,0.383838,0.000000,0.000000,1.569024,0.003008,5.806640,0.004386,5.429346,0.001073,6.837333,0.001684,6.386879,0.002740,5.899897,0.685714,0.377294,2.803008,-1.030693,1.786466,-0.580239,4.087719,-1.407987,2.605263,-0.957534,0.637339,0.450453
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,0.666667,0.000000,1.034375,0.000000,0.202984,0.240581,0.689583,0.135323,0.160388,0.000000,0.000000,0.000000,0.000000,0.196239,0.232586,0.000000,0.000000,1.185221,0.005035,5.291293,0.005208,5.257495,0.001022,6.885919,0.001211,6.715989,0.002778,5.886104,0.966767,0.033797,4.926485,-1.594626,4.156596,-1.424697,5.095833,-1.628423,4.299479,-1.458494,0.843724,0.169929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,0.419682,0.326700,1.538976,0.032200,0.317665,0.359755,0.645880,0.133318,0.150983,0.502784,0.103781,0.117532,0.049555,0.206413,0.233763,0.010229,0.011584,1.132500,0.003618,5.621849,0.005568,5.190732,0.001149,6.768608,0.001302,6.644180,0.035714,3.332205,0.649783,0.431117,3.147974,-1.146759,2.779667,-1.022331,4.844655,-1.577876,4.277840,-1.453448,0.883002,0.124428
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,0.000000,0.358783,1.643878,0.032278,0.393936,0.396115,0.000000,0.000000,0.000000,0.589796,0.141338,0.142119,0.053061,0.239638,0.240964,0.012715,0.012786,1.005532,0.003104,5.775172,0.005102,5.278115,0.001223,6.706740,0.001229,6.701223,0.035714,3.332205,0.608318,0.497058,2.538485,-0.931568,2.524519,-0.926051,4.172959,-1.428625,4.150000,-1.423108,0.994498,0.005517
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,0.938721,0.731313,0.770628,0.041077,0.166405,0.190385,0.723404,0.156208,0.178718,0.563570,0.121694,0.139231,0.031655,0.215935,0.247051,0.006835,0.007821,1.144103,0.006734,5.000585,0.005189,5.261135,0.001121,6.793914,0.001282,6.659294,0.035714,3.332205,1.297643,-0.260550,6.009428,-1.793330,5.252525,-1.658709,4.631033,-1.532780,4.047743,-1.398159,0.874048,0.134621
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,1.173476,0.000000,0.906036,0.071025,0.160772,0.201673,1.063212,0.188662,0.236659,0.000000,0.000000,0.000000,0.064351,0.177445,0.222588,0.011419,0.014324,1.254405,0.006285,5.069533,0.005695,5.168209,0.001011,6.897301,0.001268,6.670640,0.035714,3.332205,1.103708,-0.098676,6.219987,-1.827768,4.958517,-1.601107,5.635535,-1.729092,4.492597,-1.502431,0.797191,0.226661


In [54]:
scaler = StandardScaler()
scaler.fit(df)
pickle.dump(scaler, open('scaler.sav', 'wb'))
scaled_data = pd.DataFrame(scaler.fit_transform(df),columns = df.columns)
scaled_data

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,slag/cement,flyash/cement,cement/water,superplasticizer/cement,cement/coarseaggregate,cement/fineaggregate,slag/water,slag/coarseaggregate,slag/fineaggregate,flyash/water,flyash/coarseaggregate,flyash/fineaggregate,superplasticizer/water,water/coarseaggregate,water/fineaggregate,superplasticizer/coarseaggregate,superplasticizer/fineaggregate,coarseaggregate/fineaggregate,1/(cement),ln cement,1/(water),ln water,1/(coarseaggregate),ln coarseaggregate,1/(fineaggregate),ln fineaggregate,1/(age),ln age,1/(cement/water),ln cement/water,1/(cement/coarseaggregate),ln cement/coarseaggregate,1/(cement/fineaggregate),ln cement/fineaggregate,1/(water/coarseaggregate),ln water/coarseaggregate,1/(water/fineaggregate),ln water/fineaggregate,1/(coarseaggregate/fineaggregate),ln coarseaggregate/fineaggregate
0,2.477915,-0.856888,-0.847144,-0.916764,-0.620448,0.863154,-1.217670,-0.279733,-0.743115,-0.750231,2.709300,-0.834753,1.970510,2.706095,-0.862468,-0.856648,-0.829235,-0.834790,-0.842413,-0.834926,-0.561907,-1.107982,0.018053,-0.635140,-0.561328,1.426392,-1.408121,1.899379,0.860915,-0.897697,-0.868360,0.868137,1.198771,-1.215636,-0.439830,0.140131,-1.428268,2.027577,-1.316909,1.647631,-1.443432,2.047678,1.159255,-1.146623,-0.208498,0.116268,-1.317060,1.386828
1,2.477915,-0.856888,-0.847144,-0.916764,-0.620448,1.056164,-1.217670,-0.279733,-0.743115,-0.750231,2.709300,-0.834753,1.906495,2.706095,-0.862468,-0.856648,-0.829235,-0.834790,-0.842413,-0.834926,-0.561907,-1.183783,0.018053,-0.640365,-0.561328,1.545960,-1.408121,1.899379,0.860915,-0.897697,-1.031030,1.045995,1.198771,-1.215636,-0.439830,0.140131,-1.428268,2.027577,-1.299272,1.611516,-1.443432,2.047678,1.269163,-1.239438,-0.208498,0.116268,-1.397428,1.486532
2,0.491425,0.795526,-0.847144,2.175461,-1.039143,-0.526517,-2.240917,3.553066,0.173351,-0.750231,-0.185162,-1.027141,0.561673,1.192994,0.462435,0.813492,1.158519,-0.834790,-0.842413,-0.834926,-0.956453,1.933542,3.050413,-1.002615,-1.052735,1.591077,-0.686020,0.625818,-1.762482,1.969231,0.457434,-0.493651,2.622925,-2.431511,-0.725590,2.043122,-0.199303,0.008670,-0.760025,0.701187,-1.047124,1.193509,-1.608898,1.779088,-2.109841,2.574578,-1.427162,1.523784
3,0.491425,0.795526,-0.847144,2.175461,-1.039143,-0.526517,-2.240917,5.057677,0.173351,-0.750231,-0.185162,-1.027141,0.561673,1.192994,0.462435,0.813492,1.158519,-0.834790,-0.842413,-0.834926,-0.956453,1.933542,3.050413,-1.002615,-1.052735,1.591077,-0.686020,0.625818,-1.762482,1.969231,0.457434,-0.493651,2.622925,-2.431511,-0.734196,2.296278,-0.199303,0.008670,-0.760025,0.701187,-1.047124,1.193509,-1.608898,1.779088,-2.109841,2.574578,-1.427162,1.523784
4,-0.790459,0.678408,-0.847144,0.488793,-1.039143,0.070527,0.647884,4.978487,0.682500,-0.750231,-0.839632,-1.027141,-0.771759,-0.827301,0.599342,0.621526,0.499704,-0.834790,-0.842413,-0.834926,-0.956453,0.277127,-0.130389,-1.002615,-1.052735,-0.477051,0.580976,-0.727630,-0.555123,0.527583,-0.148028,0.109793,-0.669560,0.662950,-0.733856,2.284695,0.696189,-0.830215,0.588233,-0.721011,0.709845,-0.834831,-0.412255,0.350327,-0.054895,-0.039754,0.367600,-0.429361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025,-0.045645,0.488235,0.564545,-0.092171,0.451410,-1.323005,-0.065893,-0.279733,0.154341,0.195837,-0.060673,0.310941,0.222668,-0.072980,0.506697,0.599625,0.421777,0.504382,0.714656,0.569369,0.310489,0.625352,-0.105646,0.561046,0.486507,-0.761137,-0.304604,0.140505,-0.027184,-0.032493,1.365688,-1.347223,-0.040598,-0.012235,-0.439830,0.140131,-0.313788,0.140115,-0.540996,0.408476,-0.310868,0.131628,-0.710407,0.677950,-0.081142,-0.013422,0.709164,-0.746165
1026,0.392819,-0.856888,0.960068,0.676200,0.702626,-1.994680,0.496893,-0.279733,-0.743115,0.288744,0.101265,0.314198,0.884038,0.157162,-0.862468,-0.856648,-0.829235,0.736139,1.278130,0.863142,0.400142,1.762511,0.045789,0.941200,0.646200,-1.445307,-0.625946,0.543175,-0.711176,0.700558,2.238464,-2.115636,-0.543887,0.524110,-0.439830,0.140131,-0.445904,0.301155,-0.927977,0.951173,-0.500008,0.362889,-1.507718,1.645320,-0.236201,0.144910,1.678754,-1.574079
1027,-1.270088,0.759579,0.850635,0.521589,-0.017528,-1.036064,0.080107,-0.279733,1.264266,1.367526,-1.246781,0.679862,-1.088949,-1.145026,0.671037,0.849659,0.651586,0.666287,0.983414,0.828627,-0.147131,0.951235,0.173807,0.042326,-0.013574,-0.698618,1.642420,-1.491110,-0.582900,0.558112,1.023962,-1.032914,-0.176754,0.129871,-0.439830,0.140131,1.750427,-1.549067,1.275824,-1.222127,1.522253,-1.396916,-0.963979,0.970241,-0.360231,0.275110,0.631295,-0.675198
1028,-1.168610,1.308065,-0.847144,-0.279579,0.853356,0.214641,0.191166,-0.279733,1.766271,-0.750231,-1.037749,1.924334,-1.137796,-1.073574,1.391376,1.204165,1.131670,-0.834790,-0.842413,-0.834926,0.688778,-0.366093,-0.340643,0.742968,0.850552,-0.104252,1.362087,-1.310034,0.159025,-0.221442,-0.285668,0.251163,-0.277621,0.236549,-0.439830,0.140131,1.132511,-1.153740,1.409515,-1.308978,1.304305,-1.258559,0.228378,-0.302161,0.179341,-0.269046,-0.037064,-0.034366


In [55]:
pca = PCA(n_components=0.99,random_state=50)
pca.fit(scaled_data)
pickle.dump(pca, open('pca.sav', 'wb'))
final_features = pd.DataFrame(pca.fit_transform(scaled_data))
final_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,6.052330,4.502948,-3.317818,0.679425,-0.770643,-0.133235,1.133875,-0.648750,1.990558,0.249051
1,6.036247,4.498851,-3.658823,0.752637,-0.890184,-0.157061,1.073925,-0.659522,1.943766,0.180385
2,7.668588,-5.043283,0.257956,1.172421,2.310505,-2.627380,0.058931,1.909059,-0.313211,0.459529
3,7.784342,-5.116810,0.252431,1.089744,2.390941,-3.546148,0.106546,3.022443,-0.392938,0.411067
4,-0.210418,-3.317696,1.101696,-1.374878,-2.412655,-4.489579,0.569386,3.051199,-0.557445,-0.375148
...,...,...,...,...,...,...,...,...,...,...
1025,-0.316041,0.414481,2.777331,0.557456,1.840568,0.080705,0.860960,-0.391101,-0.579458,0.025703
1026,0.255629,1.445847,4.393228,-2.532102,4.042204,0.332556,-0.147057,-0.494856,-0.300429,-0.216338
1027,-3.829081,-4.313513,2.214725,0.186320,1.550776,0.089199,0.343056,-0.245346,0.495208,0.099195
1028,-3.409751,-2.409986,0.921587,3.001032,-2.294602,-0.858226,-1.982926,-0.412041,0.218272,0.368514


In [56]:
def stepwise(modeltype,feats,labels,randomstate):
    model = modeltype
    model.random_state = randomstate
    pred_label = labels.copy()
    best_labels = labels.copy()
    kfold = KFold(n_splits=5,shuffle=True,random_state=randomstate)
    
    best_error = 100000
    current_cols = []
    count = 0
    
    while count < len(feats.columns):
        
        col_name = ''
        for cols in feats.columns:
            if cols not in current_cols:
                current_cols.append(cols)
                X = feats[current_cols].copy()
                
                if str(type(model)) != "<class 'sklearn.ensemble._forest.RandomForestRegressor'>":
                    for train,test in kfold.split(feats):
                        xtrain,xtest,ytrain,ytest = X.iloc[train],X.iloc[test],labels.iloc[train],labels.iloc[test]
                        model.fit(xtrain,ytrain)
                        pred_label.iloc[test] = model.predict(xtest)

                error = np.sqrt(mean_squared_error(labels,pred_label))
                if error < best_error:
                    best_error = error
                    best_labels = pred_label
                    col_name = cols
            
                current_cols.pop()
                
        if col_name != '':
            current_cols.append(col_name)
            count = count + 1
        else:
            count = len(feats.columns)
    
    return best_error,current_cols,best_labels
            

In [57]:
def stepgrid(x,y,state,counter):
    overall_errors = []
    overall_cols = []
    newdf = pd.DataFrame()
    count = counter
    
    error,cols,labels = stepwise(LinearRegression(),x,y,state)
    overall_errors.append(error)
    overall_cols.append(cols)
    newdf[count] = labels
    model = LinearRegression()
    model.fit(x[cols],y)
    pickle.dump(model, open('model' + str(count) + '.sav', 'wb'))
    count = count + 1
    
    linear_hyper = [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]
    knn_hyper = [11,21,31,41,51,61,71,81,91,101]
    
    iter_error = []
    iter_cols = []
    iter_labels = []
    
    for params in linear_hyper:
        error,cols,labels = stepwise(Lasso(alpha=params,max_iter=5000),x,y,state)
        iter_error.append(error)
        iter_cols.append(cols)
        iter_labels.append(labels)
    
    overall_errors.append(iter_error[np.argmin(iter_error)])
    overall_cols.append(iter_cols[np.argmin(iter_error)])
    newdf[count] = iter_labels[np.argmin(iter_error)]
    model = Lasso(alpha = linear_hyper[np.argmin(iter_error)])
    model.fit(x[overall_cols[-1]],y)
    pickle.dump(model, open('model' + str(count) + '.sav', 'wb'))
    count = count+1
    
    iter_error.clear()
    iter_cols.clear()
    iter_labels.clear()
    
    for params in linear_hyper:
        error,cols,labels = stepwise(Ridge(alpha=params,max_iter=5000),x,y,state)
        iter_error.append(error)
        iter_cols.append(cols)
        iter_labels.append(labels)
    
    overall_errors.append(iter_error[np.argmin(iter_error)])
    overall_cols.append(iter_cols[np.argmin(iter_error)])
    newdf[count] = iter_labels[np.argmin(iter_error)]
    model = Ridge(alpha = linear_hyper[np.argmin(iter_error)])
    model.fit(x[overall_cols[-1]],y)
    pickle.dump(model, open('model' + str(count) + '.sav', 'wb'))
    count = count+1
    
    iter_error.clear()
    iter_cols.clear()
    iter_labels.clear()
    
    for params in knn_hyper:
        error,cols,labels = stepwise(KNeighborsRegressor(n_neighbors = params),x,y,state)
        iter_error.append(error)
        iter_cols.append(cols)
        iter_labels.append(labels)
    
    overall_errors.append(iter_error[np.argmin(iter_error)])
    overall_cols.append(iter_cols[np.argmin(iter_error)])
    newdf[count] = iter_labels[np.argmin(iter_error)]
    model = KNeighborsRegressor(n_neighbors = knn_hyper[np.argmin(iter_error)])
    model.fit(x[overall_cols[-1]],y)
    pickle.dump(model, open('model' + str(count) + '.sav', 'wb'))
    count = count+1
    
    error,cols,labels = stepwise(GradientBoostingRegressor(n_estimators=300),x,y,state)
    overall_errors.append(error)
    overall_cols.append(cols)
    newdf[count] = labels
    model = GradientBoostingRegressor(n_estimators=300)
    model.fit(x[cols],y)
    pickle.dump(model, open('model' + str(count) + '.sav', 'wb'))
    count = count + 1
    
    model = RandomForestRegressor(n_estimators=300,random_state=50,oob_score=True)
    model.fit(x,y)
    pickle.dump(model, open('model' + str(count) + '.sav', 'wb'))
    newdf[count] = model.oob_prediction_
    overall_errors.append(np.sqrt(mean_squared_error(y,model.oob_prediction_)))
    overall_cols.append(list(x.columns))
    count = count + 1
    
    return overall_errors,overall_cols,newdf,count
    
    

In [58]:
%%capture
l1_error,l1_cols,l1_stack_data,l1_final_count = stepgrid(final_features,target,50,1)
l2_error,l2_cols,l2_stack_data,l2_final_count = stepgrid(l1_stack_data,target,50,l1_final_count)
l3_error,l3_cols,l3_stack_data,l3_final_count = stepgrid(l2_stack_data,target,50,l2_final_count)
l4_error,l4_cols,l4_stack_data,l4_final_count = stepgrid(l3_stack_data,target,50,l3_final_count)
l5_error,l5_cols,l5_stack_data,l5_final_count = stepgrid(l4_stack_data,target,50,l4_final_count)
l6_error,l6_cols,l6_stack_data,l6_final_count = stepgrid(l5_stack_data,target,50,l5_final_count)
l7_error,l7_cols,l7_stack_data,l7_final_count = stepgrid(l6_stack_data,target,50,l6_final_count)
l8_error,l8_cols,l8_stack_data,l8_final_count = stepgrid(l7_stack_data,target,50,l7_final_count)
l9_error,l9_cols,l9_stack_data,l9_final_count = stepgrid(l8_stack_data,target,50,l8_final_count)
l10_error,l10_cols,l10_stack_data,l10_final_count = stepgrid(l9_stack_data,target,50,l9_final_count)

In [59]:
iter_count = 1
for errors in [l1_error,l2_error,l3_error,l4_error,l5_error,l6_error,l7_error,l8_error,l9_error,l10_error]:
    print('\nl' + str(iter_count) + ' errors')
    print(errors)
    iter_count = iter_count + 1


l1 errors
[7.560866118939369, 7.560657719933817, 7.559533822943492, 7.542515834543434, 4.856888118021668, 5.119547011446883]

l2 errors
[4.664097339516428, 4.664097349570139, 4.664036675131958, 4.7598583922916005, 4.9136790862620705, 4.859859552370659]

l3 errors
[4.637271381138415, 4.637271361236514, 4.633038444506085, 4.786948473606975, 5.057560483101893, 4.9429421624682925]

l4 errors
[4.622173613355311, 4.632963292716599, 4.622976186351388, 4.815501086989686, 5.148659729577785, 4.983328112534901]

l5 errors
[4.6448905331940535, 4.644873862643049, 4.644831131304173, 4.692092464873114, 4.9236022251568725, 4.698541423913321]

l6 errors
[4.577836298670544, 4.575965099812452, 4.574694962445255, 4.647138178019565, 4.778963819809738, 4.737592949761634]

l7 errors
[4.559943640553044, 4.558788895627184, 4.554173800856051, 4.645458511201755, 4.92422960466969, 4.849359913541625]

l8 errors
[4.550823241874569, 4.558645362199886, 4.550823251696895, 4.704500746383964, 4.982182047269817, 4.84200

In [60]:
feature_stats = pd.read_csv('Concrete_Data_Yeh.csv')
r1 = feature_stats.min()
r2 = feature_stats.max()
feature_stats.iloc[0] = r1
feature_stats.iloc[1] = r2
feature_stats = feature_stats.iloc[:2,:-1]

In [61]:
scaler = pickle.load(open('scaler.sav', 'rb'))
pca = pickle.load(open('pca.sav', 'rb'))
feature_engineering = [feature_stats,list(combinations(feature_stats.columns[:-1],2)),scaler,pca]

In [62]:
cement = 1
slag = 1
flyash = 1
water = 1
superplasticizer = 1
coarseaggregate = 1
fineaggregate = 1
age = 1

newdf = pd.concat([feature_stats,pd.DataFrame([cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age],index=feature_stats.columns).T],axis=0)

for a,b in feature_engineering[1]:
    if newdf[a].min() != 0.0 and newdf[b].min() != 0:
        newdf[str(a) + '/' + str(b)] = newdf[a]/newdf[b]
    elif newdf[a].min() == 0 and newdf[b].min() != 0:
        newdf[str(a) + '/' + str(b)] = newdf[a]/newdf[b]
    elif df[a].min() != 0 and df[b].min() == 0:
        newdf[str(b) + '/' + str(a)] = newdf[b]/newdf[a]
        
for cols in newdf.columns:
    if newdf[cols].min() != 0:
        newdf['1/('+str(cols) + ')'] = 1/newdf[cols]
        newdf['ln ' + str(cols)] = np.log(newdf[cols])

newdf = newdf.iloc[-1]
newdf = pd.DataFrame(feature_engineering[3].transform(feature_engineering[2].transform(pd.DataFrame(newdf).T)))

In [63]:
newdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,882.67093,-377.499459,3220.858268,1789.183529,2942.040168,435.225785,1803.601962,589.989741,2619.137144,3428.552845


In [64]:
pickle.dump(feature_engineering,open('feature_engineering.sav','wb'))

In [65]:
modellist = []
for num in range(1,44):
    modellist.append(pickle.load(open('model' + str(num) + '.sav', 'rb')))

In [66]:
l1_models = modellist[:6]
l2_models = modellist[6:12]
l3_models = modellist[12:18]
l4_models = modellist[18:24]
l5_models = modellist[24:30]
l6_models = modellist[30:36]
l7_models = modellist[36:42]
l8_models = modellist[-1]

l8_cols = l8_cols[0]

pickle.dump(l1_models,open('l1_models.sav','wb'))
pickle.dump(l2_models,open('l2_models.sav','wb'))
pickle.dump(l3_models,open('l3_models.sav','wb'))
pickle.dump(l4_models,open('l4_models.sav','wb'))
pickle.dump(l5_models,open('l5_models.sav','wb'))
pickle.dump(l6_models,open('l6_models.sav','wb'))
pickle.dump(l7_models,open('l7_models.sav','wb'))
pickle.dump(l8_models,open('l8_models.sav','wb'))

pickle.dump(l1_cols,open('l1_cols.sav','wb'))
pickle.dump(l2_cols,open('l2_cols.sav','wb'))
pickle.dump(l3_cols,open('l3_cols.sav','wb'))
pickle.dump(l4_cols,open('l4_cols.sav','wb'))
pickle.dump(l5_cols,open('l5_cols.sav','wb'))
pickle.dump(l6_cols,open('l6_cols.sav','wb'))
pickle.dump(l7_cols,open('l7_cols.sav','wb'))
pickle.dump(l8_cols,open('l8_cols.sav','wb'))

In [67]:
layers = [l1_models,l2_models,l3_models,l4_models,l5_models,l6_models,l7_models,l8_models]
layer_cols = [l1_cols,l2_cols,l3_cols,l4_cols,l5_cols,l6_cols,l7_cols,l8_cols]
count = 1
for num in range(8):
    preds = []
    if num != 7:
        for num1 in range(6):
            preds.append(layers[num][num1].predict(newdf[layer_cols[num][num1]]))
            
        newdf = pd.DataFrame(preds).T
        newdf.columns = np.arange(count,count+6)
        count = count+6
    
    else:
        preds = layers[7].predict(newdf[layer_cols[7]])
        preds = preds[0]
    

In [69]:
preds = preds[0]
preds

26.73085688995063

In [70]:
import os
for num in range(1,61):   
    if os.path.exists("model" + str(num) + '.sav'):
      os.remove("model" + str(num) + '.sav')
    else:
      print("The file does not exist")

os.remove("pca.sav")
os.remove("scaler.sav")

--- Addition Logic for streamlit plotly vis

In [190]:
import plotly.graph_objects as go
import plotly.express as px

In [191]:
df = pd.read_csv('Concrete_Data_Yeh.csv')

In [192]:
newfeats = df.iloc[:,:-1].copy()
newlabels = df.iloc[:,-1].copy()

In [193]:
newfeats

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360
...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28


In [194]:
newscaler = StandardScaler()
newscaler.fit(newfeats)
pickle.dump(newscaler,open('viz_scaler.sav','wb'))

In [195]:
newfeats = newscaler.transform(newfeats)

In [196]:
newfeats

array([[ 2.47791487, -0.85688789, -0.84714393, ...,  0.86315424,
        -1.21767004, -0.27973311],
       [ 2.47791487, -0.85688789, -0.84714393, ...,  1.05616419,
        -1.21767004, -0.27973311],
       [ 0.49142531,  0.79552649, -0.84714393, ..., -0.52651741,
        -2.24091709,  3.55306569],
       ...,
       [-1.27008832,  0.75957923,  0.85063487, ..., -1.03606368,
         0.0801067 , -0.27973311],
       [-1.16860982,  1.30806485, -0.84714393, ...,  0.21464081,
         0.19116644, -0.27973311],
       [-0.19403325,  0.30849909,  0.3769452 , ..., -1.39506219,
        -0.15074782, -0.27973311]])

In [197]:
newpca = PCA(n_components=2)
newpca.fit(newfeats)
pickle.dump(newpca,open('viz_pca.sav','wb'))
newfeats = pd.DataFrame(newpca.transform(newfeats))

In [198]:
newfeats

Unnamed: 0,0,1
0,0.679326,-1.457856
1,0.686646,-1.579441
2,4.155803,0.360341
3,4.594366,0.170789
4,2.363330,-0.275733
...,...,...
1025,-0.524788,1.252192
1026,-0.834653,0.747159
1027,-0.184897,1.253139
1028,-0.283700,1.274286


In [199]:
newfeats = pd.concat([newfeats,df],axis=1)

In [200]:
newfeats

Unnamed: 0,0,1,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,0.679326,-1.457856,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,0.686646,-1.579441,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,4.155803,0.360341,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,4.594366,0.170789,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,2.363330,-0.275733,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...,...,...
1025,-0.524788,1.252192,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,-0.834653,0.747159,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,-0.184897,1.253139,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,-0.283700,1.274286,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [201]:
newfeats['Hover Name'] = newfeats['csMPa'].apply(lambda x:'Compressive Strength: ' + str(x) + ' MPa')

In [202]:
newfeats.columns = ['Component 1','Component 2','cement','slag','flyash','water',
                    'superplasticizer','coarseaggregate','fineaggregate','age'
                    ,'csMPa','Hover Name']

In [203]:
newfeats['label'] = newfeats.apply(lambda x:
                                  '<b>' + x['Hover Name'] + '</b><br><br><i>Mixture Recipe</i><br>'+
                                   'Cement: ' + str(x['cement']) +
                                   ' Kg<br>Slag: ' + str(x['slag']) +
                                   ' Kg<br>Fly Ash: ' + str(x['slag']) +
                                   ' Kg<br>Water: ' + str(x['slag']) +
                                   ' Kg<br>Super Plasticizer: ' + str(x['slag']) +
                                   ' Kg<br>Coarse Aggregate: ' + str(x['slag']) +
                                   ' Kg<br>Fine Aggregate: ' + str(x['slag']) +
                                   '<br>Age: ' + str(x['slag']) + ' Days',axis=1)

In [204]:
newfeats

Unnamed: 0,Component 1,Component 2,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa,Hover Name,label
0,0.679326,-1.457856,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99,Compressive Strength: 79.99 MPa,<b>Compressive Strength: 79.99 MPa</b><br><br>...
1,0.686646,-1.579441,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89,Compressive Strength: 61.89 MPa,<b>Compressive Strength: 61.89 MPa</b><br><br>...
2,4.155803,0.360341,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27,Compressive Strength: 40.27 MPa,<b>Compressive Strength: 40.27 MPa</b><br><br>...
3,4.594366,0.170789,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05,Compressive Strength: 41.05 MPa,<b>Compressive Strength: 41.05 MPa</b><br><br>...
4,2.363330,-0.275733,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30,Compressive Strength: 44.3 MPa,<b>Compressive Strength: 44.3 MPa</b><br><br><...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025,-0.524788,1.252192,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28,Compressive Strength: 44.28 MPa,<b>Compressive Strength: 44.28 MPa</b><br><br>...
1026,-0.834653,0.747159,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18,Compressive Strength: 31.18 MPa,<b>Compressive Strength: 31.18 MPa</b><br><br>...
1027,-0.184897,1.253139,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70,Compressive Strength: 23.7 MPa,<b>Compressive Strength: 23.7 MPa</b><br><br><...
1028,-0.283700,1.274286,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77,Compressive Strength: 32.77 MPa,<b>Compressive Strength: 32.77 MPa</b><br><br>...


In [209]:
fig = px.scatter(data_frame=newfeats,x='Component 1',y='Component 2',color = 'csMPa',hover_data=newfeats.iloc[:,2:-2],
                 hover_name='Hover Name',color_continuous_scale='RdYlGn')

fig.update_layout(coloraxis_colorbar=dict(
    title="MPa",
),
    title='Mixture Development: All Results (Hover Over Data Points to View MPa and Mixture)',
    xaxis=dict(
        title='Component 1',
        gridcolor='white',
        gridwidth=2,
    ),
    yaxis=dict(
        title='Component 2',
        gridcolor='white',
        gridwidth=2))

fig.update_traces(textfont_size = 2,hovertemplate = newfeats['label'])