# Feature importance

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
from sklearn.linear_model import LinearRegression
from sklearn.multioutput  import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from xgboost.core import XGBoostError
from xgboost              import XGBRegressor
from sklearn.tree         import ExtraTreeRegressor
from sklearn.tree         import DecisionTreeRegressor
from sklearn.ensemble     import ExtraTreesRegressor
from sklearn.ensemble     import GradientBoostingRegressor
from sklearn.ensemble     import AdaBoostRegressor
from sklearn.ensemble     import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score, mean_absolute_error

In [2]:
df_APEC_total = pd.read_excel('./data/df_APEC_total.xlsx',index_col=0).fillna(0)
df = df_APEC_total
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
df.columns = [regex.sub("", col) if any(x in str(col) for x in set(('[', ']', '<'))) 
              else col for col in df.columns.values]
X = df.drop(['Part weight','Length','Angle', 'Height', 'Width'], axis=1).fillna(0)
y = df[['Part weight']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
data = [
    go.Heatmap(
        z= df.corr().values,
        x= df.columns.values,
        y= df.columns.values,
        colorscale='Viridis',
        opacity = 1.0
    )
]

layout = go.Layout(
    title='Pearson Correlation',
    xaxis = dict(ticks='', nticks=30),
    yaxis = dict(ticks='' ),
    width = 1000, height = 1000, 
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Housedatacorr')

In [4]:
def get_feature_importances(Regressor, X_train, y_train=None,
                             top_n=10, figsize=(8, 8), enable_plot=True, print_table=False, title="Feature Importances"):


    __name__ = "get_feature_importances"


    try:
        if not hasattr(Regressor, 'feature_importances_'):
            Regressor.fit(X_train.values, y_train.values.ravel())

            if not hasattr(Regressor, 'feature_importances_'):
                raise AttributeError("{} does not have feature_importances_ attribute".
                                     format(Regressor.__class__.__name__))

    except (XGBoostError, ValueError):
        Regressor.fit(X_train.values, y_train.values.ravel())

    feat_imp = pd.DataFrame({title : Regressor.feature_importances_})
    # feat_imp = pd.DataFrame(Regressor.feature_importances_, columns=[title])
    feat_imp['feature'] = X_train.columns
    feat_imp.sort_values(by= title, ascending=False, inplace=True)
    feat_imp = feat_imp.iloc[:top_n]

    feat_imp.sort_values(by= title, inplace=True, ascending=False)
    pltdf = feat_imp.set_index('feature', drop=True)

    if enable_plot:
        pltdf.plot.barh(title=title, figsize=figsize)
        plt.xlabel('Feature Importance Score')
        plt.show()

    if print_table:
        from IPython.display import display
        print("Top {} features in descending order of importance".format(top_n))
        display(pltdf.sort_values(by= title, ascending=False))

    return feat_imp, title

In [5]:
Regressors = [XGBRegressor(),
        ExtraTreeRegressor(),
        DecisionTreeRegressor(),
        ExtraTreesRegressor(),
        GradientBoostingRegressor(), 
        AdaBoostRegressor(),
        RandomForestRegressor()]

result = pd.DataFrame()
for Regressor in Regressors:
    try:
        fi, Regressor_name = get_feature_importances(Regressor, X_train, y_train,enable_plot=False, top_n=X_train.shape[1], title=Regressor.__class__.__name__)
        if result.empty:
            result=fi
        else:
            result = pd.merge(result, fi, on='feature')
    except AttributeError as e:
        print(e)

result = result.set_index('feature', drop=True)
result['Row_sum'] = result.apply(lambda x: x.sum(), axis=1)
result.sort_values("Row_sum", inplace=True, ascending=False)
result.loc['Col_sum'] = result.apply(lambda x: x.sum())
result

Unnamed: 0_level_0,XGBRegressor,ExtraTreeRegressor,DecisionTreeRegressor,ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor,RandomForestRegressor,Row_sum
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Solid density,0.135526,0.559801,0.5597696,0.059553,0.327114,0.3173,0.136585,2.095649
Holding pressure,0.03639,0.164868,0.1553031,0.173304,0.174282,0.048909,0.16548,0.918537
Thermal Conductivity (Lambda) - Constant,0.578097,0.029423,2.570777e-05,0.01639,0.005191,0.036655,0.012707,0.678487
No Flow Temp,0.094641,0.001052,3.522141e-05,0.10066,0.041214,0.12222,0.16919,0.529012
Ejection Temp,0.000267,0.002671,4.604589e-08,0.12441,0.081097,0.060823,0.13169,0.40096
P2,1.4e-05,0.0,4.022003e-05,0.038605,0.114228,0.056133,0.148987,0.358007
Melt density,0.007357,0.000543,0.0,0.116014,0.001267,0.111581,0.002247,0.239009
Ps2,0.0,0.0,0.1635676,0.006059,0.0,0.013441,0.005956,0.189023
Ps4,0.0,0.163273,0.0,0.006272,0.0,0.0,0.009496,0.179041
Thermal diffusivity K=lambda/(rho*Cp),0.054467,0.001105,0.006219798,0.018675,0.023212,0.010999,0.011024,0.125701


In [6]:
result.index

Index(['Solid density', 'Holding pressure',
       'Thermal Conductivity (Lambda) - Constant', 'No Flow Temp',
       'Ejection Temp', 'P2', 'Melt density', 'Ps2', 'Ps4',
       'Thermal diffusivity K=lambda/(rho*Cp) ', 'Melt Volume Rate  (MVR)',
       'Melt temperature', 'P1', 'Injection flow rate', 'Suggested Mold Temp',
       'Maximum Mold Temp', 'Specific Heat (Cp) - Constant',
       'N0 - Nu Polynom //', 'P3', 'Ts', 'Minimum Mold Temp', 'Pt1',
       'Cavity wall temperature', 'Cooling time',
       'Melt Volume Rate  (MFR/MFI)', 'T0', 'E0 - E Polynom //',
       'E Constant //', 'Pm1', 'Pm2', 'Ps1', 'Pm3', 'E2 - E Polynom //', 'Pm4',
       'CTE constant //', 'E1 - E Polynom //', 'Pt2', 'Minimum Melt Temp',
       'E3 - E Polynom //', 'Ps3', 'Suggested Melt Temp',
       'Holding pressure time', 'Load  (MFR/MFI)', 'Load  (MVR)',
       'Temperature  (MVR)', 'Temperature  (MFR/MFI)', 'Ps5', 'Ps6', 'Ps7',
       'Maximum Melt Temp', 'Col_sum'],
      dtype='object', name='featur

In [7]:
corr = df.drop(['Length','Angle', 'Height', 'Width'], axis=1).fillna(0).corr()
corr.style.background_gradient()
corr.reindex(corr['Part weight'].abs().sort_values(ascending=False).index)

Unnamed: 0,Part weight,Melt temperature,Cavity wall temperature,Holding pressure time,Cooling time,Injection flow rate,Holding pressure,Maximum Melt Temp,Suggested Melt Temp,Minimum Melt Temp,...,Ps7,Pt1,Pt2,E Constant //,E0 - E Polynom //,E1 - E Polynom //,E2 - E Polynom //,E3 - E Polynom //,N0 - Nu Polynom //,CTE constant //
Part weight,1.0,-0.2565238,-0.6286566,0.00379569,-0.01227194,-0.02877347,0.3649034,,-0.6560391,-0.6560391,...,,-0.1024144,-0.1024144,0.1291476,-0.09210815,0.04786744,-0.001158083,-0.05127739,0.04635539,0.04414475
Solid density,0.77645,-0.3451324,-0.7897497,4.204431e-18,2.290552e-17,-3.098758e-08,5.2038870000000004e-17,,-0.8788836,-0.8788836,...,,-0.05469616,-0.05469616,0.2319601,-0.03723285,-0.03699607,0.1142844,-0.1995187,0.02294731,0.01651801
Minimum Mold Temp,-0.760794,0.3498397,0.8115574,-2.5913310000000003e-17,2.5509900000000003e-17,8.717195e-08,1.285395e-16,,0.8908708,0.8908708,...,,0.1202813,0.1202813,-0.345914,0.1015916,-0.0234989,-0.05642664,0.1452566,-0.105417,-0.08074591
Suggested Mold Temp,-0.757719,0.3454139,0.8166264,-2.620997e-17,-3.348959e-17,6.872604e-08,-1.265301e-16,,0.8796006,0.8796006,...,,0.1373606,0.1373606,-0.3216061,0.1188329,-0.04109248,-0.03888502,0.1280205,-0.1230292,-0.09966806
Ejection Temp,-0.757309,0.3218766,0.8033836,-1.379572e-17,-3.549307e-17,2.759978e-08,1.2614060000000001e-17,,0.8196624,0.8196624,...,,0.2147815,0.2147815,-0.3000018,0.1978294,-0.1254513,0.04936716,0.03661977,-0.1955252,-0.1773827
Maximum Mold Temp,-0.74694,0.3377845,0.8125396,7.929840000000001e-17,-2.2882680000000003e-17,5.143125e-08,1.292951e-16,,0.8601721,0.8601721,...,,0.1512475,0.1512475,-0.2963709,0.1330614,-0.05645989,-0.02271815,0.1111851,-0.1375464,-0.115609
Thermal diffusivity K=lambda/(rho*Cp),-0.722999,0.1308616,0.4064617,-1.4902400000000002e-17,1.743926e-17,5.149912e-08,-2.5750430000000002e-17,,0.3332407,0.3332407,...,,0.4363229,0.4363229,-0.1159245,0.4348829,-0.4260202,0.4127362,-0.3923929,-0.3711315,-0.3663617
Suggested Melt Temp,-0.656039,0.3926941,0.718305,4.1911350000000003e-17,6.774736000000001e-17,5.353094e-08,-1.72705e-16,,1.0,1.0,...,,0.05143445,0.05143445,-0.253792,0.0349566,0.03782545,-0.1167025,0.2038867,-0.04111862,-0.02335982
Minimum Melt Temp,-0.656039,0.3926941,0.718305,4.1911350000000003e-17,6.774736000000001e-17,5.353094e-08,-1.72705e-16,,1.0,1.0,...,,0.05143445,0.05143445,-0.253792,0.0349566,0.03782545,-0.1167025,0.2038867,-0.04111862,-0.02335982
P2,-0.653816,0.1545122,0.5234182,-4.843983e-20,-8.691659e-18,-5.476622e-08,-8.781856e-17,,0.3934672,0.3934672,...,,0.6279971,0.6279971,-0.1852382,0.620006,-0.5822048,0.5371753,-0.4795656,-0.5856438,-0.5820967


In [8]:
Regressors = [XGBRegressor(),
        ExtraTreeRegressor(),
        DecisionTreeRegressor(),
        ExtraTreesRegressor(),
        GradientBoostingRegressor(), 
        AdaBoostRegressor(),
        RandomForestRegressor()]


In [9]:
dr = DecisionTreeRegressor()
dr.fit(X_train,y_train.values.ravel())
drimp = dr.feature_importances_

In [10]:
rfr = RandomForestRegressor(n_estimators=100)
rfr.fit(X_train,y_train.values.ravel())
rfrimp = rfr.feature_importances_

In [11]:
gbr =  GradientBoostingRegressor(n_estimators=100)
gbr.fit(X_train,y_train.values.ravel())
gbrimp = gbr.feature_importances_

In [12]:
abr =  AdaBoostRegressor(n_estimators=100)
abr.fit(X_train,y_train.values.ravel())
abrimp = abr.feature_importances_

In [13]:
etr =  ExtraTreesRegressor(n_estimators=100)
etr.fit(X_train,y_train.values.ravel())
etrimp = etr.feature_importances_

In [14]:
xgb = XGBRegressor()
xgb.fit(X_train,y_train)
xrimp = xgb.feature_importances_

In [15]:
d = {'Decision Tree':drimp, 'Random Forest':rfrimp, 'Gradient Boost':gbrimp,'Ada boost':abrimp, 'Extra Tree':etrimp, 'XGBoost':xrimp}
features = pd.DataFrame(data = d)
features['mean'] = features.mean(axis= 1) 
features['names'] = X.columns.values
features.head()

Unnamed: 0,Decision Tree,Random Forest,Gradient Boost,Ada boost,Extra Tree,XGBoost,mean,names
0,0.003711,0.007189,0.007932,0.086176,0.005721,0.000836,0.018594,Melt temperature
1,0.008923,0.006516,0.00625,0.005879,0.008805,0.001204,0.006263,Cavity wall temperature
2,3.5e-05,0.000429,3e-06,0.000142,0.00111,0.000101,0.000303,Holding pressure time
3,0.006108,0.009541,0.008509,0.001807,0.007029,0.0018,0.005799,Cooling time
4,0.00107,0.012897,0.012854,0.000561,0.00648,0.00263,0.006082,Injection flow rate


In [16]:
y = features['mean'].values
x = features['names'].values
data = [go.Bar(
            x= x,
            y= y,
            width = 0.5,
            marker=dict(
               color = features['mean'].values,
            colorscale='Portland',
            showscale=True,
            reversescale = False
            ),
            opacity=0.6
        )]

layout= go.Layout(
    autosize= True,
    title= 'Mean Feature Importance',
    hovermode= 'closest',
    yaxis=dict(
        title= 'Feature Importance for Inject Moulding',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='barplothouse')

# Model Comparision

In [17]:
y_train = df[['Part weight']].values
X_train = df.drop(['Part weight','Length','Angle', 'Height', 'Width'], axis=1)
#Init base model as regressor
regressors = DecisionTreeRegressor(random_state=12)
#Init pipe
pipe = Pipeline([('model', regressors)])

def regression_metrics(name,x,y, y_hat):
    mae = mean_absolute_error(y, y_hat)
    mse = np.mean(np.square((y - y_hat)))
    rmse = np.sqrt(mse)
    r2 = r2_score(y, y_hat)
    adj_r2 = 1 - (1-r2)*(len(y_hat)-1)/(len(y_hat)-x.shape[1]-1)
    print('{} Evaluation Metrics:'.format(name))
    print('Adjusted R2: {}\nMAE: {} \nMSE: {}\nRMSE: {}'.format(adj_r2,mae,mse,rmse))
    print('==================================================')
    return adj_r2, mae, mse, rmse

In [18]:
#Create a list of model names to be used in a print funtion later when aggregating model evaluation metrics
model_names = [
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting',
    'Ada Boost',
    'Extra Trees',
    'XGBoost',
]

#Init models into a list to loop over in pipeline to evaluate all base model performances
regressors = [
    DecisionTreeRegressor(random_state = 42),
    RandomForestRegressor(random_state = 42, n_jobs=-1),
    GradientBoostingRegressor(random_state = 42,),
    AdaBoostRegressor(random_state = 42,),
    ExtraTreesRegressor(random_state = 42, n_jobs=-1),
    XGBRegressor(random_state = 42, n_jobs=-1)
]
#Implement pipe that tests evaluation metrics for all regressors
df_metrics = pd.DataFrame(columns=['R2', 'MAE', 'MSE','RMSE'])
for name,regressor in zip(model_names, regressors):
    pipe.steps.pop(0) #Delete previous model in pipeline
    pipe.steps.append(['model',regressor]) #Add new model

    predictions = cross_val_predict(pipe, X_train, y_train, n_jobs=-1)
 
    #Evaluate
    adj_r2, mae, mse, rmse = regression_metrics(name, X_train, y_train, predictions)
    df_results = pd.DataFrame([adj_r2, mae, mse, rmse]).T
    df_results.index =pd.Series([name])
    df_results.columns =pd.Series(['R2', 'MAE', 'MSE','RMSE'])
    df_metrics = pd.concat([df_metrics, df_results])
    
   # df_metrics = pd.merge(df_metrics, pd.DataFrame([adj_r2, mae, mse, rmse]).T)

Decision Tree Evaluation Metrics:
Adjusted R2: 0.7338818984157496
MAE: 0.4250149850149817 
MSE: 2.8958147190970656
RMSE: 1.7017093521212914
Random Forest Evaluation Metrics:
Adjusted R2: 0.6243319441291829
MAE: 0.4610852557572283 
MSE: 2.7872335867851064
RMSE: 1.6695009993363605
Gradient Boosting Evaluation Metrics:
Adjusted R2: 0.8286569268226628
MAE: 0.32974951589325624 
MSE: 2.5635777488444975
RMSE: 1.6011176561528817
Ada Boost Evaluation Metrics:
Adjusted R2: 0.6766944180424947
MAE: 0.5397949997085602 
MSE: 2.484488922228891
RMSE: 1.5762261646822422
Extra Trees Evaluation Metrics:
Adjusted R2: 0.5090404427430081
MAE: 0.4808434765234778 
MSE: 2.8925180272515463
RMSE: 1.7007404350022217
XGBoost Evaluation Metrics:
Adjusted R2: 0.8479866947603514
MAE: 0.30919000238043043 
MSE: 2.5382063544264137
RMSE: 1.593174929010124


In [19]:
df_metrics

Unnamed: 0,R2,MAE,MSE,RMSE
Decision Tree,0.733882,0.425015,2.895815,1.701709
Random Forest,0.624332,0.461085,2.787234,1.669501
Gradient Boosting,0.828657,0.32975,2.563578,1.601118
Ada Boost,0.676694,0.539795,2.484489,1.576226
Extra Trees,0.50904,0.480843,2.892518,1.70074
XGBoost,0.847987,0.30919,2.538206,1.593175


In [20]:
y = df_metrics['R2'].values
x = df_metrics.index
data = [go.Bar(
            x= x,
            y= y,
            width = 0.5,
            marker=dict(
               color = df_metrics['R2'].values,
            colorscale='Portland',
            showscale=True,
            reversescale = False
            ),
            opacity=0.6
        )]

layout= go.Layout(
    autosize= True,
    title= 'R2 Comparision',
    hovermode= 'closest',
    yaxis=dict(
        title= 'R2 Value',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='barplothouse')

In [3]:
df_APEC_total = pd.read_excel('./data/df_APEC_total.xlsx',index_col=0).fillna(0)
data = df_APEC_total
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
data.columns = [regex.sub("", col) if any(x in str(col) for x in set(('[', ']', '<'))) 
              else col for col in data.columns.values]
train_dataset = data.sample(frac=0.8,random_state=42)
test_dataset = data.drop(train_dataset.index)

train_stats = train_dataset.describe()
train_stats = train_stats.drop(['Part weight','Length','Angle', 'Height', 'Width'], axis=1)
train_stats = train_stats.transpose()
train_labels = train_dataset[['Part weight']]
test_labels = test_dataset[['Part weight']]

def norm(x):
    return (x - train_stats['mean']) / train_stats['std'] #zero-mean normalization, same with sklearn.preprocessing.StandardScaler
train_dataset = train_dataset.drop(['Part weight','Length','Angle','Height', 'Width'], axis=1)
test_dataset = test_dataset.drop(['Part weight','Length','Angle','Height', 'Width'], axis=1)
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

X_train = normed_train_data.fillna(0)
X_test = normed_test_data.fillna(0)
y_train = train_labels.values
y_test = test_labels.values

In [4]:
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping
from keras.models import Sequential, Input, Model
from keras.layers import Dense
from sklearn.model_selection import cross_validate, KFold
# create model
def MLP1(feat_num, loss):
    net = Sequential()
    net.add(Dense(50, input_dim=feat_num, kernel_initializer='normal', activation='relu'))
    net.add(Dense(25, kernel_initializer='normal', activation='relu'))
    net.add(Dense(10, kernel_initializer='normal', activation='relu'))
    net.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    net.compile(loss=loss, optimizer='adam')
    return net
# fix random seed for reproducibility
seed = 42
np.random.seed(seed)
mol_num, feat_num = X_train.shape
print("Number of materials for training = %i, Number of features = %i\n" % (mol_num, feat_num))

Using TensorFlow backend.


Number of materials for training = 801, Number of features = 50



In [23]:
estimator = KerasRegressor(build_fn=MLP1, 
                           feat_num=feat_num, 
                           loss='mean_absolute_error', 
                           epochs=300, 
                           batch_size=int(X_train.shape[0]/8), 
                           verbose=0)
df_base_B= pd.DataFrame(columns=['fit_time', 'score_time', 'test_r2','test_neg_mean_absolute_error'])
df_base_B_result = pd.DataFrame(columns=['R2_mean', 'R2_std',
                                                   'neg_MAE_mean','neg_MAE_std'])

kfold = KFold(n_splits=5, random_state=seed, shuffle=True)

scores = cross_validate(estimator, X_train, y_train, scoring=('r2', 'neg_mean_absolute_error'), cv=kfold, return_train_score=False)

df_scores = pd.DataFrame(scores)
df_scores.index =pd.Series(['Fold1', 'Fold2', 'Fold3', 'Fold4','Fold5'])

df_results = pd.DataFrame([scores['test_r2'].mean(), scores['test_r2'].std(), 
                        scores['test_neg_mean_absolute_error'].mean(), 
                        scores['test_neg_mean_absolute_error'].std()]).T
df_results.index =pd.Series(['base_B'])
df_results.columns =pd.Series(['R2_mean', 'R2_std','neg_MAE_mean','neg_MAE_std'])

df_base_B = pd.concat([df_base_B, df_scores])
df_base_B_result = pd.concat([df_base_B_result, df_results], axis=0)
print(df_scores)
print(df_results)


       fit_time  score_time   test_r2  test_neg_mean_absolute_error
Fold1  7.862215    0.098763  0.929585                     -0.163411
Fold2  6.613748    0.095066  0.931235                     -0.270945
Fold3  6.893503    0.095811  0.917756                     -0.190485
Fold4  6.950672    0.094567  0.934446                     -0.174153
Fold5  7.386946    0.117389  0.878561                     -0.318137
         R2_mean    R2_std  neg_MAE_mean  neg_MAE_std
base_B  0.918317  0.020663     -0.223426     0.060569


In [25]:
df_results.index = pd.Series(['Neural Network'])
df_results_with_NN = pd.concat([df_results['R2_mean'], df_metrics['R2']])
y = df_results_with_NN.values
x = df_results_with_NN.index
data = [go.Bar(
            x= x,
            y= y,
            width = 0.5,
            marker=dict(
               color = df_results_with_NN.values,
            colorscale='Portland',
            showscale=True,
            reversescale = False
            ),
            opacity=0.6
        )]

layout= go.Layout(
    autosize= True,
    title= 'R2 Comparision',
    hovermode= 'closest',
    yaxis=dict(
        title= 'R2 Value',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='barplothouse')

In [26]:
df_results_with_NN

Neural Network       0.918317
Decision Tree        0.733882
Random Forest        0.624332
Gradient Boosting    0.828657
Ada Boost            0.676694
Extra Trees          0.509040
XGBoost              0.847987
dtype: float64

In [41]:
df_scores

Unnamed: 0,fit_time,score_time,test_r2,test_neg_mean_absolute_error
Fold1,7.626569,0.091869,0.93703,-0.196508
Fold2,8.0658,0.094166,0.9379,-0.22978
Fold3,8.578331,0.133029,0.922128,-0.233732
Fold4,7.448132,0.097288,0.930376,-0.175547
Fold5,7.476583,0.130948,0.936721,-0.175952


# Base Case DNN Model

In [8]:
net = MLP1(feat_num=feat_num, 
           loss='mean_absolute_error')
history = net.fit(X_train, 
        y_train,
        epochs=300, 
        batch_size=int(X_train.shape[0]/8), 
        verbose=0)
y_pred = net.predict(X_test)

In [19]:
def MAPE(y_hat, y):
    '''
    Mean Absolute Percentage Error Metric
    '''
    abserror = np.abs(y_hat - y)
    return np.mean(abserror / y)*100
def regression_metrics(name,x , y, y_hat):
    mae = mean_absolute_error(y, y_hat)
    mse = np.mean(np.square((y - y_hat)))
    rmse = np.sqrt(mse)
    r2 = r2_score(y, y_hat)
    adj_r2 = 1 - (1-r2)*(len(y_hat)-1)/(len(y_hat)-x.shape[1]-1)
    print('{} Evaluation Metrics:'.format(name))
    print('Adjusted R2: {}\nMAE: {} \nMSE: {}\nRMSE: {}'.format(adj_r2,mae,mse,rmse))
    print('==================================================')
    return adj_r2, mae, mse, rmse
adj_r2, mae, mse, rmse = regression_metrics('DNN', X_test, y_test, y_pred)
df_results_DNN = pd.DataFrame([adj_r2, mae, mse, rmse]).T
df_results_DNN.index =pd.Series(['DNN'])
df_results_DNN.columns =pd.Series(['R2', 'MAE', 'MSE','RMSE'])

DNN Evaluation Metrics:
Adjusted R2: 0.8707159652789993
MAE: 0.33530763168335864 
MSE: 0.1481964910692295
RMSE: 0.3849629736341269


# Optimized DNN Model

In [18]:
from tensorflow.python.keras.constraints import maxnorm
def optimized_model():
    net = Sequential()
    net.add(Dense(100, input_dim=feat_num, kernel_initializer='glorot_uniform', activation='softplus', kernel_constraint=maxnorm(4)))
    net.add(Dense(25, kernel_initializer='glorot_uniform', activation='softplus'))
    net.add(Dense(10, kernel_initializer='glorot_uniform', activation='softplus'))
    net.add(Dense(1, kernel_initializer='glorot_uniform'))
    # compile this model
    net.compile(loss='mean_absolute_error', 
                  optimizer='adam',
                 )
    return net
model = KerasRegressor(build_fn=optimized_model, epochs=300, batch_size=150, verbose=0)
history = model.fit(X_train, 
        y_train)
y_pred = model.predict(X_test)

In [21]:
adj_r2, mae, mse, rmse = regression_metrics('DNN_optimized', X_test, y_test, y_pred)
df_results_DNN_optimized = pd.DataFrame([adj_r2, mae, mse, rmse]).T
df_results_DNN_optimized.index =pd.Series(['DNN_optimized'])
df_results_DNN_optimized.columns =pd.Series(['R2', 'MAE', 'MSE','RMSE'])

DNN_optimized Evaluation Metrics:
Adjusted R2: 0.9071461234565261
MAE: 0.2280054098510702 
MSE: 3.2190973527298525
RMSE: 1.7941843140351696


In [22]:
df_results_DNN_optimized

Unnamed: 0,R2,MAE,MSE,RMSE
DNN_optimized,0.907146,0.228005,3.219097,1.794184


In [23]:
MAPE(y_test, y_pred)

2.35850645428462

# DNN-TL Model

In [22]:
# load the source dataset 
df_APEC_total=pd.read_excel('./data/df_APEC_total.xlsx',index_col=0)
df_PLEXIGLAS_total=pd.read_excel('./data/df_PLEXIGLAS_total.xlsx',index_col=0)
df_SABIC_total=pd.read_excel('./data/df_SABIC_total.xlsx',index_col=0)
df_ULTEM_total=pd.read_excel('./data/df_ULTEM_total.xlsx',index_col=0)
df_ULTRAMID_total=pd.read_excel('./data/df_ULTRAMID_total.xlsx',index_col=0)
df_VALOX_total=pd.read_excel('./data/df_VALOX_total.xlsx',index_col=0)
df = pd.concat([df_PLEXIGLAS_total, df_SABIC_total, df_ULTEM_total,
                df_ULTRAMID_total, df_VALOX_total])
data = df.fillna(0)
train_dataset = data.sample(frac=0.8,random_state=42)
test_dataset = data.drop(train_dataset.index)

train_stats = train_dataset.describe()
train_stats = train_stats.drop(['Part weight','Length','Angle', 'Height', 'Width'], axis=1)
train_stats = train_stats.transpose()
train_labels = train_dataset[['Part weight']]
test_labels = test_dataset[['Part weight']]

def norm(x):
    return (x - train_stats['mean']) / train_stats['std'] #zero-mean normalization, same with sklearn.preprocessing.StandardScaler
train_dataset = train_dataset.drop(['Part weight','Length','Angle','Height', 'Width'], axis=1)
test_dataset = test_dataset.drop(['Part weight','Length','Angle','Height', 'Width'], axis=1)
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

x_transf = np.array(normed_train_data.fillna(0), dtype=np.float)
y_transf = np.array(train_labels, dtype=np.float)
x_transf = x_transf.astype(float)
y_transf = y_transf.astype(float)

mol_num, feat_num = x_transf.shape
print("# molecules for transfer training = %i, # of features = %i\n" % (mol_num, feat_num))

# molecules for transfer training = 2895, # of features = 50



In [23]:
net = MLP1(feat_num=feat_num, 
           loss='mean_absolute_error')
net.fit(x_transf, 
        y_transf,
        epochs=300, 
        batch_size=int(x_transf.shape[0]/8), 
        verbose=0)
def transf_MLP(feat_num, idx, lhl_sizes, loss='mean_absolute_error'):
    global net  # net is a networks and cannot be pickled! Therefore it cannot be an input argument for cross_validate() to work!
    inp = Input(shape=(feat_num,))
    shared_layer = net.layers[0]
    shared_layer.trainable = True  # deactivate training in all re-used layers of MLP1
    out_tensor = shared_layer(inp)
    # idx = 1  # index of desired layer
    for i in range(1,idx+1):
        #print('Transfered layer %d' % i)
        shared_layer = net.layers[i]    # deactivate training in all re-used layers of MLP1
        shared_layer.trainable = True  # deactivate training in all re-used layers of MLP1
        out_tensor = shared_layer(out_tensor)
    # Here add all the new layers
    for l_size in lhl_sizes[idx:]:
        #print ('Rest layer size %d ' % l_size)
        out_tensor = Dense(l_size, kernel_initializer='normal', activation='relu')(out_tensor)
    # Close the network
    out_tensor = Dense(1, kernel_initializer='normal')(out_tensor)
    # Create the model
    transf_model = Model(inp, out_tensor)
    transf_model.compile(loss=loss, optimizer='adam')
    return transf_model

estimator = KerasRegressor(build_fn=transf_MLP,
                           feat_num=feat_num,
                           idx=3,
                           lhl_sizes=(50, 25, 10),
                           loss='mean_absolute_error',
                           epochs=300,
                           batch_size=int(X_train.shape[0]/8),
                           verbose=0)
history = estimator.fit(X_train, 
        y_train)

In [24]:
y_pred = estimator.predict(X_test)
adj_r2, mae, mse, rmse = regression_metrics('DNN_TL', X_test, y_test, y_pred)
df_results_DNN_TL = pd.DataFrame([adj_r2, mae, mse, rmse]).T
df_results_DNN_TL.index =pd.Series(['DNN_TL'])
df_results_DNN_TL.columns =pd.Series(['R2', 'MAE', 'MSE','RMSE'])

DNN_TL Evaluation Metrics:
Adjusted R2: 0.8934257691934733
MAE: 0.2650672558593752 
MSE: 3.0922587019362893
RMSE: 1.7584819310804105


In [25]:
df_results_DNN_TL

Unnamed: 0,R2,MAE,MSE,RMSE
DNN_TL,0.893426,0.265067,3.092259,1.758482


# DNN-EL Model

In [26]:
def MAPE(y_hat, y):
    '''
    Mean Absolute Percentage Error Metric
    '''
    abserror = np.abs(y_hat - y)
    return np.mean(abserror / y)*100
def regression_metrics(name,x , y, y_hat):
    mae = mean_absolute_error(y, y_hat)
    mse = np.mean(np.square((y - y_hat)))
    rmse = np.sqrt(mse)
    r2 = r2_score(y, y_hat)
    adj_r2 = 1 - (1-r2)*(len(y_hat)-1)/(len(y_hat)-x.shape[1]-1)
    print('{} Evaluation Metrics:'.format(name))
    print('Adjusted R2: {}\nMAE: {} \nMSE: {}\nRMSE: {}'.format(adj_r2,mae,mse,rmse))
    print('==================================================')
    return adj_r2, mae, mse, rmse

In [27]:
from keras.layers import Input, Dense, concatenate
def build_model():
    inputs = Input(shape=(feat_num, ))
    model1_1 = Dense(64, activation='relu')(inputs)
    model2_1 = Dense(50, activation='relu')(inputs)
    model3_1 = Dense(32, activation='relu')(inputs)
    model1_2 = Dense(32, activation='relu')(model1_1)
    model2_2 = Dense(25, activation='relu')(model2_1)
    model3_2 = Dense(16, activation='relu')(model3_1)
    model1_3 = Dense(16, activation='relu')(model1_2)
    model2_3 = Dense(10, activation='relu')(model2_2)
    model3_3 = Dense(8, activation='relu')(model3_2)
    con = concatenate([model1_3, model2_3, model3_3])
    output = Dense(1, activation='relu')(con)
    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam',
                  loss='mean_absolute_error')
    return model

model = KerasRegressor(build_fn=build_model, epochs=300, batch_size=int(X_train.shape[0]/8), verbose=0)
history = model.fit(X_train, 
        y_train)
y_pred = model.predict(X_test)

In [28]:
adj_r2, mae, mse, rmse = regression_metrics('DNN_EL', X_test, y_test, y_pred)
df_results_DNN_EL = pd.DataFrame([adj_r2, mae, mse, rmse]).T
df_results_DNN_EL.index =pd.Series(['DNN_EL'])
df_results_DNN_EL.columns =pd.Series(['R2', 'MAE', 'MSE','RMSE'])

DNN_EL Evaluation Metrics:
Adjusted R2: 0.8335910211252608
MAE: 0.36525455993652955 
MSE: 2.935402939507532
RMSE: 1.7133017654539238


In [29]:
df_results_DNN_EL

Unnamed: 0,R2,MAE,MSE,RMSE
DNN_EL,0.833591,0.365255,2.935403,1.713302
