In [2]:
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from pandas.plotting import parallel_coordinates
from PIL import Image
from io import BytesIO
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, LeaveOneOut

plt.style.use('classic')
%matplotlib inline

In [3]:
# read excel file
df = pd.read_excel('C:/Users/acc_a/OneDrive/PhOLED.xlsx', sheetname='Sheet2')

df = df[list(df.columns)[1:]]
print(df.shape)

#Checking for missing data
NAs = df.isnull().sum()
NAs.sort_values(0, ascending=False)
NAs[NAs > 0]

#drop the missing data
df.dropna(axis=0, inplace=True)
print(df.shape)

# copy original data
dfc = df.copy()
dfc.head()

(166, 32)
(165, 32)


Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,5.0
1,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,4.0
2,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,9.0
3,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,12.0
4,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,8.0


In [4]:
def insert_noise(noise):
    # insert some noise +0.1 or -0.1
    random.seed(12)
    
    dfc['homo_HIL'] = dfc['homo_HIL'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)

    htl1 = (-1)**random.randrange(2)*noise
    dfc['lumo_HTL1'] = dfc['lumo_HTL1'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)
    dfc['homo_HTL1'] = dfc['homo_HTL1'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)

    htl2 = (-1)**random.randrange(2)*noise
    dfc['lumo_HTL2'] = dfc['lumo_HTL2'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)
    dfc['homo_HTL2'] = dfc['homo_HTL2'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)

    htl3 = (-1)**random.randrange(2)*noise
    dfc['lumo_HTL3'] = dfc['lumo_HTL3'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)
    dfc['homo_HTL3'] = dfc['homo_HTL3'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)

    host = (-1)**random.randrange(2)*noise
    dfc['lumo_EML'] = dfc['lumo_EML'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)
    dfc['homo_EML'] = dfc['homo_EML'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)

    etl1 = (-1)**random.randrange(2)*noise
    dfc['lumo_ETL1'] = dfc['lumo_ETL1'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)
    dfc['homo_ETL1'] = dfc['homo_ETL1'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)

    etl2 = (-1)**random.randrange(2)*noise
    dfc['lumo_ETL2'] = dfc['lumo_ETL2'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)
    dfc['homo_ETL2'] = dfc['homo_ETL2'].apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, noise + 1, 1)*0.1)

    eil = (-1)**random.randrange(2)*noise
    dfc['cathode_workfunction'] = dfc['cathode_workfunction'].apply(lambda x: x + random.randrange(-1, 2, 1)* 
                                                                    random.randrange(0, noise + 1, 1)*0.1)

    return(dfc.head())
    

In [5]:
def forest():
    print('forest')
    regressor = RandomForestRegressor(random_state=7)
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=7)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))
    
def xgboost():
    from xgboost import XGBRegressor
    print('xgb')
    regressor = XGBRegressor()
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=7)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))
    
def adaboost():
    from sklearn.ensemble import AdaBoostRegressor
    print('adaboost')
    regressor = AdaBoostRegressor()
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=7)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))

def gboost():
    from sklearn.ensemble import GradientBoostingRegressor
    print('gboost')
    regressor = GradientBoostingRegressor()
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=7)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))
    
def knn():
    from sklearn.neighbors import KNeighborsRegressor
    print('knn')
    regressor = KNeighborsRegressor()
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=7)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))

In [6]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
rs = 57
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s: meanR2 %s mrmse %s R2 %s RMSE %s" % (m, r, i, t, s)
    print(score)

(165, 31)
forest
R2 scores:  [ 0.80151769  0.64388326  0.46495314  0.40660986  0.64525202]




xgb
R2 scores:  [ 0.76691916  0.54680535  0.48937726  0.39486063  0.62817162]
adaboost
R2 scores:  [ 0.74726508  0.49631642  0.51666154  0.35578638  0.60267223]
gboost
R2 scores:  [ 0.78243074  0.59013326  0.45990646  0.44287033  0.61895461]
knn
R2 scores:  [ 0.58271507  0.36026012  0.525702    0.21806664  0.53436878]


forest: meanR2 0.592443193073 mrmse 8.35100297573 R2 0.687124439984 RMSE 7.96295128917
xgb: meanR2 0.565226804062 mrmse 8.57423555625 R2 0.646299465035 RMSE 8.46654259089
adaboost: meanR2 0.543740329588 mrmse 8.85307387083 R2 0.464220171367 RMSE 10.4203286995
gboost: meanR2 0.578859078221 mrmse 8.26603442298 R2 0.614786770586 RMSE 8.83565675041
knn: meanR2 0.444222522257 mrmse 9.57045539256 R2 0.443385065556 RMSE 10.6210062992


# Insert noise to energy level by +/- 0.1

In [7]:
dfc = df.copy()
insert_noise(1)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.2,0,3.4,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.1,35.0,2.7,10.0,5.0
1,5.3,25.0,3.3,5.3,0,3.3,5.3,0,3.2,5.2,...,35,2.8,6.2,0,2.9,6.3,35.0,2.8,10.0,4.0
2,5.1,25.0,3.3,5.3,0,3.4,5.2,0,3.4,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,9.0
3,5.1,25.0,3.3,5.2,0,3.3,5.2,0,3.4,5.2,...,35,2.8,6.1,0,2.8,6.3,35.0,2.9,10.0,12.0
4,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,8.0


In [8]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)
X.head()

(165, 31)


Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,triplet_ETL,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL
0,5.2,25.0,3.3,5.2,0,3.4,5.2,0,3.3,5.2,...,2.6,35,2.8,6.2,0,2.8,6.1,35.0,2.7,10.0
1,5.3,25.0,3.3,5.3,0,3.3,5.3,0,3.2,5.2,...,2.6,35,2.8,6.2,0,2.9,6.3,35.0,2.8,10.0
2,5.1,25.0,3.3,5.3,0,3.4,5.2,0,3.4,5.2,...,2.6,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0
3,5.1,25.0,3.3,5.2,0,3.3,5.2,0,3.4,5.2,...,2.6,35,2.8,6.1,0,2.8,6.3,35.0,2.9,10.0
4,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,2.6,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0


In [9]:
model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s: meanR2 %s mrmse %s R2 %s RMSE %s" % (m, r, i, t, s)
    print(score)

forest
R2 scores:  [ 0.83104105  0.62086741  0.39994939  0.48331502  0.63964616]
xgb
R2 scores:  [ 0.72240665  0.60612371  0.50900413  0.47337776  0.626288  ]
adaboost
R2 scores:  [ 0.72731419  0.48296934  0.49700258  0.44946604  0.60847708]
gboost
R2 scores:  [ 0.71477028  0.46511806  0.51657264  0.4625086   0.7077393 ]
knn
R2 scores:  [ 0.56402982  0.3642544   0.52504415  0.21636572  0.5375953 ]


forest: meanR2 0.594963805316 mrmse 8.48164457607 R2 0.64732504464 RMSE 8.45425901197
xgb: meanR2 0.587440050775 mrmse 8.32403631929 R2 0.632290065789 RMSE 8.6325861403
adaboost: meanR2 0.553045845856 mrmse 8.82951669248 R2 0.450931644106 RMSE 10.5487608503
gboost: meanR2 0.573341775413 mrmse 8.18752703182 R2 0.583291397976 RMSE 9.18976639892
knn: meanR2 0.441457878066 mrmse 9.63798394742 R2 0.397429969811 RMSE 11.0507563317


# Random noise by +/- 0.2

In [10]:
dfc = df.copy()
insert_noise(2)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.1,5.2,0,3.5,5.2,0,3.3,5.2,...,35,3.0,6.1,0,2.9,6.2,35.0,2.9,10.0,5.0
1,5.4,25.0,3.5,5.1,0,3.2,5.2,0,3.3,5.2,...,35,2.8,6.1,0,2.6,6.2,35.0,2.9,10.0,4.0
2,5.3,25.0,3.3,5.0,0,3.5,5.4,0,3.3,5.2,...,35,2.8,6.4,0,2.8,6.3,35.0,2.8,10.0,9.0
3,5.1,25.0,3.3,5.2,0,3.1,5.2,0,3.3,5.2,...,35,2.6,6.2,0,2.6,6.2,35.0,2.8,10.0,12.0
4,5.1,25.0,3.3,5.2,0,3.3,5.2,0,3.1,5.2,...,35,2.8,6.1,0,2.7,6.0,35.0,2.8,10.0,8.0


In [11]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s: meanR2 %s mrmse %s R2 %s RMSE %s" % (m, r, i, t, s)
    print(score)

(165, 31)
forest
R2 scores:  [ 0.7892634   0.60824587  0.46954811  0.46421774  0.62465467]
xgb
R2 scores:  [ 0.75839503  0.48266221  0.55152824  0.46476447  0.58197266]
adaboost
R2 scores:  [ 0.71003838  0.51704775  0.50105167  0.39466766  0.62425966]
gboost
R2 scores:  [ 0.78588628  0.5103943   0.43098842  0.42498094  0.59439082]
knn
R2 scores:  [ 0.58521419  0.39884208  0.53543909  0.18543871  0.55022338]


forest: meanR2 0.591185956733 mrmse 8.51401172639 R2 0.637902997182 RMSE 8.56644644455
xgb: meanR2 0.567864523056 mrmse 8.67396192125 R2 0.593508395794 RMSE 9.07640840036
adaboost: meanR2 0.549413024733 mrmse 8.7379390545 R2 0.444564176145 RMSE 10.6097507799
gboost: meanR2 0.549328151669 mrmse 8.67469278973 R2 0.543746782716 RMSE 9.61592813718
knn: meanR2 0.451031488216 mrmse 9.6249700427 R2 0.406480561868 RMSE 10.9674512477


# +/- 0.3 noise

In [12]:
dfc = df.copy()
insert_noise(3)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.1,0,3.5,5.2,0,3.3,5.2,...,35,2.8,6.1,0,2.8,6.0,35.0,2.6,10.0,5.0
1,5.4,25.0,3.3,5.5,0,3.3,5.4,0,3.1,5.2,...,35,2.8,6.1,0,3.1,6.5,35.0,2.7,10.0,4.0
2,4.9,25.0,3.3,5.4,0,3.6,5.2,0,3.5,5.1,...,35,2.8,6.2,0,2.8,6.2,35.0,2.9,10.0,9.0
3,5.0,25.0,3.4,5.2,0,3.3,5.2,0,3.6,5.2,...,35,2.8,5.9,0,2.8,6.5,35.0,3.0,10.0,12.0
4,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.1,35.0,2.8,10.0,8.0


In [13]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s: meanR2 %s mrmse %s R2 %s RMSE %s" % (m, r, i, t, s)
    print(score)

(165, 31)
forest
R2 scores:  [ 0.65935292  0.61508729  0.27512617  0.50311455  0.67874571]
xgb
R2 scores:  [ 0.66054278  0.50591022  0.44397438  0.36081076  0.54379584]
adaboost
R2 scores:  [ 0.65082644  0.52017161  0.37234175  0.40958227  0.57180098]
gboost
R2 scores:  [ 0.64266031  0.45042192  0.31147992  0.46951912  0.5711238 ]
knn
R2 scores:  [ 0.56538775  0.35366378  0.52141943  0.2194256   0.53717077]


forest: meanR2 0.546285328728 mrmse 8.62129285157 R2 0.589015769445 RMSE 9.12642771296
xgb: meanR2 0.503006799173 mrmse 9.16359970851 R2 0.532719116904 RMSE 9.73144306487
adaboost: meanR2 0.50494461023 mrmse 8.87361936747 R2 0.470628929886 RMSE 10.3578195565
gboost: meanR2 0.489041014819 mrmse 9.15168877637 R2 0.560665375538 RMSE 9.43595704131
knn: meanR2 0.439413464873 mrmse 9.6398721221 R2 0.397833417496 RMSE 11.0470562235


# +/- 0.4 noise

In [14]:
dfc = df.copy()
insert_noise(4)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.6,0,3.4,5.2,0,3.4,5.2,...,35,3.2,6.2,0,2.8,6.2,35.0,2.8,10.0,5.0
1,5.6,25.0,3.5,5.2,0,3.1,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.6,6.2,35.0,2.8,10.0,4.0
2,5.4,25.0,3.3,5.0,0,3.3,5.4,0,3.3,5.4,...,35,3.0,6.2,0,2.8,6.4,35.0,2.5,10.0,9.0
3,4.9,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.0,...,35,2.8,6.2,0,2.8,6.1,35.0,2.6,10.0,12.0
4,5.0,25.0,3.3,5.5,0,3.0,5.2,0,3.3,5.1,...,35,2.7,6.4,0,2.8,6.3,35.0,2.8,10.0,8.0


In [15]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s: meanR2 %s mrmse %s R2 %s RMSE %s" % (m, r, i, t, s)
    print(score)

(165, 31)
forest
R2 scores:  [ 0.62654146  0.50125011  0.42428117  0.48398389  0.537993  ]
xgb
R2 scores:  [ 0.67543722  0.42494548  0.56254586  0.38969436  0.62233282]
adaboost
R2 scores:  [ 0.67371169  0.59640842  0.48974837  0.4320224   0.61307833]
gboost
R2 scores:  [ 0.69648101  0.45761771  0.41121801  0.35870279  0.63849245]
knn
R2 scores:  [ 0.56798144  0.38880561  0.52691234  0.2147984   0.54293296]


forest: meanR2 0.514809926739 mrmse 8.99754070808 R2 0.521127459977 RMSE 9.85140573576
xgb: meanR2 0.534991147502 mrmse 9.03632409803 R2 0.476723511625 RMSE 10.2980228212
adaboost: meanR2 0.560993843446 mrmse 9.16136928151 R2 0.400540384661 RMSE 11.0221979014
gboost: meanR2 0.512502391186 mrmse 8.96978300218 R2 0.525775300665 RMSE 9.80348128328
knn: meanR2 0.448286150383 mrmse 9.56529755422 R2 0.415267108585 RMSE 10.8859666869


# +/- 0.5 noise

In [16]:
dfc = df.copy()
insert_noise(5)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,2.9,5.1,0,3.7,5.2,0,3.3,5.3,...,35,3.2,6.0,0,3.1,6.2,35.0,3.1,10.0,5.0
1,5.6,25.0,3.8,4.9,0,3.0,5.2,0,3.3,5.3,...,35,2.8,6.0,0,2.4,6.2,35.0,3.1,10.0,4.0
2,5.4,25.0,3.3,4.8,0,3.8,5.7,0,3.3,5.2,...,35,2.8,6.6,0,2.8,6.4,35.0,2.8,10.0,9.0
3,4.9,25.0,3.3,5.2,0,2.8,5.2,0,3.3,5.2,...,35,2.3,6.2,0,2.3,6.2,35.0,2.7,10.0,12.0
4,5.0,25.0,3.3,5.1,0,3.3,5.1,0,2.9,5.2,...,35,2.8,5.9,0,2.5,5.8,35.0,2.7,10.0,8.0


In [17]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s: meanR2 %s mrmse %s R2 %s RMSE %s" % (m, r, i, t, s)
    print(score)

(165, 31)
forest
R2 scores:  [ 0.66829333  0.60168307  0.48691638  0.52122831  0.51062   ]
xgb
R2 scores:  [ 0.66940364  0.47593681  0.52585886  0.45813925  0.56799387]
adaboost
R2 scores:  [ 0.68778616  0.61348902  0.22634059  0.47910879  0.56617461]
gboost
R2 scores:  [ 0.65691673  0.47866355  0.45913552  0.44330457  0.63206333]
knn
R2 scores:  [ 0.57314106  0.38508998  0.53958343  0.20287467  0.55052135]


forest: meanR2 0.557748217985 mrmse 8.30033166693 R2 0.653830784874 RMSE 8.37591884531
xgb: meanR2 0.539466484223 mrmse 9.02737413831 R2 0.424334233837 RMSE 10.801235642
adaboost: meanR2 0.514579833048 mrmse 8.77251416191 R2 0.347813234656 RMSE 11.4967272935
gboost: meanR2 0.534016737307 mrmse 9.06645039852 R2 0.501697790549 RMSE 10.0492730234
knn: meanR2 0.450242100581 mrmse 9.73522927037 R2 0.392726175101 RMSE 11.0938048056


# +/- 0.6 noise

In [18]:
dfc = df.copy()
insert_noise(6)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.2,0,3.3,5.8,0,3.3,4.7,...,35,2.8,6.2,0,2.8,6.2,35.0,2.4,10.0,5.0
1,5.6,25.0,2.7,5.2,0,3.7,4.8,0,3.3,4.8,...,35,2.7,6.2,0,2.2,6.2,35.0,2.8,10.0,4.0
2,5.4,25.0,3.0,5.2,0,2.9,5.6,0,3.6,4.6,...,35,2.8,6.2,0,2.8,6.2,35.0,2.6,10.0,9.0
3,4.9,25.0,3.8,5.1,0,3.5,5.1,0,2.7,5.4,...,35,2.8,6.2,0,2.8,6.5,35.0,2.6,10.0,12.0
4,5.0,25.0,3.3,5.6,0,3.3,4.8,0,3.2,5.2,...,35,2.5,6.8,0,2.8,5.7,35.0,2.3,10.0,8.0


In [19]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s: meanR2 %s mrmse %s R2 %s RMSE %s" % (m, r, i, t, s)
    print(score)

(165, 31)
forest
R2 scores:  [ 0.65328054  0.48158875  0.39594366  0.43306421  0.61798013]
xgb
R2 scores:  [ 0.66163355  0.38686147  0.51848508  0.34488901  0.62766643]
adaboost
R2 scores:  [ 0.55730861  0.46529279  0.40115445  0.40464268  0.56791569]
gboost
R2 scores:  [ 0.71933631  0.42578568  0.37266132  0.35081858  0.64113092]
knn
R2 scores:  [ 0.59168462  0.39760939  0.54588342  0.24065743  0.52919074]


forest: meanR2 0.516371458028 mrmse 8.90341372294 R2 0.610518117066 RMSE 8.884477044
xgb: meanR2 0.507907106718 mrmse 8.9956864649 R2 0.486681568309 RMSE 10.1995654338
adaboost: meanR2 0.47926284494 mrmse 9.35355247072 R2 0.441087338164 RMSE 10.6429056789
gboost: meanR2 0.501946559436 mrmse 8.99481270339 R2 0.429336224439 RMSE 10.7542070037
knn: meanR2 0.461005121402 mrmse 9.58860123206 R2 0.406610553312 RMSE 10.9662501473
