In [1]:
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from pandas.plotting import parallel_coordinates
from PIL import Image
from io import BytesIO
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, LeaveOneOut

plt.style.use('classic')
%matplotlib inline

In [2]:
# read excel file
df = pd.read_excel('C:/Users/acc_a/OneDrive/PhOLED.xlsx', sheetname='Sheet2')

df = df[list(df.columns)[1:]]
print(df.shape)

#Checking for missing data
NAs = df.isnull().sum()
NAs.sort_values(0, ascending=False)
NAs[NAs > 0]

#drop the missing data
df.dropna(axis=0, inplace=True)
print(df.shape)

# copy original data
dfc = df.copy()
dfc.head()

(166, 32)
(165, 32)


Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,5.0
1,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,4.0
2,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,9.0
3,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,12.0
4,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,8.0


In [3]:
def insert_noise(variation):
    # insert some noise +0.1 or -0.1
    random.seed(12)
    
#     variation in integer ie 1-6
    
    ones = pd.Series(np.ones(len(dfc)), name = "x")
    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['homo_HIL'] = dfc['homo_HIL'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_HTL1'] = dfc['lumo_HTL1'] + noise
    dfc['homo_HTL1'] = dfc['homo_HTL1'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_HTL2'] = dfc['lumo_HTL2'] + noise
    dfc['homo_HTL2'] = dfc['homo_HTL2'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_HTL3'] = dfc['lumo_HTL3'] + noise
    dfc['homo_HTL3'] = dfc['homo_HTL3'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_EML'] = dfc['lumo_EML'] + noise
    dfc['homo_EML'] = dfc['homo_EML'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_ETL1'] = dfc['lumo_ETL1'] + noise
    dfc['homo_ETL1'] = dfc['homo_ETL1'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_ETL2'] = dfc['lumo_ETL2'] + noise
    dfc['homo_ETL2'] = dfc['homo_ETL2'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['cathode_workfunction'] = dfc['cathode_workfunction'] + noise
    
    dfc.dropna(axis=0, inplace=True)
    return(dfc.head())

In [10]:
rs = 7

def forest():
    print('forest')
    regressor = RandomForestRegressor(random_state=rs)
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=rs)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))
    
def xgboost():
    from xgboost import XGBRegressor
    print('xgb')
    regressor = XGBRegressor()
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=rs)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))
    
def adaboost():
    from sklearn.ensemble import AdaBoostRegressor
    print('adaboost')
    regressor = AdaBoostRegressor(random_state=rs)
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=rs)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))

def gboost():
    from sklearn.ensemble import GradientBoostingRegressor
    print('gboost')
    regressor = GradientBoostingRegressor(random_state=rs)
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=rs)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))
    
def knn():
    from sklearn.neighbors import KNeighborsRegressor
    print('knn')
    regressor = KNeighborsRegressor()
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=rs)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))

In [42]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
print('in order of meanR2, mrmse, R2, RMSE')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s:  %s  %s  %s  %s" % (m, r, i, t, s)
    print(score)

(165, 31)
forest
R2 scores:  [ 0.80151769  0.64388326  0.46495314  0.40660986  0.64525202]
xgb
R2 scores:  [ 0.76691916  0.54680535  0.48937726  0.39486063  0.62817162]
adaboost
R2 scores:  [ 0.78638443  0.50587841  0.49717242  0.33118365  0.55812507]
gboost
R2 scores:  [ 0.80741015  0.59650479  0.45478058  0.46285506  0.60093368]
knn
R2 scores:  [ 0.58271507  0.36026012  0.525702    0.21806664  0.53436878]


in order of meanR2, mrmse, R2, RMSE
forest:  0.592443193073  8.35100297573  0.802993835594  5.69588189937
xgb:  0.565226804062  8.57423555625  0.682470383837  7.23124685275
adaboost:  0.535748794746  8.64385801442  0.732080119541  6.64237726508
gboost:  0.584496852025  8.31211382507  0.71257084436  6.87996894045
knn:  0.444222522257  9.57045539256  0.481476888979  9.24070544751


# Insert noise to energy level by +/- 0.1

In [13]:
dfc = df.copy()
insert_noise(1)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.4,5.3,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,5.0
1,5.3,25.0,3.3,5.2,0,3.4,5.3,0,3.4,5.3,...,35,2.7,6.1,0,2.8,6.2,35.0,2.8,10.0,4.0
2,5.1,25.0,3.3,5.2,0,3.4,5.3,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,9.0
3,5.1,25.0,3.3,5.2,0,3.3,5.2,0,3.4,5.3,...,35,2.9,6.3,0,2.8,6.2,35.0,2.8,10.0,12.0
4,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.9,6.3,0,2.8,6.2,35.0,2.8,10.0,8.0


In [14]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)
X.head()

(164, 31)


Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,triplet_ETL,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL
0,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.4,5.3,...,2.6,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0
1,5.3,25.0,3.3,5.2,0,3.4,5.3,0,3.4,5.3,...,2.6,35,2.7,6.1,0,2.8,6.2,35.0,2.8,10.0
2,5.1,25.0,3.3,5.2,0,3.4,5.3,0,3.3,5.2,...,2.6,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0
3,5.1,25.0,3.3,5.2,0,3.3,5.2,0,3.4,5.3,...,2.6,35,2.9,6.3,0,2.8,6.2,35.0,2.8,10.0
4,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,2.6,35,2.9,6.3,0,2.8,6.2,35.0,2.8,10.0


In [15]:
model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
print('in order of meanR2, mrmse, R2, RMSE')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s:  %s  %s  %s  %s" % (m, r, i, t, s)
    print(score)

forest
R2 scores:  [ 0.49328395  0.67418293  0.52632633  0.51513775  0.5813136 ]
xgb
R2 scores:  [ 0.57063648  0.60125539  0.42207582  0.33513526  0.57206204]
adaboost
R2 scores:  [ 0.49801991  0.60434005  0.46842339  0.38846722  0.5974812 ]
gboost
R2 scores:  [ 0.55934     0.67869939  0.53043058  0.29470189  0.5863968 ]
knn
R2 scores:  [ 0.42368202  0.44139808  0.43705956  0.12814015  0.5475308 ]


in order of meanR2, mrmse, R2, RMSE
forest:  0.558048912393  8.31265798977  0.577617642716  8.90903801451
xgb:  0.50023300038  9.05423073587  0.597260550908  8.69941427881
adaboost:  0.511346353273  8.87731597683  0.612388387647  8.53446530068
gboost:  0.52991373189  9.10298838717  0.556098266245  9.1331659291
knn:  0.395562122818  9.60183728719  0.394581482265  10.6661031037


# Random noise by +/- 0.2

In [16]:
dfc = df.copy()
insert_noise(2)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,3.0,6.4,0,2.7,6.1,35.0,2.8,10.0,5.0
1,5.4,25.0,3.5,5.4,0,3.3,5.2,0,3.5,5.4,...,35,2.7,6.1,0,2.6,6.0,35.0,2.8,10.0,4.0
2,5.3,25.0,3.4,5.3,0,3.3,5.2,0,3.2,5.1,...,35,2.8,6.2,0,2.6,6.0,35.0,2.7,10.0,9.0
3,5.1,25.0,3.3,5.2,0,3.4,5.3,0,3.5,5.4,...,35,2.7,6.1,0,2.7,6.1,35.0,2.8,10.0,12.0
4,5.1,25.0,3.2,5.1,0,3.3,5.2,0,3.1,5.0,...,35,2.9,6.3,0,2.7,6.1,35.0,2.8,10.0,8.0


In [17]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
print('in order of meanR2, mrmse, R2, RMSE')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s:  %s  %s  %s  %s" % (m, r, i, t, s)
    print(score)

(164, 31)
forest
R2 scores:  [ 0.47254337  0.59669539  0.49978338  0.47790216  0.59903395]
xgb
R2 scores:  [ 0.46414569  0.57731857  0.43130284  0.33604438  0.55163664]
adaboost
R2 scores:  [ 0.51587153  0.52877409  0.42379825  0.44960665  0.51914626]
gboost
R2 scores:  [ 0.480628    0.60141883  0.44027983  0.37945126  0.5726303 ]
knn
R2 scores:  [ 0.42204497  0.43063962  0.44146604  0.1574853   0.54442318]


in order of meanR2, mrmse, R2, RMSE
forest:  0.529191648077  8.75325247818  0.464494908419  10.0313579918
xgb:  0.472089623163  9.08059263946  0.522856821402  9.46896010866
adaboost:  0.487439356255  8.92257730354  0.512794990717  9.56827832471
gboost:  0.494881642801  9.17096353525  0.492477093486  9.76575335447
knn:  0.399211821468  9.66467826878  0.383491520829  10.763349789


# +/- 0.3 noise

In [18]:
dfc = df.copy()
insert_noise(3)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.2,0,3.2,5.1,0,3.6,5.5,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,5.0
1,5.4,25.0,3.3,5.2,0,3.6,5.5,0,3.5,5.4,...,35,2.6,6.0,0,2.8,6.2,35.0,2.8,10.0,4.0
2,4.9,25.0,3.3,5.2,0,3.5,5.4,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,9.0
3,5.0,25.0,3.4,5.3,0,3.3,5.2,0,3.6,5.5,...,35,3.0,6.4,0,2.8,6.2,35.0,2.8,10.0,12.0
4,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,3.0,6.4,0,2.7,6.1,35.0,2.9,10.0,8.0


In [19]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
print('in order of meanR2, mrmse, R2, RMSE')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s:  %s  %s  %s  %s" % (m, r, i, t, s)
    print(score)

(164, 31)
forest
R2 scores:  [ 0.57995445  0.68892559  0.39539542  0.51173935  0.4486158 ]
xgb
R2 scores:  [ 0.5847782   0.60735583  0.3971819   0.38684554  0.51635859]
adaboost
R2 scores:  [ 0.57747698  0.57707169  0.40316689  0.3199631   0.55682144]
gboost
R2 scores:  [ 0.62488791  0.63656172  0.38829853  0.32540348  0.50910133]
knn
R2 scores:  [ 0.4237961   0.44565269  0.43794061  0.15098877  0.54228562]


in order of meanR2, mrmse, R2, RMSE
forest:  0.524926122409  8.44560957223  0.492193208121  9.76848423306
xgb:  0.49850401508  8.89673620473  0.630567053812  8.33193242384
adaboost:  0.486900021115  8.85074434376  0.54440616164  9.25266520663
gboost:  0.496850592467  8.82487754945  0.568846359108  9.00106620759
knn:  0.40013275803  9.65958125691  0.404179730149  10.5812156154


# +/- 0.4 noise

In [20]:
dfc = df.copy()
insert_noise(4)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.7,5.6,0,3.3,5.2,0,3.2,5.1,...,35,2.9,6.3,0,2.6,6.0,35.0,2.8,10.0,5.0
1,5.6,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,3.0,6.4,0,2.7,6.1,35.0,2.8,10.0,4.0
2,5.4,25.0,3.3,5.2,0,3.7,5.6,0,3.2,5.1,...,35,2.4,5.8,0,2.8,6.2,35.0,3.2,10.0,9.0
3,4.9,25.0,3.5,5.4,0,3.3,5.2,0,3.1,5.0,...,35,2.6,6.0,0,2.8,6.2,35.0,3.0,10.0,12.0
4,5.0,25.0,3.3,5.2,0,3.1,5.0,0,3.3,5.2,...,35,2.5,5.9,0,2.8,6.2,35.0,2.8,10.0,8.0


In [21]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
print('in order of meanR2, mrmse, R2, RMSE')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s:  %s  %s  %s  %s" % (m, r, i, t, s)
    print(score)

(164, 31)
forest
R2 scores:  [ 0.61722657  0.67791361  0.49833279  0.66266432  0.56772143]
xgb
R2 scores:  [ 0.46027892  0.6000891   0.61522789  0.49233186  0.58532973]
adaboost
R2 scores:  [ 0.47145486  0.64498183  0.47864578  0.4541017   0.67360143]
gboost
R2 scores:  [ 0.4409389   0.58192778  0.5012171   0.42149183  0.56448167]
knn
R2 scores:  [ 0.43256939  0.41896764  0.4095711   0.18226026  0.53977095]


in order of meanR2, mrmse, R2, RMSE
forest:  0.604771743838  8.9383674055  0.395182961245  10.6608034384
xgb:  0.550651497888  9.04101782739  0.418918295933  10.4495244859
adaboost:  0.544557119949  9.01926755621  0.515096142882  9.5456552766
gboost:  0.502011455325  8.98826346816  0.286888345372  11.5759513898
knn:  0.396627866852  9.69814731579  0.368173704668  10.8962428859


# +/- 0.5 noise

In [26]:
dfc = df.copy()
insert_noise(5)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,3.2,6.6,0,2.6,6.0,35.0,2.8,10.0,5.0
1,5.6,25.0,3.7,5.6,0,3.3,5.2,0,3.7,5.6,...,35,2.6,6.0,0,2.3,5.7,35.0,2.8,10.0,4.0
2,5.4,25.0,3.5,5.4,0,3.3,5.2,0,3.0,4.9,...,35,2.9,6.3,0,2.3,5.7,35.0,2.6,10.0,9.0
3,4.9,25.0,3.3,5.2,0,3.5,5.4,0,3.8,5.7,...,35,2.6,6.0,0,2.5,5.9,35.0,2.7,10.0,12.0
4,5.0,25.0,3.1,5.0,0,3.2,5.1,0,2.8,4.7,...,35,3.0,6.4,0,2.6,6.0,35.0,2.8,10.0,8.0


In [27]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
print('in order of meanR2, mrmse, R2, RMSE')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s:  %s  %s  %s  %s" % (m, r, i, t, s)
    print(score)

(164, 31)
forest
R2 scores:  [ 0.47406548  0.57967424  0.46116634  0.38637176  0.60920324]
xgb
R2 scores:  [ 0.47734738  0.59413617  0.409579    0.37518266  0.46846496]
adaboost
R2 scores:  [ 0.44647059  0.55967947  0.4520792   0.47178979  0.50294166]
gboost
R2 scores:  [ 0.50339478  0.63863238  0.39671199  0.41453874  0.50816542]
knn
R2 scores:  [ 0.42201955  0.44603821  0.43514146  0.08940641  0.54234286]


in order of meanR2, mrmse, R2, RMSE
forest:  0.502096210782  8.99413153995  0.467009750696  10.00777561
xgb:  0.464942035402  9.21673872125  0.475252804786  9.93008555361
adaboost:  0.486592143092  9.34475524261  0.478856356833  9.8959307971
gboost:  0.492288659494  9.27574966682  0.467389699874  10.0042078862
knn:  0.38698970013  9.65664364899  0.380720682265  10.7875100974


# +/- 0.6 noise

In [24]:
dfc = df.copy()
insert_noise(6)

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.1,5.0,...,35,2.4,5.8,0,2.8,6.2,35.0,2.6,10.0,5.0
1,5.6,25.0,2.7,4.6,0,3.3,5.2,0,3.7,5.6,...,35,2.5,5.9,0,2.2,5.6,35.0,3.2,10.0,4.0
2,5.4,25.0,3.0,4.9,0,3.3,5.2,0,3.4,5.3,...,35,2.8,6.2,0,2.3,5.7,35.0,2.8,10.0,9.0
3,4.9,25.0,3.8,5.7,0,3.2,5.1,0,3.9,5.8,...,35,2.8,6.2,0,2.4,5.8,35.0,2.8,10.0,12.0
4,5.0,25.0,3.3,5.2,0,3.7,5.6,0,3.5,5.4,...,35,3.1,6.5,0,2.2,5.6,35.0,2.8,10.0,8.0


In [25]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
print('in order of meanR2, mrmse, R2, RMSE')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s:  %s  %s  %s  %s" % (m, r, i, t, s)
    print(score)

(164, 31)
forest
R2 scores:  [ 0.66096803  0.6565389   0.50936928  0.57316203  0.61835617]
xgb
R2 scores:  [ 0.60206496  0.64718912  0.58083428  0.3652684   0.57477698]
adaboost
R2 scores:  [ 0.53470428  0.50890184  0.39375607  0.41984231  0.56236178]
gboost
R2 scores:  [ 0.60069842  0.58669276  0.50500636  0.30201048  0.52124111]
knn
R2 scores:  [ 0.4398158   0.44935276  0.46721646  0.10888387  0.55649308]


in order of meanR2, mrmse, R2, RMSE
forest:  0.603678884076  9.13045985192  0.584403951232  8.8371785998
xgb:  0.554026747364  8.85051144127  0.588097751884  8.79781870668
adaboost:  0.483913256167  9.28973168565  0.541068200409  9.2864987122
gboost:  0.503129824243  9.09979350123  0.602496234821  8.64268233388
knn:  0.40435239642  9.63891586169  0.386353172448  10.7383405767


In [116]:
regressor = RandomForestRegressor(random_state=rs)
regressor.fit(X_train, y_train.values.ravel())
y_predictions = regressor.predict(X_test)

# flatten array using ravel()
dataset = pd.DataFrame({'true_y':list(y_test.values.ravel()), 'pred_y': list(y_predictions)}, 
                       columns=['true_y', 'pred_y'], index=y_test.index.values)

dataset['error'] = abs(dataset['true_y'] - dataset['pred_y'])
# print(len(dataset))
dataset = dataset.sort_values(by='error', ascending=False)

index = dataset.index.values[:21]

In [119]:
X_test.iloc[index]

IndexError: positional indexers are out-of-bounds

# Changing train: test ratio

In [28]:
dfc = df.copy()

X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
print('in order of meanR2, mrmse, R2, RMSE')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s:  %s  %s  %s  %s" % (m, r, i, t, s)
    print(score)

(165, 31)
forest
R2 scores:  [ 0.80151769  0.64388326  0.46495314  0.40660986  0.64525202]
xgb
R2 scores:  [ 0.76691916  0.54680535  0.48937726  0.39486063  0.62817162]
adaboost
R2 scores:  [ 0.78638443  0.50587841  0.49717242  0.33118365  0.55812507]
gboost
R2 scores:  [ 0.80741015  0.59650479  0.45478058  0.46285506  0.60093368]
knn
R2 scores:  [ 0.58271507  0.36026012  0.525702    0.21806664  0.53436878]


in order of meanR2, mrmse, R2, RMSE
forest:  0.592443193073  8.35100297573  0.577024462919  8.75021553265
xgb:  0.565226804062  8.57423555625  0.568966767614  8.83316825362
adaboost:  0.535748794746  8.64385801442  0.585216831838  8.66506225963
gboost:  0.584496852025  8.31211382507  0.565036454603  8.8733488378
knn:  0.444222522257  9.57045539256  0.335590638886  10.9667765989


In [29]:
dfc = df.copy()

X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=rs)
print(X.shape)

model = ['forest', 'xgb', 'adaboost', 'gboost', 'knn']
mr2 = []
mrmse = []
tr2 = []
trmse = []
forest()
xgboost()
adaboost()
gboost()
knn()

score = [model, mr2, mrmse, tr2, trmse]
print('\n')
print('in order of meanR2, mrmse, R2, RMSE')
for m, r, i, t, s in zip(model, mr2, mrmse, tr2, trmse):
    score = "%s:  %s  %s  %s  %s" % (m, r, i, t, s)
    print(score)

(165, 31)
forest
R2 scores:  [ 0.80151769  0.64388326  0.46495314  0.40660986  0.64525202]
xgb
R2 scores:  [ 0.76691916  0.54680535  0.48937726  0.39486063  0.62817162]
adaboost
R2 scores:  [ 0.78638443  0.50587841  0.49717242  0.33118365  0.55812507]
gboost
R2 scores:  [ 0.80741015  0.59650479  0.45478058  0.46285506  0.60093368]
knn
R2 scores:  [ 0.58271507  0.36026012  0.525702    0.21806664  0.53436878]


in order of meanR2, mrmse, R2, RMSE
forest:  0.592443193073  8.35100297573  0.262360074889  11.5784994813
xgb:  0.565226804062  8.57423555625  0.275195188106  11.4773230798
adaboost:  0.535748794746  8.64385801442  0.325831704204  11.0691490525
gboost:  0.584496852025  8.31211382507  0.322397097564  11.0973095832
knn:  0.444222522257  9.57045539256  0.03137938766  13.2680447099
