In [1]:
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from pandas.plotting import parallel_coordinates
from PIL import Image
from io import BytesIO
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, LeaveOneOut

plt.style.use('classic')
%matplotlib inline

In [2]:
# read excel file
df = pd.read_excel('C:/Users/acc_a/OneDrive/PhOLED.xlsx', sheetname='Sheet2')

df = df[list(df.columns)[1:]]
print(df.shape)

#Checking for missing data
NAs = df.isnull().sum()
NAs.sort_values(0, ascending=False)
NAs[NAs > 0]

#drop the missing data
df.dropna(axis=0, inplace=True)
print(df.shape)

# copy original data
dfc = df.copy()
dfc.head()

(166, 32)
(165, 32)


Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL,efficiency
0,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,5.0
1,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,4.0
2,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,9.0
3,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,12.0
4,5.2,25.0,3.3,5.2,0,3.3,5.2,0,3.3,5.2,...,35,2.8,6.2,0,2.8,6.2,35.0,2.8,10.0,8.0


In [3]:
def insert_noise(variation):    
    # insert some noise +0.1 or -0.1
    random.seed(12)
    
    # variation in integer ie 1-6
    # change ln to df instead of dfc

    ones = pd.Series(np.ones(len(df)), name = "x")
    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['homo_HIL'] = dfc['homo_HIL'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_HTL1'] = dfc['lumo_HTL1'] + noise
    dfc['homo_HTL1'] = dfc['homo_HTL1'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_HTL2'] = dfc['lumo_HTL2'] + noise
    dfc['homo_HTL2'] = dfc['homo_HTL2'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_HTL3'] = dfc['lumo_HTL3'] + noise
    dfc['homo_HTL3'] = dfc['homo_HTL3'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_EML'] = dfc['lumo_EML'] + noise
    dfc['homo_EML'] = dfc['homo_EML'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_ETL1'] = dfc['lumo_ETL1'] + noise
    dfc['homo_ETL1'] = dfc['homo_ETL1'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['lumo_ETL2'] = dfc['lumo_ETL2'] + noise
    dfc['homo_ETL2'] = dfc['homo_ETL2'] + noise

    noise = ones.apply(lambda x: x + random.randrange(-1, 2, 1)* random.randrange(0, variation + 1, 1)*0.1) - 1
    dfc['cathode_workfunction'] = dfc['cathode_workfunction'] + noise
    
    dfc.dropna(axis=0, inplace=True)
    return(dfc.head())

In [4]:
rs = 7

def forest():
    print('forest')
    regressor = RandomForestRegressor(random_state=rs)
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    kfold = KFold(n_splits=5, shuffle=True, random_state=rs)
    loo = LeaveOneOut()
    r2_scores = cross_val_score(regressor, X, y.values.ravel(), cv=kfold)
    print('R2 scores: ', r2_scores)

    mse_scores = cross_val_score(regressor, X, y.values.ravel(), cv=loo, scoring='neg_mean_squared_error')
    
    mr2.append(r2_scores.mean())
    mrmse.append(np.sqrt(-mse_scores.mean()))
    tr2.append(regressor.score(X_test, y_test))
    trmse.append(np.sqrt(mean_squared_error(y_test, y_predictions)))
    

In [5]:
# divide dfc into training and testing
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

(165, 31)


In [6]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

# feed into algorithm
regressor = RandomForestRegressor(random_state=rs)
regressor.fit(X_train, y_train.values.ravel())
y_predictions = regressor.predict(X_test)

# print r2 score
print('R-squared test: ', regressor.score(X_test, y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_predictions)))

# see how changed feature rank on error
# flatten array using ravel()
dataset = pd.DataFrame({'true_y':list(y_test.values.ravel()), 'pred_y': list(y_predictions)}, 
                       columns=['true_y', 'pred_y'], index=y_test.index.values)

dataset['error'] = abs(dataset['true_y'] - dataset['pred_y'])
# print(len(dataset))
dataset = dataset.sort_values(by='error', ascending=False)

index = dataset.index.values[:21]
dataset.head()

(165, 31)
R-squared test:  0.802993835594
RMSE:  5.69588189937


Unnamed: 0,true_y,pred_y,error
117,57.6,44.440333,13.159667
105,45.4,33.71,11.69
142,25.0,35.64,10.64
107,42.9,32.43,10.47
63,15.2,25.556667,10.356667


In [7]:
# see the feature point for low scoring test data
X_test.loc[index, :].head()

Unnamed: 0,homo_HIL,thickness_HIL,lumo_HTL1,homo_HTL1,thickness_HTL1,lumo_HTL2,homo_HTL2,thickness_HTL2,lumo_HTL3,homo_HTL3,...,triplet_ETL,total_ETL_thickness,lumo_ETL1,homo_ETL1,thickness_ETL1,lumo_ETL2,homo_ETL2,thickness_ETL2,cathode_workfunction,thickness_EIL
117,5.2,60.0,2.0,5.5,45,2.0,5.5,0,2.0,5.5,...,2.75,35,2.73,6.68,0,2.73,6.68,35.0,3.5,0.5
105,9.5,10.0,2.5,5.2,10,2.4,5.7,5,2.39,5.75,...,2.75,50,2.73,6.68,0,2.73,6.68,50.0,2.8,1.0
142,5.3,5.0,2.5,5.2,70,2.4,5.7,5,2.4,5.7,...,2.75,35,2.73,6.68,0,2.73,6.68,35.0,2.2,2.0
107,9.5,10.0,2.5,5.2,10,2.4,5.7,5,2.46,5.71,...,2.75,50,2.73,6.68,0,2.73,6.68,50.0,2.8,1.0
63,5.2,60.0,2.2,5.8,10,2.2,5.8,0,2.2,5.8,...,2.7,30,2.91,6.56,0,2.91,6.56,30.0,3.5,1.0


In [8]:
# new dfc
def repeat(index, noise):
    dfc = df.copy()

    # insert noise to selected part of dfc
    dfc = dfc.loc[index, :] # change len in insert noise function so the index are the same
    insert_noise(noise)

    # rename changed part to new
    new = dfc
    dfc = df.copy()

    # replace part of dfc with new, 
    dfc.loc[index, :] = new
    dfc.dropna(axis=0, inplace=True)

    # split new dfc - train/test
    X = dfc.iloc[:, :-1]
    y = dfc[['efficiency']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)

    # feed into algorithm
    regressor = RandomForestRegressor(random_state=rs)
    regressor.fit(X_train, y_train.values.ravel())
    y_predictions = regressor.predict(X_test)

    # print r2 score
    print('R-squared test: ', regressor.score(X_test, y_test))
    print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_predictions)))

    # see how changed feature rank on error
    # flatten array using ravel()
    dataset = pd.DataFrame({'true_y':list(y_test.values.ravel()), 'pred_y': list(y_predictions)}, 
                           columns=['true_y', 'pred_y'], index=y_test.index.values)

    dataset['error'] = abs(dataset['true_y'] - dataset['pred_y'])
    # print(len(dataset))
    dataset = dataset.sort_values(by='error', ascending=False)
    
    global new_index
    new_index = dataset.index.values[:21]
    
    return (dataset.iloc[:5])


### note

the test set got worse after adding noise

In [9]:
# new dfc
dfc = df.copy()

# insert noise to selected part of dfc
dfc = dfc.loc[index, :] # change len in insert noise function so the index are the same
insert_noise(6)

# rename changed part to new
new = dfc
dfc = df.copy()
print(dfc.shape)

# replace part of dfc with new, 
dfc.loc[index, :] = new
dfc.dropna(axis=0, inplace=True)
print(dfc.shape)

(165, 32)
(164, 32)


In [10]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

# feed into algorithm
regressor = RandomForestRegressor(random_state=rs)
regressor.fit(X_train, y_train.values.ravel())
y_predictions = regressor.predict(X_test)

# print r2 score
print('R-squared test: ', regressor.score(X_test, y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_predictions)))

# see how changed feature rank on error
# flatten array using ravel()
dataset = pd.DataFrame({'true_y':list(y_test.values.ravel()), 'pred_y': list(y_predictions)}, 
                       columns=['true_y', 'pred_y'], index=y_test.index.values)

dataset['error'] = abs(dataset['true_y'] - dataset['pred_y'])
# print(len(dataset))
dataset = dataset.sort_values(by='error', ascending=False)

index = dataset.index.values[:21]
dataset.head()

(164, 31)
R-squared test:  0.549648209321
RMSE:  9.19928077209


Unnamed: 0,true_y,pred_y,error
126,51.9,22.23,29.67
123,9.95,33.22,23.27
81,22.4,40.477436,18.077436
141,38.8,21.34,17.46
13,64.379487,49.269231,15.110256


In [11]:
# new dfc
dfc = df.copy()

# insert noise to selected part of dfc
dfc = dfc.loc[index, :] # change len in insert noise function so the index are the same
insert_noise(6)

# rename changed part to new
new = dfc
dfc = df.copy()
print(dfc.shape)
# replace part of dfc with new, 
dfc.loc[index, :] = new
dfc.dropna(axis=0, inplace=True)
dfc.shape

(165, 32)


(165, 32)

In [12]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

# feed into algorithm
regressor = RandomForestRegressor(random_state=rs)
regressor.fit(X_train, y_train.values.ravel())
y_predictions = regressor.predict(X_test)

# print r2 score
print('R-squared test: ', regressor.score(X_test, y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_predictions)))

# see how changed feature rank on error
# flatten array using ravel()
dataset = pd.DataFrame({'true_y':list(y_test.values.ravel()), 'pred_y': list(y_predictions)}, 
                       columns=['true_y', 'pred_y'], index=y_test.index.values)

dataset['error'] = abs(dataset['true_y'] - dataset['pred_y'])
# print(len(dataset))
dataset = dataset.sort_values(by='error', ascending=False)

index = dataset.index.values[:21]
dataset.head()

(165, 31)
R-squared test:  0.725942818117
RMSE:  6.71802571474


Unnamed: 0,true_y,pred_y,error
117,57.6,38.375333,19.224667
107,42.9,27.026282,15.873718
24,15.3,29.701538,14.401538
108,41.7,28.238333,13.461667
11,50.415385,39.015641,11.399744


In [13]:
# new dfc
dfc = df.copy()

# insert noise to selected part of dfc
dfc = dfc.loc[index, :] # change len in insert noise function so the index are the same
insert_noise(6)

# rename changed part to new
new = dfc
dfc = df.copy()
print(dfc.shape)
# replace part of dfc with new, 
dfc.loc[index, :] = new
dfc.dropna(axis=0, inplace=True)
dfc.shape

(165, 32)


(164, 32)

In [14]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

# feed into algorithm
regressor = RandomForestRegressor(random_state=rs)
regressor.fit(X_train, y_train.values.ravel())
y_predictions = regressor.predict(X_test)

# print r2 score
print('R-squared test: ', regressor.score(X_test, y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_predictions)))

# see how changed feature rank on error
# flatten array using ravel()
dataset = pd.DataFrame({'true_y':list(y_test.values.ravel()), 'pred_y': list(y_predictions)}, 
                       columns=['true_y', 'pred_y'], index=y_test.index.values)

dataset['error'] = abs(dataset['true_y'] - dataset['pred_y'])
# print(len(dataset))
dataset = dataset.sort_values(by='error', ascending=False)

index = dataset.index.values[:21]
dataset.head()

(164, 31)
R-squared test:  0.509913911
RMSE:  9.59652756057


Unnamed: 0,true_y,pred_y,error
126,51.9,21.98,29.92
123,9.95,34.472462,24.522462
81,22.4,44.361026,21.961026
13,64.379487,47.258718,17.120769
141,38.8,23.26,15.54


In [15]:
# new dfc
dfc = df.copy()

# insert noise to selected part of dfc
dfc = dfc.loc[index, :] # change len in insert noise function so the index are the same
insert_noise(6)

# rename changed part to new
new = dfc
dfc = df.copy()
print(dfc.shape)
# replace part of dfc with new
dfc.loc[index, :] = new
dfc.dropna(axis=0, inplace=True)
dfc.shape

(165, 32)


(165, 32)

In [16]:
X = dfc.iloc[:, :-1]
y = dfc[['efficiency']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=rs)
print(X.shape)

# feed into algorithm
regressor = RandomForestRegressor(random_state=rs)
regressor.fit(X_train, y_train.values.ravel())
y_predictions = regressor.predict(X_test)

# print r2 score
print('R-squared test: ', regressor.score(X_test, y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_predictions)))

# see how changed feature rank on error
# flatten array using ravel()
dataset = pd.DataFrame({'true_y':list(y_test.values.ravel()), 'pred_y': list(y_predictions)}, 
                       columns=['true_y', 'pred_y'], index=y_test.index.values)

dataset['error'] = abs(dataset['true_y'] - dataset['pred_y'])
# print(len(dataset))
dataset = dataset.sort_values(by='error', ascending=False)

index = dataset.index.values[:21]
dataset.head()

(165, 31)
R-squared test:  0.726290714878
RMSE:  6.71376032465


Unnamed: 0,true_y,pred_y,error
117,57.6,38.375333,19.224667
107,42.9,27.026282,15.873718
24,15.3,29.701538,14.401538
108,41.7,28.238333,13.461667
11,50.415385,39.015641,11.399744
