In [1]:
import sys, time, pickle
import sklearn.preprocessing
from scipy import stats
import pandas as pd
import colorlover as cl
import numpy as np
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
import plotly.io as pio
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
plotly.io.orca.config.executable = '/anaconda2/envs/pytorch/bin/orca'
init_notebook_mode(connected=True)

glob_layout = go.Layout(
    font=dict(family='Helvetica', size=24, color='black'),
    margin=dict(l=100, r=10, t=10, b=100),
    xaxis=dict(showgrid=False,  zeroline=False, ticks="inside", showline=True,
               tickwidth=3, linewidth=3, ticklen=10,
               mirror="allticks", color="black"),
    yaxis=dict(showgrid=False,  zeroline=False, ticks="inside", showline=True,
               tickwidth=3, linewidth=3, ticklen=10,
               mirror="allticks", color="black"),
    legend_orientation="v",
)

In [2]:
from perato_utils import is_pareto, area_under_pareto
from plot_utils import * # plot_pareto_front comes from here
from gpr import process_generation_2DEI, gp_predict
from ei import getEiVec2D_aug, get_ei_samples_kmeans, get_ei_samples_kmedoids

In [3]:
fnames = pickle.load(open('fnames.pkl', 'rb'))
df = pd.read_json('df_RAC155_homogap.json', orient='records', lines=True)
df = df.sample(n=int(1*len(df)), random_state=0)
df = df.reset_index()
y1l = 'alphaHOMO'
y2l = 'gap'

In [4]:
df_all = df.copy()
calibration_points = df.sample(n=int(100), random_state=1234)
df = df[~df["unique_name"].isin(calibration_points['unique_name'].values)]

In [5]:
tb_count = 0
tot_gen = 100
points_per_gen = 150
start_percentage = 1
known_points = df.sample(n=int(start_percentage*0.01*len(df)), random_state=1234)
pareto_inds_known, pareto_points_known = is_pareto(points=known_points[[y1l, y2l]].values)
print(len(known_points))
run_name = "100percent/random/start%d_initpoints_%d_totgen_%d_points_%d"%(start_percentage, len(known_points), tot_gen, points_per_gen)
basepath = '2DEI_alphaHOMO_gap/demo/%s/' % run_name

150


In [6]:
pareto_inds, pareto_points = is_pareto(points=df[[y1l, y2l]].values)
plot_pareto_front(df, pareto_points, y1l, y2l,
                  figname=basepath + 'global_pareto_front.pdf',
                  range1=[-1.05, 0], range2=[0, 0.35])

In [9]:
import time
start = time.time()
known_list = []
pareto_list = []
# In each generation, new known points are added.
while tb_count < tot_gen:
    tb_count += 1
    print("=====gen-%d=====" % tb_count)
    df['known'] = [True if idx in list(
        known_points.index) else False for idx, row in df.iterrows()]
    known_points = df[df['known'] == True]
    known_list.append(known_points)
    pareto_list.append(pareto_points_known)
    df_unknown = df[df['known']==False]
    new_points = df_unknown.sample(n=points_per_gen, random_state=0)
    print("adding %d points using random sampling." % len(new_points))
    known_points = known_points.append(new_points[known_points.columns])
    pareto_inds_known, pareto_points_known = is_pareto(
        points=known_points[[y1l, y2l]].values)
    plot_pareto_front(known_list[-1], pareto_list[-1], y1l, y2l,
                      global_fronts=pareto_points,
                      next_fronts=pareto_points_known,
                      next_points=new_points,
                      figname=basepath + '/actual_pareto_gen-%d_len%d.pdf' % (tb_count, len(known_points)),
                      show=False, range1=[-1, 0], range2=[0, 0.35])
    print("ellapse: ", time.time() - start)

=====gen-1=====
adding 150 points using kmedoids.
ellapse:  3.773862838745117
=====gen-2=====
adding 150 points using kmedoids.
ellapse:  7.209054946899414
=====gen-3=====
adding 150 points using kmedoids.
ellapse:  11.571571826934814
=====gen-4=====
adding 150 points using kmedoids.
ellapse:  15.943374872207642
=====gen-5=====
adding 150 points using kmedoids.
ellapse:  21.509262800216675
=====gen-6=====
adding 150 points using kmedoids.
ellapse:  27.493034839630127
=====gen-7=====
adding 150 points using kmedoids.
ellapse:  31.998856782913208
=====gen-8=====
adding 150 points using kmedoids.
ellapse:  36.642390966415405
=====gen-9=====
adding 150 points using kmedoids.
ellapse:  41.74357295036316
=====gen-10=====
adding 150 points using kmedoids.
ellapse:  46.94443988800049
=====gen-11=====
adding 150 points using kmedoids.
ellapse:  53.46531677246094
=====gen-12=====
adding 150 points using kmedoids.
ellapse:  59.91598677635193
=====gen-13=====
adding 150 points using kmedoids.
ella

ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
area_global = area_under_pareto(pareto_points, pareto_points)
pareto_area_list = [area_global/area_under_pareto(pareto_list[gen], pareto_points)  for gen in range(tot_gen)]
coverage_list = [len(known_list[gen])*1./len(df) for gen in range(tot_gen)]
plot_pareto_area(coverage_list, pareto_area_list,
                 figname=basepath+'area_under_pareto.pdf', show=True)
with open(basepath + "coverage_list.pkl", 'wb') as fo:
    pickle.dump(coverage_list, fo)
with open(basepath + "pareto_area_list.pkl", 'wb') as fo:
    pickle.dump(pareto_area_list, fo)