# Main Visualization File
> Most of the inital code is ported from main.py

In [11]:
!pip3 install mmh3
!pip3 install torch



In [12]:
from table import Table
from sketches import Synopsis
import correlation
from RL_Agent import Autofeature_agent
from RL_Environment import ISOFAEnvironment
import pandas as pd
import numpy as np
import sketches
import os
from tqdm import tqdm

## Model Training

In [6]:
gcdata = "data/gc-data/"  # data directory
joinable = "DayLoc"  # feature that is joinable between tables
target = "Temperature"
# TODO: drop Date and Location on on the csv's

# define core table
print("Sketching Core Table...")
core_path = gcdata+"temp.csv"
t_core = Table(joinable, core_path)

y_feat = t_core.table[[joinable, target]]
t_core.table.drop([target], axis=1, inplace=True)
print(t_core.table.shape)

t_core.get_sketch()
core_syn = sketches.Synopsis(y_feat, attributes=[target], key=joinable) 
t_core.calc_corr_gain(core_syn)

Sketching Core Table...
(11959, 5)
Observed correlation: 0.9967236563887053
f0_ Temperature
Correlation bounds: 0.9914217299462094, 1.0023880394688534
Bootstrap 95% confidence interval for correlation: (0.9966099209170419, 0.9969456324960341)




Observed mutual info: 1.3934741864764607e-05
Observed correlation: 0.46844979302250733
f1_ Temperature
Correlation bounds: 0.35870779209686204, 0.6862245753802713
Bootstrap 95% confidence interval for correlation: (0.4417608372611969, 0.48294171228194777)




Observed mutual info: 1.5329966930149245e-06
Observed correlation: -0.04337375987667828
f2_ Temperature
Correlation bounds: -0.0560689785493934, 1
Bootstrap 95% confidence interval for correlation: (-0.06703062313611156, -0.019530919278972436)




Observed mutual info: 1.5582162168576519e-07
Observed correlation: -0.04306752506070069
f3_ Temperature
Correlation bounds: -0.06967795385932417, 1
Bootstrap 95% confidence interval for correlation: (-0.0660572668140623, -0.011198263063210055)




Observed mutual info: 2.2326064657877914e-07


### Instantiating Candidates

In [7]:
# define candidate tables
candidate_paths = [file for file in os.listdir(gcdata) if "temp" not in file]
t_candidates = []
for path in tqdm(candidate_paths):
    print("\n\nLooking at table", path)
    t_cand = Table(joinable, gcdata+path)
    # get rid of target variable in candidate table
    if target in t_cand.table.columns:
        t_cand.table.drop([target], axis=1, inplace=True)
    assert joinable in t_cand.table.columns, f"{joinable=} not found in {path}"
    # rename columns for less confusion on join
    renamer = dict([[col, path+'-'+col] for col in t_cand.table.columns if joinable not in col])
    t_cand.table = t_cand.table.rename(columns=renamer)
    
    # use synopsys for join estimation
    t_cand.get_sketch()  # ? sketch candidate table again
    # TODO: check missing values in join of core with voc_daily_summary.csv (nan vals in calc mutual info)
    t_cand.calc_corr_gain(core_syn)  # ? calculate correlation between candidate and itself
    # ? get feature-wise sketch
    t_cand.feature_scoring(20)
    for feat in t_core.df_sketch:
        if t_core.df_sketch[feat].dtype == 'object':
            t_core.df_sketch[feat] = t_core.df_sketch[feat].astype('category')
    for feat in t_cand.df_sketch:
        if t_cand.df_sketch[feat].dtype == 'object':
            t_cand.df_sketch[feat] = t_cand.df_sketch[feat].astype('category')
    t_candidates.append(t_cand)
    print('\n')

  0%|          | 0/9 [00:00<?, ?it/s]



Looking at table pm10_daily_summary.csv
Observed correlation: -0.04052077184093939
pm10_daily_summary.csv-o3_AQI Temperature
Correlation bounds: -0.11545603160451891, 0.06186103707872515
Bootstrap 95% confidence interval for correlation: (-0.06114979995508197, -0.026866817450155563)




Observed mutual info: -2.3883112706784242e-05
Observed correlation: -0.03914918161706282
pm10_daily_summary.csv-f0_ Temperature
Correlation bounds: -0.10866330292120387, 0.05937130428468072
Bootstrap 95% confidence interval for correlation: (-0.05528977419809841, -0.024862815696303654)




Observed mutual info: -0.00014289438414307352
Observed correlation: 0.005893026407901677
pm10_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.018429417805543195, 0.03173929084642047
Bootstrap 95% confidence interval for correlation: (-0.01658241759994904, 0.029556074814952094)


 11%|█         | 1/9 [00:12<01:40, 12.53s/it]

Observed mutual info: -0.0002314516017454152
Observed correlation: nan
pm10_daily_summary.csv-f2_ Temperature
Correlation bounds: -0.21309446465008616, 1
Bootstrap 95% confidence interval for correlation: (nan, nan)
Observed mutual info: 2.1851045462751074e-14
Observed correlation: nan
pm10_daily_summary.csv-f3_ Temperature
Correlation bounds: -0.23967798111469538, 1
Bootstrap 95% confidence interval for correlation: (nan, nan)
Observed mutual info: 2.1851045462751074e-14




Looking at table wind_daily_summary.csv
Observed correlation: nan
wind_daily_summary.csv-o3_AQI Temperature
Correlation bounds: nan, 1
Bootstrap 95% confidence interval for correlation: (nan, nan)
Observed mutual info: 0.0
Observed correlation: 0.03358381366422536
wind_daily_summary.csv-f0_ Temperature
Correlation bounds: 0.02441055139450897, 0.04296432743848804
Bootstrap 95% confidence interval for correlation: (0.014351032585254848, 0.051773930333897274)




Observed mutual info: -1.8682978758456347e-11
Observed correlation: 0.003008321845126484
wind_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.031124417287641513, 1
Bootstrap 95% confidence interval for correlation: (-0.016084635380300263, 0.02427605410363684)




Observed mutual info: 1.3314979491508711e-05
Observed correlation: 0.021169793370624496
wind_daily_summary.csv-f2_ Temperature
Correlation bounds: -0.040915982082976136, 1
Bootstrap 95% confidence interval for correlation: (-0.0019652266734028105, 0.037308272933057926)




Observed mutual info: -0.0025846572551631217
Observed correlation: 0.020835465774366072
wind_daily_summary.csv-f3_ Temperature
Correlation bounds: -0.026872112473665822, 1
Bootstrap 95% confidence interval for correlation: (0.008176620380310877, 0.04032281265950631)


 22%|██▏       | 2/9 [02:03<08:12, 70.40s/it]

Observed mutual info: -0.002391730627904191




Looking at table o3_daily_summary.csv
Observed correlation: -0.056468744739110274
o3_daily_summary.csv-o3_AQI Temperature
Correlation bounds: -0.07579287533530392, -0.03368396579521509
Bootstrap 95% confidence interval for correlation: (-0.07024471591336336, -0.044267662700648504)




Observed mutual info: 2.73706629349276e-13
Observed correlation: -0.052784311738532035
o3_daily_summary.csv-f0_ Temperature
Correlation bounds: -0.32541915416931805, 1
Bootstrap 95% confidence interval for correlation: (-0.07296113440131242, -0.03221498793402598)




Observed mutual info: -4.9198570242519565e-05
Observed correlation: 0.009020316713505346
o3_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.21829824670395678, 1.2302296628719918
Bootstrap 95% confidence interval for correlation: (-0.05924779172222178, 0.08101274261819007)




Observed mutual info: -9.954573203929172e-05
Observed correlation: 0.000769937617751846
o3_daily_summary.csv-f2_ Temperature
Correlation bounds: -0.2742661857401571, 1
Bootstrap 95% confidence interval for correlation: (-0.05350085156251279, 0.029202416301526736)




Observed mutual info: -0.0014776506051843838
Observed correlation: 0.0007768803510301071
o3_daily_summary.csv-f3_ Temperature
Correlation bounds: -0.11512670290723892, 0.152902423336307
Bootstrap 95% confidence interval for correlation: (-0.02592941677838349, 0.02930370683137936)


 33%|███▎      | 3/9 [02:16<04:24, 44.09s/it]

Observed mutual info: -0.0014865483988900502




Looking at table voc_daily_summary.csv
Observed correlation: nan
voc_daily_summary.csv-o3_AQI Temperature
Correlation bounds: nan, 1
Bootstrap 95% confidence interval for correlation: (nan, nan)
Observed mutual info: 1.5577442003468863e-14
Observed correlation: -0.010838253110246247
voc_daily_summary.csv-f0_ Temperature
Correlation bounds: -0.03438713389644127, 0.034243916558375886
Bootstrap 95% confidence interval for correlation: (-0.02343070336608131, 0.003653148013139436)




Observed mutual info: -1.9244022114719392e-05
Observed correlation: 0.010331934197039733
voc_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.009726473124621696, 0.031234419696338566
Bootstrap 95% confidence interval for correlation: (-0.008039448192126607, 0.030437457420234234)




Observed mutual info: -0.00037954415285493135
Observed correlation: 0.012799648571607317
voc_daily_summary.csv-f2_ Temperature
Correlation bounds: -0.0002267974541121581, 0.026188470711757648
Bootstrap 95% confidence interval for correlation: (-0.005464677861045814, 0.03240528905830769)




Observed mutual info: -0.00019748233417321963
Observed correlation: -0.020156587182494207
voc_daily_summary.csv-f3_ Temperature
Correlation bounds: -0.02895018422334143, -0.011141490555345492
Bootstrap 95% confidence interval for correlation: (-0.044016972370957134, -0.007390180003799225)


 44%|████▍     | 4/9 [02:34<02:48, 33.72s/it]

Observed mutual info: -0.0004347437796690843




Looking at table so2_daily_summary.csv
Observed correlation: -0.06410708887946108
so2_daily_summary.csv-o3_AQI Temperature
Correlation bounds: -0.07910660277637102, -0.04822221514417315
Bootstrap 95% confidence interval for correlation: (-0.0820386972223181, -0.046016961001113466)




Observed mutual info: -0.0016385602994498725
Observed correlation: -0.05624935793676445
so2_daily_summary.csv-f0_ Temperature
Correlation bounds: -0.07994931250317348, -0.030917256175858852
Bootstrap 95% confidence interval for correlation: (-0.06878029237904801, -0.04085136447458593)




Observed mutual info: -0.0005481666215502941
Observed correlation: -0.11394544424416524
so2_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.2205345160918243, 0.06303304699085803
Bootstrap 95% confidence interval for correlation: (-0.1745694620812126, -0.06148992084372222)




Observed mutual info: -0.0005142745501348692
Observed correlation: 0.07372249819109557
so2_daily_summary.csv-f2_ Temperature
Correlation bounds: -0.20443820888947103, 1
Bootstrap 95% confidence interval for correlation: (0.02911861534588367, 0.12936077801455617)




Observed mutual info: -0.0012806766029212454
Observed correlation: 0.07935361691803404
so2_daily_summary.csv-f3_ Temperature
Correlation bounds: -0.07128723509002234, 0.32119054562216903
Bootstrap 95% confidence interval for correlation: (0.04598654837406688, 0.12185528352101994)


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fc26144fa30>>
Traceback (most recent call last):
  File "/Users/korahughes/opt/anaconda3/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
 56%|█████▌    | 5/9 [02:48<01:46, 26.70s/it]

Observed mutual info: -0.0011329700729391927




Looking at table pressure_daily_summary.csv
Observed correlation: nan
pressure_daily_summary.csv-o3_AQI Temperature
Correlation bounds: nan, 1
Bootstrap 95% confidence interval for correlation: (nan, nan)
Observed mutual info: 0.0
Observed correlation: -0.0065938970351224185
pressure_daily_summary.csv-f0_ Temperature
Correlation bounds: -0.018864263077793368, 0.014531379507894054
Bootstrap 95% confidence interval for correlation: (-0.013211125211479604, 0.00211719746871057)




Observed mutual info: 2.3641769071378507e-05
Observed correlation: -0.01923605128771531
pressure_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.04123315802404758, 0.003882405126062972
Bootstrap 95% confidence interval for correlation: (-0.03582463052386845, -0.005528691845512197)




Observed mutual info: 0.00015342198678107807
Observed correlation: 0.014392513969415351
pressure_daily_summary.csv-f2_ Temperature
Correlation bounds: -0.11636791701036109, 1
Bootstrap 95% confidence interval for correlation: (0.0021678474156480144, 0.028014565361878904)




Observed mutual info: 0.007550775132855188
Observed correlation: 0.014244073817282038
pressure_daily_summary.csv-f3_ Temperature
Correlation bounds: -0.09787032583809592, 1
Bootstrap 95% confidence interval for correlation: (0.0015631981737950329, 0.02652100527208539)


 67%|██████▋   | 6/9 [03:10<01:15, 25.20s/it]

Observed mutual info: 0.007608755470272504




Looking at table pm25_frm_daily_summary.csv
Observed correlation: -0.013614382481296415
pm25_frm_daily_summary.csv-o3_AQI Temperature
Correlation bounds: -0.03023446143774428, 0.0038536720681605773
Bootstrap 95% confidence interval for correlation: (-0.031132527434547547, 0.009015621766981393)




Observed mutual info: 0.0001707945069032897
Observed correlation: -0.020770854857664924
pm25_frm_daily_summary.csv-f0_ Temperature
Correlation bounds: -0.04762435321839548, 0.0076798591360531234
Bootstrap 95% confidence interval for correlation: (-0.03646562648716269, 0.00265062298519421)




Observed mutual info: 0.0001028256408294433
Observed correlation: -0.001471537308224386
pm25_frm_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.04291926770947991, 0.0447174282441122
Bootstrap 95% confidence interval for correlation: (-0.021464734207469124, 0.021383658990718344)




Observed mutual info: 0.010235156796220888
Observed correlation: 0.0011359536941679058
pm25_frm_daily_summary.csv-f2_ Temperature
Correlation bounds: -0.021090086512302784, 0.024395297809172028
Bootstrap 95% confidence interval for correlation: (-0.021037882753290934, 0.029293367962550915)




Observed mutual info: 0.008393603939420402
Observed correlation: 0.007324441220242266
pm25_frm_daily_summary.csv-f3_ Temperature
Correlation bounds: -0.13468473267395228, 1
Bootstrap 95% confidence interval for correlation: (-0.02110514256811848, 0.02642181377413093)


 78%|███████▊  | 7/9 [03:28<00:45, 22.97s/it]

Observed mutual info: 0.004513386350869962




Looking at table rh_and_dp_daily_summary.csv
Observed correlation: nan
rh_and_dp_daily_summary.csv-o3_AQI Temperature
Correlation bounds: nan, 1
Bootstrap 95% confidence interval for correlation: (nan, nan)
Observed mutual info: 0.0
Observed correlation: 0.06520739230633721
rh_and_dp_daily_summary.csv-f0_ Temperature
Correlation bounds: 0.047324361675646714, 0.08399131142526603
Bootstrap 95% confidence interval for correlation: (0.050599942504888515, 0.08525040445092597)




Observed mutual info: -0.00039721247613746076
Observed correlation: 0.007271974598090751
rh_and_dp_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.04975896365984436, 0.07372524018274468
Bootstrap 95% confidence interval for correlation: (-0.014016570891512209, 0.04305323735885861)




Observed mutual info: 1.0473756353866099e-05
Observed correlation: 0.05233068608500297
rh_and_dp_daily_summary.csv-f2_ Temperature
Correlation bounds: 0.01037090819357573, 0.09814197214026668
Bootstrap 95% confidence interval for correlation: (0.04508964650910691, 0.05833935566461618)




Observed mutual info: -0.0024812598345906912
Observed correlation: 0.03655244063536243
rh_and_dp_daily_summary.csv-f3_ Temperature
Correlation bounds: -0.13517539664691186, 1
Bootstrap 95% confidence interval for correlation: (0.0023358502693944724, 0.05747217535042502)


 89%|████████▉ | 8/9 [03:50<00:22, 22.51s/it]

Observed mutual info: -0.0026840273725280534




Looking at table no2_daily_summary.csv
Observed correlation: 0.2408333239165361
no2_daily_summary.csv-o3_AQI Temperature
Correlation bounds: 0.23505997003914056, 0.24675527434779648
Bootstrap 95% confidence interval for correlation: (0.21876864591035164, 0.25754758832634433)




Observed mutual info: -5.32468792073918e-05
Observed correlation: 0.23851923796209837
no2_daily_summary.csv-f0_ Temperature
Correlation bounds: 0.23305328849475535, 0.24412448466182685
Bootstrap 95% confidence interval for correlation: (0.21997032558215746, 0.25562449446046126)




Observed mutual info: 1.8932090892928547e-05
Observed correlation: -0.0005330554369342888
no2_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.02023540393119542, 0.023889674289064727
Bootstrap 95% confidence interval for correlation: (-0.019372273624826627, 0.018225835481772236)




Observed mutual info: -4.727445097823487e-06
Observed correlation: 0.018518308778169266
no2_daily_summary.csv-f2_ Temperature
Correlation bounds: -0.03747019395393264, 1
Bootstrap 95% confidence interval for correlation: (-0.010693794043153347, 0.048091637360848934)




Observed mutual info: 0.0004304778805288345
Observed correlation: 0.017668130250040937
no2_daily_summary.csv-f3_ Temperature
Correlation bounds: -0.015686949104304362, 0.07565173598596169
Bootstrap 95% confidence interval for correlation: (-0.004569694339394844, 0.05384453530566701)


100%|██████████| 9/9 [05:37<00:00, 37.53s/it]

Observed mutual info: 0.00010612734488833767







### Actual Feature Selection

In [13]:
# instantiate model environment
model_target = 0
max_try_num = 7
# t_core.df_sketch.drop([target], axis=1, inplace=True)
print("\n\nDefining Environment")
env = ISOFAEnvironment(t_core, t_candidates, joinable, target, max_try_num)

# Parameters for the agent
learning_rate = 0.05
reward_decay = 0.9
e_greedy = 1
update_freq = 50
mem_cap = 1000
BDQN_batch_size = 3
print("Starting Training...")
autodata = Autofeature_agent(env, BDQN_batch_size, learning_rate, reward_decay, e_greedy, update_freq, mem_cap,
                                BDQN_batch_size)

print("\nAgent Ready!")

# Train the workload
auto_result = autodata.train_workload()



Defining Environment
<table.Table object at 0x7fc204203160>
<table.Table object at 0x7fc1d93620d0>
<table.Table object at 0x7fc1d9742d60>
<table.Table object at 0x7fc1da5611c0>
<table.Table object at 0x7fc1da56aa30>
<table.Table object at 0x7fc1d9360910>
<table.Table object at 0x7fc24e389a60>
<table.Table object at 0x7fc1dbb5dee0>
<table.Table object at 0x7fc1dbbd9280>


--------------------Init:--------------------
Model R2 score: 99.81777340062351%
Train RMSE score: 0.35508748568598825
Model R2 score: 99.2143440951382%
Test RMSE Score: 1.4689548539323016
Starting Training...


AttributeError: partially initialized module 'torch._dynamo' has no attribute 'external_utils' (most likely due to a circular import)

### Storing Results

In [None]:
res = pd.DataFrame(auto_result, columns=["RMSE", "Benefit", "Time", "Epsilon", "Features"])
res

## Visualizations

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt

rmse = [[row["RMSE"], i+1] for i, row in res.iterrows()]
df = pd.Dataframe(rmse, columns=["RMSE", "Step"])
px.scatter(df)