# Main Visualization File
> Most of the inital code is ported from main.py

In [4]:
!pip install mmh3
!pip install torch

Collecting torch
  Downloading torch-2.2.2-cp39-none-macosx_10_9_x86_64.whl (150.8 MB)
[K     |████████████████████████████████| 150.8 MB 37.6 MB/s eta 0:00:01
[?25hCollecting typing-extensions>=4.8.0
  Downloading typing_extensions-4.11.0-py3-none-any.whl (34 kB)
Installing collected packages: typing-extensions, torch
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 4.1.1
    Uninstalling typing-extensions-4.1.1:
      Successfully uninstalled typing-extensions-4.1.1
Successfully installed torch-2.2.2 typing-extensions-4.11.0


In [5]:
from table import Table
from sketches import Synopsis
import correlation
from RL_Agent import Autofeature_agent
from RL_Environment import ISOFAEnvironment
import pandas as pd
import numpy as np
import sketches
import os
from tqdm import tqdm

## Model Training

In [6]:
gcdata = "data/gc-data/"  # data directory
joinable = "DayLoc"  # feature that is joinable between tables
target = "Temperature"
# TODO: drop Date and Location on on the csv's

# define core table
print("Sketching Core Table...")
core_path = gcdata+"temp.csv"
t_core = Table(joinable, core_path)

y_feat = t_core.table[[joinable, target]]
t_core.table.drop([target], axis=1, inplace=True)
print(t_core.table.shape)

t_core.get_sketch()
core_syn = sketches.Synopsis(y_feat, attributes=[target], key=joinable) 
t_core.calc_corr_gain(core_syn)

Sketching Core Table...
(11959, 5)
Observed correlation: 0.9967236563887053
f0_ Temperature
Correlation bounds: 0.9914217299462094, 1.0023880394688534
Bootstrap 95% confidence interval for correlation: (0.9966099209170419, 0.9969456324960341)




Observed mutual info: 1.3934741864764607e-05
Observed correlation: 0.46844979302250733
f1_ Temperature
Correlation bounds: 0.35870779209686204, 0.6862245753802713
Bootstrap 95% confidence interval for correlation: (0.4417608372611969, 0.48294171228194777)




Observed mutual info: 1.5329966930149245e-06
Observed correlation: -0.04337375987667828
f2_ Temperature
Correlation bounds: -0.0560689785493934, 1
Bootstrap 95% confidence interval for correlation: (-0.06703062313611156, -0.019530919278972436)




Observed mutual info: 1.5582162168576519e-07
Observed correlation: -0.04306752506070069
f3_ Temperature
Correlation bounds: -0.06967795385932417, 1
Bootstrap 95% confidence interval for correlation: (-0.0660572668140623, -0.011198263063210055)




Observed mutual info: 2.2326064657877914e-07


### Instantiating Candidates

In [7]:
# define candidate tables
candidate_paths = [file for file in os.listdir(gcdata) if "temp" not in file]
t_candidates = []
for path in tqdm(candidate_paths):
    print("\n\nLooking at table", path)
    t_cand = Table(joinable, gcdata+path)
    # get rid of target variable in candidate table
    if target in t_cand.table.columns:
        t_cand.table.drop([target], axis=1, inplace=True)
    assert joinable in t_cand.table.columns, f"{joinable=} not found in {path}"
    # rename columns for less confusion on join
    renamer = dict([[col, path+'-'+col] for col in t_cand.table.columns if joinable not in col])
    t_cand.table = t_cand.table.rename(columns=renamer)
    
    # use synopsys for join estimation
    t_cand.get_sketch()  # ? sketch candidate table again
    # TODO: check missing values in join of core with voc_daily_summary.csv (nan vals in calc mutual info)
    t_cand.calc_corr_gain(core_syn)  # ? calculate correlation between candidate and itself
    # ? get feature-wise sketch
    t_cand.feature_scoring(20)
    for feat in t_core.df_sketch:
        if t_core.df_sketch[feat].dtype == 'object':
            t_core.df_sketch[feat] = t_core.df_sketch[feat].astype('category')
    for feat in t_cand.df_sketch:
        if t_cand.df_sketch[feat].dtype == 'object':
            t_cand.df_sketch[feat] = t_cand.df_sketch[feat].astype('category')
    t_candidates.append(t_cand)
    print('\n')

  0%|          | 0/9 [00:00<?, ?it/s]



Looking at table pm10_daily_summary.csv
Observed correlation: -0.04052077184093939
pm10_daily_summary.csv-o3_AQI Temperature
Correlation bounds: -0.11545603160451891, 0.06186103707872515
Bootstrap 95% confidence interval for correlation: (-0.06114979995508197, -0.026866817450155563)




Observed mutual info: -2.3883112706784242e-05
Observed correlation: -0.03914918161706282
pm10_daily_summary.csv-f0_ Temperature
Correlation bounds: -0.10866330292120387, 0.05937130428468072
Bootstrap 95% confidence interval for correlation: (-0.05528977419809841, -0.024862815696303654)




Observed mutual info: -0.00014289438414307352
Observed correlation: 0.005893026407901677
pm10_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.018429417805543195, 0.03173929084642047
Bootstrap 95% confidence interval for correlation: (-0.01658241759994904, 0.029556074814952094)


 11%|█         | 1/9 [00:12<01:40, 12.53s/it]

Observed mutual info: -0.0002314516017454152
Observed correlation: nan
pm10_daily_summary.csv-f2_ Temperature
Correlation bounds: -0.21309446465008616, 1
Bootstrap 95% confidence interval for correlation: (nan, nan)
Observed mutual info: 2.1851045462751074e-14
Observed correlation: nan
pm10_daily_summary.csv-f3_ Temperature
Correlation bounds: -0.23967798111469538, 1
Bootstrap 95% confidence interval for correlation: (nan, nan)
Observed mutual info: 2.1851045462751074e-14




Looking at table wind_daily_summary.csv
Observed correlation: nan
wind_daily_summary.csv-o3_AQI Temperature
Correlation bounds: nan, 1
Bootstrap 95% confidence interval for correlation: (nan, nan)
Observed mutual info: 0.0
Observed correlation: 0.03358381366422536
wind_daily_summary.csv-f0_ Temperature
Correlation bounds: 0.02441055139450897, 0.04296432743848804
Bootstrap 95% confidence interval for correlation: (0.014351032585254848, 0.051773930333897274)




Observed mutual info: -1.8682978758456347e-11
Observed correlation: 0.003008321845126484
wind_daily_summary.csv-f1_ Temperature
Correlation bounds: -0.031124417287641513, 1
Bootstrap 95% confidence interval for correlation: (-0.016084635380300263, 0.02427605410363684)




### Actual Feature Selection

In [None]:
# instantiate model environment
model_target = 0
max_try_num = 7
# t_core.df_sketch.drop([target], axis=1, inplace=True)
print("\n\nDefining Environment")
env = ISOFAEnvironment(t_core, t_candidates, joinable, target, max_try_num)

# Parameters for the agent
learning_rate = 0.05
reward_decay = 0.9
e_greedy = 1
update_freq = 50
mem_cap = 1000
BDQN_batch_size = 3
print("Starting Training...")
autodata = Autofeature_agent(env, BDQN_batch_size, learning_rate, reward_decay, e_greedy, update_freq, mem_cap,
                                BDQN_batch_size)

print("\nAgent Ready!")

# Train the workload
auto_result = autodata.train_workload()

In [None]:
# TODO: store training results here
rmse_t = []


## Visualizations

In [None]:
import plotly.express as px
import matplotlib.pyplot as plt

rmse = [[rmse_t, i+1] for i in range(len(rmse_t))]
df = pd.Dataframe(rmse, columns=["RMSE", "Step"])
df

In [None]:
px.scatter(df)