## LIBRERIAS y CONSTANTES

In [25]:
import numpy as np
import pandas as pd
import pickle
import torch.nn as nn
from torch.utils.data import DataLoader
import json, ast, sys, csv, random
import plotly.express as px
import math
import datetime

#Implement training process
from model_trees_algebra import NeoRegression

from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from functions.tree_format import IterateBuildTree, InnerJoinsIntraBGPS, \
                                IterateBuildTreeBetweenBGPS, TreeFormat
from functions.RL_functions import RL_Actions, RL_Initial_Step, RL_available_actions, \
                                    RL_Next_step, RL_Reward, RL_Rebuild_Dictionary

from functions.aux import MetricTotalAccuraccy


class BaoTrainingException(Exception):
    """Project-specific exception type; it adds nothing to ``Exception``.

    NOTE(review): nothing in the visible part of this notebook raises or
    catches it -- confirm it is still needed.
    """


### Basic parameters

In [35]:
# Directory holding the dataset and base name (without ".csv") of the file
# that is loaded further down.
URL = "/media/data/ccarmona/memoria/dataset/"
csv_name = 'new_dataset_6.4_subqueries'
x = [True,False]
active_new_data = x[0]
symbol = "ᶲ"
#learning_rate = 0.00001

# This parameter is used to select a certain amount of data, ordered by the time ranges obtained.
## The lower it is, the less data is selected. If it is too high there will be too many outliers,
## but if it is too low the data may not be representative and the risk of overfitting increases.
## min_data (presumably min_time / min_time_or below), in turn, is simply the minimum execution
## time a query must have to be taken into account when building the model.
# The *_or pair bounds the raw query rows; the other pair bounds the subtree rows.
#percent_of_data_or = 0.93
min_time_or = 50
max_time_or = 150
#percent_of_data = 1
min_time = 2
max_time = 100

In [36]:
def split_ds(all_data, val_rate, seed):
    """
    Split a dataframe into two parts while keeping the runtime balance.

    Rows are grouped into fixed ranges of the ``time`` column and every range
    holding at least three rows is split on its own with
    ``train_test_split``; the per-range pieces are then concatenated.  Rows in
    a range with fewer than three rows are left out of both results, and so
    are rows whose ``time`` is below the module-level ``min_time``.

    :param all_data: Pandas dataframe with a numeric ``time`` column.
    :param val_rate: Passed to ``train_test_split`` as ``test_size``; the
        share of each range that goes to the second returned dataframe.
    :param seed: Passed as ``random_state``. ``None`` gives a different split
        on every call.
    :return: Tuple ``(train, val)`` of dataframes.  Both are empty (with the
        columns of ``all_data``) when no range has at least three rows.
    """
    # Upper edges of the consecutive (low, high] runtime ranges.
    edges = [2, 3, 4, 5, 8, 10, 20, 30, 40, 50, 60, 80, 100,
             150, 200, 250, 450]
    times = all_data["time"]

    # The first range is closed on the left at min_time, the last one has no
    # upper bound; everything in between is (low, high].
    buckets = [all_data[(times >= min_time) & (times <= edges[0])]]
    buckets.extend(
        all_data[(times > low) & (times <= high)]
        for low, high in zip(edges, edges[1:])
    )
    buckets.append(all_data[times > edges[-1]])

    train_parts = []
    val_parts = []
    for bucket in buckets:
        # Ranges with fewer than three rows are skipped entirely.
        if bucket.shape[0] < 3:
            continue
        part_train, part_val = train_test_split(
            bucket, test_size=val_rate, shuffle=True, random_state=seed)
        train_parts.append(part_train)
        val_parts.append(part_val)

    if not train_parts:
        # pd.concat raises ValueError on an empty list; return empty frames
        # that still carry the input's columns instead.
        empty = all_data.iloc[0:0]
        return empty, empty.copy()

    return pd.concat(train_parts), pd.concat(val_parts)
def clear_error_tuples(x):
    """
    Tell whether ``x`` can be parsed by ``json.loads``.

    Intended as a row filter: a value that fails to parse is reported on
    stdout and rejected.

    :param x: Value to check, normally a JSON string.
    :return: ``True`` when parsing succeeds, ``False`` otherwise.
    """
    try:
        json.loads(x)
    except (TypeError, ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError (malformed text), TypeError
        # covers non-string values such as None or NaN, RecursionError covers
        # excessively nested documents.  Other exceptions, e.g.
        # KeyboardInterrupt, are deliberately allowed to propagate.
        print("Error in data ignored!", x)
        return False
    return True

### Features

In [37]:
# Columns to use.
# The commented-out assignment below is an alternative, longer value for
# list_columns that is currently disabled.
#list_columns = ['limit', 'group_by',
#       'distinct', 'order_by', 'union', 'left_join', 'join', 'iter', 'filter',
#       'num_filter', 'filter_eq', 'filter_gt', 'filter_ge', 'filter_lt',
#       'filter_le', 'filter_neq', 'filter_iri', 'filter_neq.1', 'filter_bound',
#       'filter_contains', 'filter_exists', 'filter_isBlank', 'filter_isIRI',
#       'filter_isLiteral', 'filter_lang', 'filter_langMatches', 'filter_not',
#       'filter_notexists', 'filter_regex', 'filter_sameTerm', 'filter_str',
#       'filter_strstarts', 'filter_or', 'filter_and', 'json_cardinality']

# Active selection. NOTE(review): list_columns is not referenced in the
# visible part of this notebook -- presumably consumed by later cells.
list_columns = ['total_bgps', 'triples', 'treesize', 'join', 'left_join']

## CLEAN DATA AND CREATE NEW DATASET TRAIN-TEST

In [38]:
# Load the dataset stored at <URL><csv_name>.csv.
df = pd.read_csv(f"{URL}{csv_name}.csv", engine='python', encoding='utf-8')

In [39]:
# Work on a copy so the freshly loaded `df` is not affected by later changes.
df_raw = df.copy()

In [40]:
# Remove the limits on how many columns and rows pandas shows when a
# dataframe is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [41]:
# Keep only the queries whose time lies in [min_time_or, max_time_or]
# (both ends included).
df_raw = df_raw[df_raw['time'].between(min_time_or, max_time_or)]


In [42]:
# Print the size of the filtered table, then show the share of rows that
# falls into each of 10 equal-width time bins.
print(f"df_raw.shape {df_raw.shape}")
df_raw['time'].value_counts(bins=10, sort=True, normalize=True)
#for i in df_raw['time'].value_counts(bins=100, sort=True, normalize=True):
#    print(i)


df_raw.shape (3285, 65)


(49.899, 60.0]    0.209741
(60.0, 70.0]      0.129680
(70.0, 80.0]      0.119635
(110.0, 120.0]    0.113546
(80.0, 90.0]      0.095282
(100.0, 110.0]    0.091629
(90.0, 100.0]     0.071233
(130.0, 140.0]    0.069406
(120.0, 130.0]    0.050533
(140.0, 150.0]    0.049315
Name: time, dtype: float64

In [43]:
# Summary statistics of the 'time' column of the filtered queries.
df_raw['time'].describe()

count    3285.000000
mean       89.596651
std        28.834562
min        50.000000
25%        64.000000
50%        85.000000
75%       113.000000
max       150.000000
Name: time, dtype: float64

In [54]:
# Renumber the rows 0..n-1 and discard the previous index.
df_raw = df_raw.reset_index(drop=True)
# Append the text of every query to copy_queries.txt.  The context manager
# closes the file even when a write fails, and the explicit encoding matches
# the one used to read the CSV.  Queries are written back-to-back; no
# separator is added between them.
with open("copy_queries.txt", "a", encoding="utf-8") as f:
    f.writelines(df_raw['query'])

In [11]:
# Split the raw queries per runtime range: roughly 15% of each range goes to
# ds_rl_prev, the rest to ds_model.
# NOTE(review): seed=None makes this split different on every run.
ds_model, ds_rl_prev = split_ds(df_raw, 0.15,seed=None)
ds_rl_prev.columns

Index(['unique_id', 'filename', 'query', 'profile', 'limit', 'group_by',
       'distinct', 'order_by', 'union', 'left_join', 'join', 'iter', 'filter',
       'num_filter', 'filter_eq', 'filter_gt', 'filter_ge', 'filter_lt',
       'filter_le', 'filter_neq', 'filter_iri', 'filter_neq.1', 'filter_bound',
       'filter_contains', 'filter_exists', 'filter_isBlank', 'filter_isIRI',
       'filter_isLiteral', 'filter_lang', 'filter_langMatches', 'filter_not',
       'filter_notexists', 'filter_regex', 'filter_sameTerm', 'filter_str',
       'filter_strstarts', 'filter_or', 'filter_and', 'time', 'cpu_p', 'rnd',
       'seq', 'same_seg_p', 'same_page_p', 'disk_reads', 'read_ahead', 'wait',
       'comp_msec', 'comp_reads', 'comp_read_p', 'comp_messages', 'comp_clw',
       'triples', 'total_bgps', 'treesize', 'matrix_format', 'trees',
       'json_time_predicate', 'json_fanout_predicate',
       'json_input_rows_predicate', 'json_cardinality_fanout',
       'json_cardinality', 'scan_queries'

In [12]:
def subtree_format(df_raw):
    """
    Expand every query row into one row per subtree.

    The ``matrix_subtrees`` cell of each input row is parsed with
    ``ast.literal_eval`` into a sequence of entries.  The first item of an
    entry is the subtree itself; the remaining items must supply, in order,
    the values for ``time``, ``total_bgps``, ``triples``, ``treesize``,
    ``join``, ``left_join`` and ``iter``.

    Rows are read positionally, so the input may carry any index.

    :param df_raw: Dataframe with the columns ``unique_id``, ``filename``,
        ``query``, ``json_cardinality`` and ``matrix_subtrees``.
    :return: New dataframe with one row per subtree.  The query's
        ``json_cardinality`` is repeated on each of its subtrees as
        ``json_cardinality_original_query``.
    """
    columns = ['unique_id', 'filename', 'query', 'trees', 'time', 'total_bgps',
               'triples', 'treesize', 'join', 'left_join', 'iter',
               'json_cardinality_original_query']

    per_query = zip(
        df_raw['unique_id'],
        df_raw['filename'],
        df_raw['query'],
        df_raw['json_cardinality'],
        df_raw['matrix_subtrees'],
    )

    rows = []
    for unique_id, filename, query, json_cardinality, raw_subtrees in per_query:
        for entry in ast.literal_eval(raw_subtrees):
            # Turn the Python repr into double-quoted text: existing double
            # quotes become ';' first, then single quotes become '"'.
            tree_text = str(entry[0]).replace('"', ';').replace("'", '"')
            rows.append(
                [unique_id, filename, query, tree_text,
                 *entry[1:], json_cardinality]
            )

    return pd.DataFrame(rows, columns=columns)
    

In [13]:
def json_cardinality_subtree(df_subtrees):
    """
    Add a ``json_cardinality`` column restricted to each row's own subtree.

    For every row, ``json_cardinality_original_query`` is parsed with
    ``ast.literal_eval`` into a dict, and only the entries whose key occurs
    as a substring of the text of ``trees`` are kept.  Keys and values are
    stored as strings and the resulting dict is written as double-quoted
    text.

    Rows are read positionally, so the input may carry any index.

    NOTE(review): the key test is a plain substring match, so a key that is a
    prefix of another key present in the tree is kept as well.

    :param df_subtrees: Dataframe with the columns ``trees`` and
        ``json_cardinality_original_query``.  It is modified in place.
    :return: The same dataframe object, with ``json_cardinality`` added.
    """
    cardinalities = []
    pairs = zip(df_subtrees['trees'],
                df_subtrees['json_cardinality_original_query'])
    for tree, original in pairs:
        tree_text = str(tree)
        full_cardinality = ast.literal_eval(original)
        kept = {
            str(key): str(value)
            for key, value in full_cardinality.items()
            if key in tree_text
        }
        # Same quote swap used for the tree text: '"' -> ';' then "'" -> '"'.
        cardinalities.append(str(kept).replace('"', ';').replace("'", '"'))

    df_subtrees['json_cardinality'] = cardinalities

    return df_subtrees
        
        
        

In [14]:
# Renumber the rows 0..n-1.  Without drop=True the previous index is kept
# as an extra 'index' column.
ds_model = ds_model.reset_index()

In [15]:
# Expand every query in ds_model into one row per subtree.
df_subtrees = subtree_format(ds_model)

In [16]:
# Add the per-subtree 'json_cardinality' column.
df_subtrees = json_cardinality_subtree(df_subtrees)

In [17]:
# (rows, columns) of the subtree table.
df_subtrees.shape

(8255, 13)

In [18]:
# Convert the subtree times to float, then show their summary statistics.
df_subtrees['time'] = df_subtrees['time'].astype(float)
df_subtrees['time'].describe()

count    8255.000000
mean       26.725020
std        24.218160
min         0.002262
25%         0.840554
50%        27.000000
75%        43.213824
max        99.816736
Name: time, dtype: float64

In [19]:
# Statistics of the subtree times before filtering.
print("mean", df_subtrees['time'].mean())
print("std", df_subtrees['time'].std())
print('df_subtrees.shape', df_subtrees.shape)
print("max", df_subtrees['time'].max())

# Keep only the subtrees whose time lies in [min_time, max_time] (both ends
# included) and renumber the remaining rows.  An earlier cutoff derived from
# cumulative bin frequencies (percent_of_data) is no longer used.
df_subtrees = df_subtrees[df_subtrees['time'].between(min_time, max_time)]
df_subtrees = df_subtrees.reset_index(drop=True)

# Same statistics after filtering.
print("FIRST CLEAN")
print("mean", df_subtrees['time'].mean())
print("std", df_subtrees['time'].std())
print('df_subtrees.shape', df_subtrees.shape)
print("max", df_subtrees['time'].max())

mean 26.725019558740154
std 24.218160104413318
df_raw.shape (8255, 13)
max 99.816736
FIRST CLEAN
mean 37.491209490462886
std 19.78068593207435
df_raw.shape (5725, 13)
max 80.0


In [20]:
# Summary statistics of the subtree times that remain after filtering.
df_subtrees['time'].describe()

count    5725.000000
mean       37.491209
std        19.780686
min         2.004519
25%        26.000000
50%        36.000000
75%        51.000000
max        80.000000
Name: time, dtype: float64

In [21]:
# Per runtime range: roughly 20% of the subtrees become the test set, then
# roughly 25% of what is left becomes the validation set.
# seed=None means the split changes from run to run.
ds_train_val_prev, ds_test_prev = split_ds(df_subtrees, 0.2, seed=None)
ds_train_prev, ds_val_prev = split_ds(ds_train_val_prev, 0.25, seed=None)

#ds_train_val_prev, ds_test_prev = split_ds(df_raw, 0.2,seed=None)
#ds_train_prev, ds_val_prev = split_ds(ds_train_val_prev, 0.25,seed=None)
#ds_rl_prev = ds_test_prev.copy()

# Drop the rows whose 'trees' value cannot be parsed as JSON.
ds_train = ds_train_prev[ds_train_prev['trees'].apply(clear_error_tuples)]
ds_val = ds_val_prev[ds_val_prev['trees'].apply(clear_error_tuples)]
ds_test = ds_test_prev[ds_test_prev['trees'].apply(clear_error_tuples)]
ds_rl = ds_rl_prev[ds_rl_prev['trees'].apply(clear_error_tuples)]

# Report the size of every table before and after the clean-up.
print("---------SHAPES-----------")
print("----------RAW-----------")
print('shape df_raw: {}'.format(df_raw.shape))
print("----------PREV----------")
print('shape ds_train_val_prev: {}'.format(ds_train_val_prev.shape))
print('shape ds_train_prev: {}'.format(ds_train_prev.shape))
print('shape ds_val_prev: {}'.format(ds_val_prev.shape))
print('shape ds_test_prev: {}'.format(ds_test_prev.shape))
print('shape ds_rl_prev: {}'.format(ds_rl_prev.shape))
print("----------CLEAN----------")
print('shape ds_train: {}'.format(ds_train.shape))
print('shape ds_val: {}'.format(ds_val.shape))
print('shape ds_test: {}'.format(ds_test.shape))
print('shape ds_rl: {}'.format(ds_rl.shape))

---------SHAPES-----------
----------RAW-----------
shape df_raw: (4192, 65)
----------PREV----------
shape ds_train_val_prev: (4576, 13)
shape ds_train_prev: (3429, 13)
shape ds_val_prev: (1147, 13)
shape ds_test_prev: (1149, 13)
shape ds_rl_prev: (631, 65)
----------CLEAN----------
shape ds_train: (3429, 13)
shape ds_val: (1147, 13)
shape ds_test: (1149, 13)
shape ds_rl: (631, 65)


In [22]:
#if active_new_data:
#    ds_train.to_csv(URL + csv_name + '_ds_train.csv')
#    ds_val.to_csv(URL + csv_name + '_ds_val.csv')
#    ds_test.to_csv(URL + csv_name + '_ds_test.csv')
#    ds_rl.to_csv(URL + csv_name + '_ds_rl.csv')
#    print("New csv generates")
#else:
#    print("Not csv generates")

# FUNCTIONS

In [23]:
# One list per runtime bucket; each collects the integer part of the raw
# query times that fall into it.
list_0_1 = []
list_2_10 = []
list_10_20 = []
list_20_30 = []
list_30_40 = []
list_40_50 = []
list_50_60 = []
list_60_70 = []
list_70_80 = []
list_80_90 = []
list_90_100 = []
list_100_110 = []
list_110_120 = []
list_120 = []

# NOTE(review): `x` rebinds the module-level name that held [True, False]
# in the parameters cell.
for i in df_raw['time']:
    x = int(i)
    # The buckets are mutually exclusive, so a single if/elif chain is enough.
    # The last bucket starts at 120 inclusive so that a value of exactly 120
    # is not left out of every list.
    if 0 <= x < 2:
        list_0_1.append(x)
    elif 2 <= x < 10:
        list_2_10.append(x)
    elif 10 <= x < 20:
        list_10_20.append(x)
    elif 20 <= x < 30:
        list_20_30.append(x)
    elif 30 <= x < 40:
        list_30_40.append(x)
    elif 40 <= x < 50:
        list_40_50.append(x)
    elif 50 <= x < 60:
        list_50_60.append(x)
    elif 60 <= x < 70:
        list_60_70.append(x)
    elif 70 <= x < 80:
        list_70_80.append(x)
    elif 80 <= x < 90:
        list_80_90.append(x)
    elif 90 <= x < 100:
        list_90_100.append(x)
    elif 100 <= x < 110:
        list_100_110.append(x)
    elif 110 <= x < 120:
        list_110_120.append(x)
    elif x >= 120:
        list_120.append(x)

# Name -> list lookup over all buckets.
total = {
    "list_0_1" : list_0_1,
    "list_2_10" : list_2_10,
    "list_10_20" : list_10_20,
    "list_20_30" : list_20_30,
    "list_30_40" : list_30_40,
    "list_40_50" : list_40_50,
    "list_50_60" : list_50_60,
    "list_60_70" : list_60_70,
    "list_70_80" : list_70_80,
    "list_80_90" : list_80_90,
    "list_90_100" : list_90_100,
    "list_100_110" : list_100_110,
    "list_110_120" : list_110_120,
    "list_120" : list_120,
    }