In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from xgboost import plot_importance

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

from torch import nn
import neural_network
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('vectorizedQueries.csv', index_col=0)

In [3]:
seed = 100
test_size = 0.15
# 'actual_cardinality', 'postgres_cardinality', 

X = df.drop(columns=["actual_cardinality"], axis=1)
y = df["actual_cardinality"]

# note: stratify=df.buy generates
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=test_size, random_state=seed)
X_train2 = X_train.drop(['sql_query', 'postgres_cardinality'], axis=1)
X_test2 = X_test.drop(['sql_query', 'postgres_cardinality'], axis=1)

In [4]:
X_train2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
10810,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5561,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,6534376
12963,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5275,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4959,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### XGB Model

In [5]:
# xgb_model = XGBRegressor(max_depth=5, n_estimators = 2000, subsample=0.8, reg_lambda = 1)

xgb_model = XGBRegressor(max_depth=10, n_estimators = 1000, subsample=0.8, reg_lambda = 1)
xgb_model.fit(X_train2, y_train)

In [6]:
# some preds are negative
preds_tr = np.clip(xgb_model.predict(X_train2), a_min=0, a_max=None)
preds_te = np.clip(xgb_model.predict(X_test2), a_min=0, a_max=None)


In [7]:
mean_squared_error(y_train, preds_tr)

0.004161020613759705

In [8]:
mean_squared_error(y_train, X_train['postgres_cardinality'])

65214.556647058824

In [9]:
mean_squared_error(y_test, preds_te)

21036.917400328293

In [10]:
mean_squared_error(y_test, X_test['postgres_cardinality'])

63981.378666666664

### NN Model

In [11]:
len(pd.value_counts(X_train2['0']))

2

In [12]:
for i in X_train2.columns:
    if len(pd.value_counts(X_train2[i])) > 2:
        X_train2[i] = (X_train2[i] - X_train2[i].mean())/X_train2[i].std()

In [13]:
X_train2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
10810,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
5561,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,3.379759
12963,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
5275,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
4959,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
79,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
12119,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
14147,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610


In [14]:
n, d = X_train2.shape
input_dim = d
hidden_dim = d
output_dim = 1
num_epochs = 10000
model_nn = neural_network.Model(input_dim, hidden_dim, output_dim)

model_nn = neural_network.train_regression_model(torch.tensor(X_train2.values, dtype=torch.float32), 
                                      torch.tensor(y_train.values, dtype=torch.float32), 
                                      model_nn, 
                                      num_epochs, 
                                      loss_fn = nn.MSELoss(),
                                      # loss_fn = losses.DiceLoss(),
                                      # loss_fn = nn.BCELoss(weight=torch.tensor(classes_weights)), 
                                      lr=1e-3, l2_reg=1e-2, print_freq=1000, display_loss=True)

epoch 1000 loss 5030121.0
epoch 2000 loss 2811863.0
epoch 3000 loss 1983645.5
epoch 4000 loss 1604014.75
epoch 5000 loss 1348431.625
epoch 6000 loss 1150774.375
epoch 7000 loss 998199.875
epoch 8000 loss 904863.0
epoch 9000 loss 861229.625
epoch 10000 loss 842840.4375


In [15]:
model_nn.eval()
preds_tr_nn = model_nn(torch.tensor(X_train2.values, dtype=torch.float32)).detach().numpy().flatten()
preds_te_nn = model_nn(torch.tensor(X_test2.values, dtype=torch.float32)).detach().numpy().flatten()

In [16]:
print(mean_squared_error(y_train, preds_tr_nn))
print(mean_squared_error(y_test, preds_te_nn))

842828.2711385464
3061825031937699.0


In [19]:
X_train['actual_cardinality'] = y_train
X_train['XGB_cardinality'] = preds_tr
X_train['nn_cardinality'] = preds_tr_nn

In [22]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,postgres_cardinality,sql_query,actual_cardinality,XGB_cardinality,nn_cardinality
10810,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,4078,"SELECT * FROM city, country WHERE country_code...",4078,4078.034424,3900.600830
5561,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,6534376,1,SELECT * FROM country WHERE country.capital !=...,1,1.022614,1.878058
12963,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,177,SELECT * FROM country WHERE country.gnp_old <=...,177,177.024185,36.194050
5275,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,177,SELECT * FROM country WHERE country.gnp_old !=...,178,177.966522,963.511841
4959,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,51,"SELECT * FROM city, country WHERE country_code...",615,614.983215,1115.785767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,1,"SELECT * FROM city, country WHERE country_code...",1,0.961005,152.897675
79,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,3942,"SELECT * FROM city, country WHERE country_code...",4079,4079.024658,3792.045654
12119,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,SELECT * FROM city WHERE city.id = 2024,1,0.981665,34.478500
14147,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,3021,"SELECT * FROM city, country WHERE country_code...",3671,3670.950684,2757.372559


In [23]:
X_test['actual_cardinality'] = y_test
X_test['XGB_cardinality'] = preds_te
X_test['nn_cardinality'] = preds_te_nn

In [24]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,postgres_cardinality,sql_query,actual_cardinality,XGB_cardinality,nn_cardinality
2917,1,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,9,"SELECT * FROM city, country WHERE country_code...",6,5.906070,5.639770e+07
2234,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,3891,"SELECT * FROM city, country WHERE country_code...",3792,3781.121826,2.582096e+04
14396,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1632,216,SELECT * FROM country WHERE country.gnp_old <=...,216,236.449890,3.228973e+07
1781,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1619,1618,SELECT * FROM city WHERE city.id < 1940 AND ci...,1618,1370.534302,4.162102e+04
11102,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,68,SELECT * FROM country WHERE country.capital >=...,67,49.320698,1.829471e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,3030,"SELECT * FROM city, country WHERE country_code...",3030,3017.067871,7.416139e+03
4303,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,226,SELECT * FROM country WHERE country.capital <=...,226,224.234390,2.330930e+04
14292,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,1099578,3872,SELECT * FROM city WHERE city.population != 18...,3871,4141.564941,2.946772e+07
9954,0,1,0,1,0,0,0,0,0,0,...,0,0,1,0,1082578,4023,SELECT * FROM city WHERE city.id >= 1089 OR ci...,4011,3787.303955,2.968349e+07


In [25]:
X_train.to_csv('train_set.csv', index=False)
X_test.to_csv('test_set.csv', index=False)