In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from xgboost import plot_importance

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

from torch import nn
import neural_network
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('vectorizedQueries.csv', index_col=0)

In [3]:
seed = 100
test_size = 0.15
# 'actual_cardinality', 'postgres_cardinality', 

X = df.drop(columns=["actual_cardinality"], axis=1)
y = df["actual_cardinality"]

# note: stratify=df.buy generates
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=test_size, random_state=seed)
X_train2 = X_train.drop(['sql_query', 'postgres_cardinality'], axis=1)
X_test2 = X_test.drop(['sql_query', 'postgres_cardinality'], axis=1)

In [4]:
X_train2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
10810,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5561,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,6534376
12963,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5275,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4959,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### XGB Model

In [5]:
# xgb_model = XGBRegressor(max_depth=5, n_estimators = 2000, subsample=0.8, reg_lambda = 1)

xgb_model = XGBRegressor(max_depth=10, n_estimators = 1000, subsample=0.8, reg_lambda = 1)
xgb_model.fit(X_train2, y_train)

In [6]:
# some preds are negative
preds_tr = np.clip(xgb_model.predict(X_train2), a_min=0, a_max=None)
preds_te = np.clip(xgb_model.predict(X_test2), a_min=0, a_max=None)


In [7]:
mean_squared_error(y_train, preds_tr)

0.004161020613759705

In [8]:
mean_squared_error(y_train, X_train['postgres_cardinality'])

65214.556647058824

In [9]:
mean_squared_error(y_test, preds_te)

21036.917400328293

In [10]:
mean_squared_error(y_test, X_test['postgres_cardinality'])

63981.378666666664

### NN Model

In [11]:
len(pd.value_counts(X_train2['0']))

2

In [12]:
for i in X_train2.columns:
    if len(pd.value_counts(X_train2[i])) > 2:
        X_train2[i] = (X_train2[i] - X_train2[i].mean())/X_train2[i].std()

In [13]:
X_train2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
10810,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
5561,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,3.379759
12963,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
5275,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
4959,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
79,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
12119,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610
14147,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.312610


In [14]:
n, d = X_train2.shape
input_dim = d
hidden_dim = d
output_dim = 1
num_epochs = 10000
model_nn = neural_network.Model(input_dim, hidden_dim, output_dim)

model_nn = neural_network.train_regression_model(torch.tensor(X_train2.values, dtype=torch.float32), 
                                      torch.tensor(y_train.values, dtype=torch.float32), 
                                      model_nn, 
                                      num_epochs, 
                                      loss_fn = nn.MSELoss(),
                                      # loss_fn = losses.DiceLoss(),
                                      # loss_fn = nn.BCELoss(weight=torch.tensor(classes_weights)), 
                                      lr=1e-3, l2_reg=1e-2, print_freq=1000, display_loss=True)

epoch 1000 loss 4406147.0
epoch 2000 loss 2884199.75
epoch 3000 loss 1958583.5
epoch 4000 loss 1477067.75
epoch 5000 loss 1192816.375


In [15]:
model_nn.eval()
preds_tr_nn = model_nn(torch.tensor(X_train2.values, dtype=torch.float32)).detach().numpy().flatten()
preds_te_nn = model_nn(torch.tensor(X_test2.values, dtype=torch.float32)).detach().numpy().flatten()

In [16]:
print(mean_squared_error(y_train, preds_tr_nn))
print(mean_squared_error(y_test, preds_te_nn))

1192588.1711378484
1.3898428521285446e+16
