In [None]:
import git
import json
import time

import numpy as np
from scipy.stats import norm
import torch
from torch.utils.data import DataLoader

import data.data_utils as data_utils
import data.education_wage_original_data_generator as education_wage_original_data_generator
import models.model_fitting as model_fitting
import models.model_mlp as model_mlp
import utils

# Marker output: reaching this line means every import above succeeded.
print("executing")


In [None]:
# Seed the run through the project helper; the task id doubles as the seed
# and is also recorded in the parameters as the case id.
task_id = 1234
utils.set_random_seeds(task_id)

# Every tunable of this run lives in one dict; it is stored alongside the
# results so the output file is self-describing.
params = {
    # --- Run identity -------------------------------------------------------
    "case_id": task_id,
    # --- Dataset sizes ------------------------------------------------------
    # With "n_validate" at 0 the training cell validates on the training data.
    "n_train": 2990,
    "n_validate": 0,
    "n_test": 0,
    # --- Network and optimisation -------------------------------------------
    "batch_size": 64,
    "decoder_hidden_sizes": [30, 30, 30, 30, 30],
    "learning_rate": 0.001,
    "max_epochs": 5000,
    "weight_decay": 0.01,
    # --- Learning-rate reduction and early-stopping settings ----------------
    "lr_reducer_patience": 50,
    "lr_reducer_factor": 0.1,
    "patience_in_epochs": 100,
    # --- Adam betas ---------------------------------------------------------
    "adam_beta_1": 0.97,
    "adam_beta_2": 0.997,
    # --- Wall-clock budget handed to the fitting routine, in seconds --------
    "max_training_time_s": 600000,
    # --- Data / model variant -----------------------------------------------
    "outcome": "lwage",
    "uses_t_instead_of_w": True,
    # --- q annealing settings forwarded to the fitting routine --------------
    "num_q_annealing_epochs": 1,
    "q_initial_weight": 1.0,
}

In [None]:


# Accumulates timings here; later cells add metrics and metadata to it.
output = {}

# --- Data ---------------------------------------------------------------------
time_start_s: float = time.monotonic()

data_generator = education_wage_original_data_generator.Generator(params["outcome"])
train_data = data_generator.generate_data(params["n_train"])

# Without a dedicated validation split, validate on the training data itself.
if params["n_validate"] > 0:
    validate_data = data_generator.generate_data(params["n_validate"])
else:
    validate_data = train_data

train_dataloader = DataLoader(
    train_data,
    shuffle=True,
    batch_size=params["batch_size"],
)

output["time_data_generated_s"] = time.monotonic() - time_start_s

# --- Model --------------------------------------------------------------------
time_start_s = time.monotonic()

model = model_mlp.Mlp(
    hidden_sizes=params["decoder_hidden_sizes"],
    uses_t_instead_of_w=params["uses_t_instead_of_w"],
    z_dim=train_data.z.shape[1],
)

output["time_model_created_s"] = time.monotonic() - time_start_s

# --- Training -----------------------------------------------------------------
time_start_s = time.monotonic()

# Hyperparameters whose keyword name in fit_model equals their key in `params`.
fit_hyperparameters = {
    name: params[name]
    for name in (
        "learning_rate",
        "weight_decay",
        "num_q_annealing_epochs",
        "q_initial_weight",
        "lr_reducer_patience",
        "lr_reducer_factor",
        "patience_in_epochs",
        "adam_beta_1",
        "adam_beta_2",
        "max_training_time_s",
    )
}

# NOTE(review): the keyword is spelled `max_epocs` in this call; it is kept
# as-is because fit_model's signature is not visible from this notebook.
output["was_run_interrupted_due_to_time"] = model_fitting.fit_model(
    model=model,
    train_dataloader=train_dataloader,
    validate_data=validate_data,
    max_epocs=params["max_epochs"],
    **fit_hyperparameters,
)

output["time_model_trained_s"] = time.monotonic() - time_start_s


In [None]:

def _y_sd_estimate(fitted_model, data):
    """Return ``fitted_model.get_y_sd_estimate`` evaluated on the z, t and y of ``data``."""
    return fitted_model.get_y_sd_estimate(
        torch.from_numpy(data.z),
        torch.from_numpy(data.t),
        torch.from_numpy(data.y),
    )


def _scalar_loss(fitted_model, data):
    """Evaluate the model loss on all of ``data`` as one batch and return it as a Python number."""
    full_batch = [
        torch.tensor(data.z),
        torch.tensor(data.t),
        torch.tensor(data.w),
        torch.tensor(data.y),
        None,  # fifth batch slot is left empty here
    ]
    return fitted_model.loss(batch=full_batch).detach().item()


# Provenance: commit of the enclosing git repository and the parameters used.
repo = git.Repo(search_parent_directories=True)
output["git_commit_hash"] = repo.head.object.hexsha
output["params"] = params

# Metrics on the training and validation data (sd estimates first, then losses).
output["train_y_sd_estimate"] = _y_sd_estimate(model, train_data)
output["validate_y_sd_estimate"] = _y_sd_estimate(model, validate_data)
output["train_loss"] = _scalar_loss(model, train_data)
output["validate_loss"] = _scalar_loss(model, validate_data)

# Show the collected results and persist them as JSON; `out_dir` is reused by
# the cells below for every other artefact.
out_dir = "data/education_wage_data/"
print("Output:")
print(json.dumps(output, sort_keys=True, indent=4))
with open(out_dir + "augmenting_model_output.json", "w") as output_file:
    json.dump(output, output_file, sort_keys=True, indent=4)

In [None]:
# Multiplier applied to the model's estimated y standard deviation to obtain
# the standard deviation of the generated y values.
y_sd_multiplier = 0.1

# Noise levels for the generated w; each one is multiplied by the standard
# deviation of t, and one dataset is written per level.
w_sds = [
    0.0,
    0.2,
    0.4,
    0.6,
    0.8,
    1.0,
]


In [None]:
# Persist the whole fitted model object next to the other outputs.
augmenting_model_path = out_dir + "education_wage_augmenting_model.pickle"
torch.save(model, augmenting_model_path)

In [None]:
# Fixed seed for the data generation below, separate from the training seed.
utils.set_random_seeds(479)

# The model's y standard-deviation estimate on the training data, scaled by
# `y_sd_multiplier`, becomes the noise level of the generated y values.
y_noise_estimate = model.get_y_sd_estimate(
    torch.tensor(train_data.z),
    torch.tensor(train_data.t),
    torch.tensor(train_data.y),
)

print("y_noise_estimate:")
print(y_noise_estimate)

y_sd = y_sd_multiplier * y_noise_estimate

# Covariates and treatment come straight from the data generator's own
# `z` / `t` attributes rather than from `train_data`.
z = data_generator.z
t = data_generator.t

# Model output for every (z, t) pair, used as the mean of the generated y.
mean_estimates = model(torch.tensor(z), torch.tensor(t)).detach()
print("mean_estimates:")
print(mean_estimates)

generated_y = np.random.normal(loc=mean_estimates, scale=y_sd).astype(np.float32)
print("generated_y:")
print(generated_y)

# The final cell of the notebook reads `dataset_no_noise`, which was never
# assigned anywhere, so a fresh Restart & Run All ended in a NameError. It is
# captured here from the zero-noise iteration; it stays None if `w_sds` holds
# no 0.0 entry.
dataset_no_noise = None

# One dataset per noise level: w is t plus Gaussian noise whose standard
# deviation is `w_sd` times the standard deviation of t. y is shared by all.
for w_sd in w_sds:
    noise = np.random.normal(0, w_sd * t.std(), t.shape).astype(np.float32)
    # NOTE(review): reshaping the noise to a column assumes t is (n, 1);
    # a 1-D t would broadcast to (n, n) here - confirm against the generator.
    dataset = data_utils.MeDataset(z=z, t=t, w=t + noise.reshape((-1, 1)), y=generated_y)
    torch.save(dataset, out_dir + f"education_wage_data_augmented_noise_{int(100*w_sd)}_percent.pickle")
    if w_sd == 0.0:
        dataset_no_noise = dataset

In [None]:
# Store the y noise level used for generation as plain text (no trailing newline).
with open(out_dir + "y_sd.txt", "w") as y_sd_file:
    print(y_sd, end="", file=y_sd_file)

In [None]:
# Feature count reported as the number of columns of z plus one.
num_available_features = data_generator.z.shape[1] + 1
print("The number of available features: ", num_available_features)

In [None]:
# NOTE(review): `dataset_no_noise` is not assigned in this cell; it has to
# exist in the kernel namespace before this line runs, otherwise a fresh
# Restart & Run All fails here with a NameError.
print(dataset_no_noise.w)