# Import

In [7]:
import os
import random
from math import sqrt
from pickle import dump

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer # OneHotEncoder, LabelEncoder,
import torch

from data import *
from generation import *

from tvae import TVAE
from gan import GAN
from ctgan import CTGAN
from ddpm import DDPM
from tiny import TINY

In [8]:
# Force synchronous CUDA kernel launches so that a GPU-side error is reported
# at the call that triggered it (debugging aid; it slows training down).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Select the compute device shared by every model below: the GPU when PyTorch
# can see one, otherwise the CPU. `torch` must be imported explicitly in the
# imports cell; do not rely on it leaking out of a star import.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device is", device)

device is cuda


In [10]:
# Move into the data directory: every relative path used below (raw CSV,
# preprocessed CSVs, pickled normalizers, model checkpoints) resolves against
# the current working directory.
# Guarded so that re-running this cell after the first chdir is a no-op
# instead of failing while looking for ./Data/Data.
if os.path.basename(os.getcwd()) != 'Data':
    os.chdir('./Data')

# Datasets

In [13]:
# Load the raw table from the current working directory and preview the
# first rows.
raw_csv_path = "./raw_data.csv"
df_raw = pd.read_csv(raw_csv_path)
df_raw.head()

Unnamed: 0,Coord X,Coord Y,Month,Day,Hour,Duration,Incident
0,573603.0,6280852.0,1,0,0,54.0,4
1,558522.0,6263928.0,1,0,0,32.0,18
2,574303.94,6279553.0,1,0,0,25.0,7
3,571710.7,6305442.5,1,0,0,167.0,27
4,569896.0,6265672.0,1,0,0,41.0,1


In [14]:
# Replace each cyclic time feature with a pair of columns produced by
# `encode_periodic(value, period)`, working on a copy so `df_raw` stays intact.
df_sincos = df_raw.copy()

# (source column, period passed to encode_periodic, output column pair)
# NOTE(review): "Day" is encoded with period 365 -- confirm the column really
# holds a day-of-year value rather than day-of-week / day-of-month.
periodic_features = (
    ("Month", 12, ["Month_sin", "Month_cos"]),
    ("Day", 365, ["Day_sin", "Day_cos"]),
    ("Hour", 24, ["Hour_sin", "Hour_cos"]),
)
for source_col, period, encoded_cols in periodic_features:
    df_sincos[encoded_cols] = df_sincos[source_col].apply(
        lambda value: pd.Series(encode_periodic(value, period))
    )

# Keep only the modelling columns, in a fixed order; the raw Month/Day/Hour
# columns are dropped here.
df_sincos = df_sincos[[
    'Coord X', 'Coord Y',
    "Month_sin", "Month_cos",
    "Day_sin", "Day_cos",
    "Hour_sin", "Hour_cos",
    'Duration', 'Incident',
]]
df_sincos.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,573603.0,6280852.0,0.5,0.866025,0.0,1.0,0.0,1.0,54.0,4
1,558522.0,6263928.0,0.5,0.866025,0.0,1.0,0.0,1.0,32.0,18
2,574303.94,6279553.0,0.5,0.866025,0.0,1.0,0.0,1.0,25.0,7
3,571710.7,6305442.5,0.5,0.866025,0.0,1.0,0.0,1.0,167.0,27
4,569896.0,6265672.0,0.5,0.866025,0.0,1.0,0.0,1.0,41.0,1


# Training

## TVAE

In [15]:
# Build the TVAE training frame from the sin/cos-encoded data.
df_prep_tvae = df_sincos.copy()

# Columns rescaled for TVAE. "Incident" is left untouched here; it is passed
# to the model as a discrete column at fit time.
cols = ["Coord X", "Coord Y", "Duration"]

to_QT = df_prep_tvae[cols].values

# Min-max scale the selected columns into (-0.95, 0.95).
normalizer_tvae = MinMaxScaler(feature_range=(-0.95, 0.95))

df_prep_tvae[cols] = normalizer_tvae.fit_transform(to_QT)

df_prep_tvae.to_csv("df_prep_tvae.csv", index=False, header=True)

# Persist the fitted scaler next to the CSV. The context manager guarantees
# the pickle file is flushed and closed (a bare open() left the handle open).
with open('normalizer_tvae.pkl', 'wb') as normalizer_file:
    dump(normalizer_tvae, normalizer_file)

df_prep_tvae.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,0.222084,0.483737,0.5,0.866025,0.0,1.0,0.0,1.0,-0.88035,4
1,0.00454,0.235693,0.5,0.866025,0.0,1.0,0.0,1.0,-0.915985,18
2,0.232195,0.464698,0.5,0.866025,0.0,1.0,0.0,1.0,-0.927323,7
3,0.194787,0.844144,0.5,0.866025,0.0,1.0,0.0,1.0,-0.697315,27
4,0.16861,0.261253,0.5,0.866025,0.0,1.0,0.0,1.0,-0.901407,1


In [16]:
# Hyperparameters unpacked as keyword arguments into the TVAE constructor.
params_tvae = {
    "epochs": 5000,
    "batch_size": 1024,
    "embedding_dim": 64,
    "compress_dims": (256, 512),
    "decompress_dims": (256, 512),
    "lr": 1e-4,
    "l2scale": 1e-5,
    "loss_factor": 2,
    "verbose": True,
    "model_path": "./model_tvae.pt",
    "device": device,
}

In [17]:
# Instantiate the TVAE wrapper, unpacking `params_tvae` as keyword arguments.
tvae = TVAE(**params_tvae)

In [18]:
# Train TVAE on the min-max-scaled frame; "Incident" is declared as the only
# discrete column.
tvae.fit(df_prep_tvae, discrete_columns=["Incident"])

Deco. (4.22): 100%|██████████| 5000/5000 [03:27<00:00, 24.10it/s] 


## GAN

In [19]:
# Build the GAN training frame from the sin/cos-encoded data.
df_prep_gan = df_sincos.copy()

# Columns rescaled for the GAN. Unlike the TVAE/CTGAN preprocessing,
# "Incident" is included here and therefore scaled like a numeric column.
cols = ["Coord X", "Coord Y", "Duration", "Incident"]

to_QT = df_prep_gan[cols].values

# Min-max scale the selected columns into (-0.95, 0.95).
normalizer_gan = MinMaxScaler(feature_range=(-0.95, 0.95))

df_prep_gan[cols] = normalizer_gan.fit_transform(to_QT)

df_prep_gan.to_csv("df_prep_gan.csv", index=False, header=True)

# Persist the fitted scaler next to the CSV. The context manager guarantees
# the pickle file is flushed and closed (a bare open() left the handle open).
with open('normalizer_gan.pkl', 'wb') as normalizer_file:
    dump(normalizer_gan, normalizer_file)

df_prep_gan.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,0.222084,0.483737,0.5,0.866025,0.0,1.0,0.0,1.0,-0.88035,-0.85
1,0.00454,0.235693,0.5,0.866025,0.0,1.0,0.0,1.0,-0.915985,-0.383333
2,0.232195,0.464698,0.5,0.866025,0.0,1.0,0.0,1.0,-0.927323,-0.75
3,0.194787,0.844144,0.5,0.866025,0.0,1.0,0.0,1.0,-0.697315,-0.083333
4,0.16861,0.261253,0.5,0.866025,0.0,1.0,0.0,1.0,-0.901407,-0.95


In [20]:
# Hyperparameters unpacked as keyword arguments into the GAN constructor.
params_gan = {
    "epochs": 5000,
    "batch_size": 1024,
    "embedding_dim": 128,
    "hidden_dim": 1024,
    "generator_lr": 1e-4,
    "discriminator_lr": 1e-4,
    "pac": 64,
    "verbose": True,
    "model_path": "./model_gan.pt",
    "device": device,
}


In [21]:
# Instantiate the GAN wrapper, unpacking `params_gan` as keyword arguments.
gan = GAN(**params_gan)

In [22]:
# Train the GAN on the scaled frame. No discrete columns are declared:
# "Incident" was min-max scaled together with the other columns in the
# preprocessing cell.
gan.fit(df_prep_gan)

Generator(
  (layer1): Linear(in_features=128, out_features=1024, bias=True)
  (layer2): Linear(in_features=1024, out_features=1024, bias=True)
  (layer3): Linear(in_features=1024, out_features=10, bias=True)
  (bn1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (leaky_relu): LeakyReLU(negative_slope=0.2)
)
Discriminator(
  (layer1): Linear(in_features=10, out_features=1024, bias=True)
  (layer2): Linear(in_features=1024, out_features=1024, bias=True)
  (layer3): Linear(in_features=1024, out_features=1, bias=True)
  (leaky_relu): LeakyReLU(negative_slope=0.2)
  (sigmoid): Sigmoid()
)


Gen. (1.80) | Discrim. (0.40): 100%|██████████| 5000/5000 [02:02<00:00, 40.77it/s]


## CTGAN

In [23]:
# Build the CTGAN training frame from the sin/cos-encoded data.
df_prep_ctgan = df_sincos.copy()

# Columns transformed for CTGAN. "Incident" is left untouched here; it is
# passed to the model as a discrete column at fit time.
cols = ["Coord X", "Coord Y", "Duration"]

to_QT = df_prep_ctgan[cols].values

# Quantile-transform the selected columns to a normal output distribution.
# `subsample` is the maximum number of rows used to estimate the quantiles;
# 1e9 is presumably chosen so that no row is dropped -- TODO confirm.
normalizer_ctgan = QuantileTransformer(
        output_distribution='normal',
        n_quantiles=2000, # alternative heuristic: max(min(to_QT.shape[0] // 30, 1000), 10)
        subsample=int(1e9),
        random_state=42)

df_prep_ctgan[cols] = normalizer_ctgan.fit_transform(to_QT)

df_prep_ctgan.to_csv("df_prep_ctgan.csv", index=False, header=True)

# Persist the fitted transformer next to the CSV. The context manager
# guarantees the pickle file is flushed and closed (a bare open() left the
# handle open).
with open('normalizer_ctgan.pkl', 'wb') as normalizer_file:
    dump(normalizer_ctgan, normalizer_file)

df_prep_ctgan.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,0.18701,0.617517,0.5,0.866025,0.0,1.0,0.0,1.0,-0.700297,4
1,-0.927791,-0.747918,0.5,0.866025,0.0,1.0,0.0,1.0,-1.533866,18
2,0.336667,0.315508,0.5,0.866025,0.0,1.0,0.0,1.0,-1.887973,7
3,-0.082914,2.267025,0.5,0.866025,0.0,1.0,0.0,1.0,1.674425,27
4,-0.307887,-0.684045,0.5,0.866025,0.0,1.0,0.0,1.0,-1.153697,1


In [24]:
# Width shared by both hidden layers of the generator and the discriminator.
hidden_dim = 1024

# Hyperparameters unpacked as keyword arguments into the CTGAN constructor.
params_ctgan = {
    "epochs": 5000,
    # NOTE(review): the original comments ("% pac" / "bs") suggest batch_size
    # must be a multiple of pac -- confirm against the CTGAN implementation.
    "batch_size": 1024,
    "pac": 64,
    "embedding_dim": 128,
    "generator_dim": (hidden_dim,) * 2,
    "discriminator_dim": (hidden_dim,) * 2,
    "generator_lr": 1e-4,
    "discriminator_lr": 1e-4,
    "verbose": True,
    "model_path": "./model_ctgan.pt",
    "device": device,
}

In [25]:
# Instantiate the CTGAN wrapper, unpacking `params_ctgan` as keyword arguments.
ctgan = CTGAN(**params_ctgan)

In [26]:
# Train CTGAN on the quantile-transformed frame; "Incident" is declared as the
# only discrete column.
ctgan.fit(df_prep_ctgan, discrete_columns=["Incident"])

Gen. (-0.66) | Discrim. (0.36): 100%|██████████| 5000/5000 [07:54<00:00, 10.54it/s] 


## DDPM

In [27]:
# Build the DDPM training frame from the sin/cos-encoded data.
df_prep_ddpm = df_sincos.copy()

# Columns transformed for DDPM; "Incident" is not transformed in this cell.
cols = ["Coord X", "Coord Y", "Duration"]

to_QT = df_prep_ddpm[cols].values

# Quantile-transform the selected columns to a normal output distribution.
# `subsample` is the maximum number of rows used to estimate the quantiles;
# 1e9 is presumably chosen so that no row is dropped -- TODO confirm.
normalizer_ddpm = QuantileTransformer(
        output_distribution='normal',
        n_quantiles=1000, # alternative heuristic: max(min(to_QT.shape[0] // 30, 1000), 10)
        subsample=int(1e9),
        random_state=42)

df_prep_ddpm[cols] = normalizer_ddpm.fit_transform(to_QT)

df_prep_ddpm.to_csv("df_prep_ddpm.csv", index=False, header=True)

# Persist the fitted transformer next to the CSV. The context manager
# guarantees the pickle file is flushed and closed (a bare open() left the
# handle open).
with open('normalizer_ddpm.pkl', 'wb') as normalizer_file:
    dump(normalizer_ddpm, normalizer_file)

df_prep_ddpm.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,0.187646,0.618059,0.5,0.866025,0.0,1.0,0.0,1.0,-0.700711,4
1,-0.92804,-0.748038,0.5,0.866025,0.0,1.0,0.0,1.0,-1.537691,18
2,0.336178,0.31633,0.5,0.866025,0.0,1.0,0.0,1.0,-1.887753,7
3,-0.082526,2.269421,0.5,0.866025,0.0,1.0,0.0,1.0,1.674186,27
4,-0.30793,-0.683996,0.5,0.866025,0.0,1.0,0.0,1.0,-1.154615,1


In [28]:
# Shift the "Incident" labels down by one for tabddpm.
# The column is derived from the untouched `df_sincos` labels rather than
# decremented in place, so re-running this cell cannot shift the labels twice.
df_prep_ddpm["Incident"] = df_sincos["Incident"] - 1
df_prep_ddpm.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,0.187646,0.618059,0.5,0.866025,0.0,1.0,0.0,1.0,-0.700711,3
1,-0.92804,-0.748038,0.5,0.866025,0.0,1.0,0.0,1.0,-1.537691,17
2,0.336178,0.31633,0.5,0.866025,0.0,1.0,0.0,1.0,-1.887753,6
3,-0.082526,2.269421,0.5,0.866025,0.0,1.0,0.0,1.0,1.674186,26
4,-0.30793,-0.683996,0.5,0.866025,0.0,1.0,0.0,1.0,-1.154615,0


In [29]:
# Wrap the prepared frame into the dataset object consumed by `ddpm.fit`,
# using "Incident" as `col`.
# NOTE(review): the meaning of the empty list (second positional argument) and
# of `dummy=False` is not visible in this notebook -- confirm against
# `raw_dataset_from_df`.
dataset = raw_dataset_from_df(df_prep_ddpm, [], dummy = False, col = "Incident")

In [30]:
# Hyperparameters unpacked as keyword arguments into the DDPM constructor.
params_ddpm = {
    "epochs": 20000,
    "batch_size": 4096,
    "num_timesteps": 1000,
    "layers": 1024,
    "lr": 0.0025,
    "dim_t": 128,
    "weight_decay": 0,  # alternative tried: 1e-05
    "model_name": "mlp",
    "gaussian_loss_type": "mse",  # options noted by the author: "mse", "kl"
    "multinomial_loss_type": 'vb_stochastic',  # options: 'vb_stochastic', 'vb_all'
    "parametrization": 'x0',  # options: 'x0', 'direct'
    "scheduler": "cosine",  # options: "cosine", "linear"
    "is_y_cond": True,
    "verbose": True,
    "model_path": "./model_ddpm.pt",
    "device": device,
}

In [31]:
# Instantiate the DDPM wrapper, unpacking `params_ddpm` as keyword arguments.
ddpm = DDPM(**params_ddpm)

In [32]:
# Train the diffusion model on the dataset built from `df_prep_ddpm`.
ddpm.fit(dataset)

K [0]
9
{'d_in': 9, 'is_y_cond': True, 'num_classes': 58, 'rtdl_params': {'d_layers': [1024, 1024], 'dropout': 0.0}, 'dim_t': 128}
mlp
label embedding Embedding(58, 128)
diffusion ready


mloss (0.00) | gloss (0.28): 100%|██████████| 20000/20000 [10:49<00:00, 30.77it/s]


## TINY

In [33]:
# Build the TINY training frame from the sin/cos-encoded data.
df_prep_tiny = df_sincos.copy()

# Columns transformed for TINY. "Incident" is included here and therefore
# quantile-transformed like a numeric column.
cols = ["Coord X", "Coord Y", "Duration", "Incident"]

to_QT = df_prep_tiny[cols].values

# Quantile-transform the selected columns to a normal output distribution.
# Uses the `QuantileTransformer` name imported at the top of the notebook,
# like the CTGAN/DDPM cells; the `sklearn` package name itself is never
# imported here.
# `subsample` is the maximum number of rows used to estimate the quantiles;
# 1e9 is presumably chosen so that no row is dropped -- TODO confirm.
normalizer_tiny = QuantileTransformer(
                    output_distribution='normal',
                    n_quantiles=2000, # alternative heuristic: max(min(to_QT.shape[0] // 30, 1000), 10)
                    subsample=int(1e9),
                    random_state=42)

df_prep_tiny[cols] = normalizer_tiny.fit_transform(to_QT)

df_prep_tiny.to_csv("df_prep_tiny.csv", index=False, header=True)

# Persist the fitted transformer next to the CSV. The context manager
# guarantees the pickle file is flushed and closed (a bare open() left the
# handle open).
with open('normalizer_tiny.pkl', 'wb') as normalizer_file:
    dump(normalizer_tiny, normalizer_file)

df_prep_tiny.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,0.18701,0.617517,0.5,0.866025,0.0,1.0,0.0,1.0,-0.700297,-0.243131
1,-0.927791,-0.747918,0.5,0.866025,0.0,1.0,0.0,1.0,-1.533866,0.708332
2,0.336667,0.315508,0.5,0.866025,0.0,1.0,0.0,1.0,-1.887973,0.116251
3,-0.082914,2.267025,0.5,0.866025,0.0,1.0,0.0,1.0,1.674425,1.037185
4,-0.307887,-0.684045,0.5,0.866025,0.0,1.0,0.0,1.0,-1.153697,-5.199338


In [34]:
# Hyperparameters unpacked as keyword arguments into the TINY constructor.
params_tiny = {
    "epochs": 5000,
    "batch_size": 4096,
    "num_timesteps": 1000,
    "lr": 0.0025,
    "hidden_size": 1024,
    "hidden_layers": 3,
    "embedding_size": 128,
    "time_embedding": "sinusoidal",
    "input_embedding": "sinusoidal",
    "scale": 2.0,
    "verbose": True,
    "model_path": "./model_tiny.pt",
    "device": device,
}

In [35]:
# Instantiate the TINY wrapper, unpacking `params_tiny` as keyword arguments.
tiny = TINY(**params_tiny)

In [36]:
# Train TINY on the quantile-transformed frame. No discrete columns are
# declared: "Incident" was transformed together with the other columns in the
# preprocessing cell.
tiny.fit(df_prep_tiny)

  0%|          | 0/5000 [00:00<?, ?it/s]