# Import

In [1]:
# Standard library
import os
import random
from math import sqrt
from pickle import load

# Third-party
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

# Project-local helpers (wildcard imports kept in their original order)
from data import *
from generation import *

from tvae import TVAE
from gan import GAN
from ctgan import CTGAN
from ddpm import DDPM
from tiny import TINY

# Imported after the wildcard imports so these names always refer to the real
# modules, even if a wildcard import exports an object with the same name.
import gc
import time
import torch

In [2]:
# Force a garbage-collection pass before the models are loaded; the displayed
# value is the number of unreachable objects the collector found.
# (`gc` is imported in the import cell at the top of the notebook.)
gc.collect()

0

In [3]:
# Ask CUDA to run kernel launches synchronously so errors surface at the call
# that caused them (useful for debugging and for the timings below).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device is", device)

device is cuda


In [4]:
# Every later cell reads and writes files relative to the Data directory.
# Only change directory when we are not already inside it, so this cell can be
# re-run without raising FileNotFoundError on a second chdir.
if os.path.basename(os.getcwd()) != 'Data':
    os.chdir('./Data')

# Raw incident table; its length drives the number of synthetic samples drawn.
df_raw = pd.read_csv("./raw_data.csv")
df_raw.head()

Unnamed: 0,Coord X,Coord Y,Month,Day,Hour,Duration,Incident
0,573603.0,6280852.0,1,0,0,54.0,4
1,558522.0,6263928.0,1,0,0,32.0,18
2,574303.94,6279553.0,1,0,0,25.0,7
3,571710.7,6305442.5,1,0,0,167.0,27
4,569896.0,6265672.0,1,0,0,41.0,1


# Sampling

In [5]:
# Draw three synthetic rows per real row (at least x3 is needed for resampling).
num_samples = 3 * len(df_raw)

## TVAE

In [6]:
# Preprocessed table handed to the TVAE sampler in the next cell.
df_prep_tvae = pd.read_csv("./df_prep_tvae.csv")

# Keyword arguments for the project's TVAE wrapper.
# NOTE(review): presumably these must match the configuration used when
# ./model_tvae.pt was trained — confirm against the training notebook.
params_tvae = dict(
    epochs=5000,
    batch_size=1024,
    embedding_dim=64,
    compress_dims=(256, 512),
    decompress_dims=(256, 512),
    lr=1e-4,
    l2scale=1e-5,
    loss_factor=2,
    verbose=True,
    model_path="./model_tvae.pt",
    device=device,
)

tvae = TVAE(**params_tvae)

In [7]:
# Create synthetic data and record how long sampling takes.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
data = tvae.sample(df_prep_tvae, num_samples, discrete_columns=["Incident"])
tvae_time = time.perf_counter() - start_time  # elapsed seconds
print(tvae_time)

19.411877155303955


In [8]:
# Label the sampled TVAE rows with the column names of the preprocessed table.
df_res = pd.DataFrame(data=data, columns=df_prep_tvae.columns)

In [9]:
# Columns that are passed through the normalizer's inverse transform.
cols = ["Coord X", "Coord Y", "Duration"]

# Load the pickled normalizer inside a context manager so the file handle is
# closed deterministically instead of being left open.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('normalizer_tvae.pkl', 'rb') as normalizer_file:
    normalizer_tvae = load(normalizer_file)

# Map the sampled values back through the normalizer.
QT_inv = normalizer_tvae.inverse_transform(df_res[cols].values)
df_tvae = pd.DataFrame(data = QT_inv, columns = cols)
# Incident is not part of `cols`, so it is copied over unchanged.
df_tvae["Incident"] = df_res["Incident"]

In [10]:
# Preview the first rows of the inverse-transformed TVAE samples.
df_tvae.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident
0,575230.290009,6276768.0,51.876606,1
1,571197.537046,6280447.0,45.656005,1
2,573961.11837,6279698.0,99.980189,1
3,572484.883149,6279668.0,65.107954,1
4,574257.750364,6281422.0,45.441084,1


In [11]:
# Rebuild the calendar fields from their (sin, cos) pairs. The loop order
# Month, Day, Hour keeps the resulting column order unchanged.
for field, period in (("Month", 12), ("Day", 365), ("Hour", 24)):
    df_tvae[field] = df_res.apply(
        lambda row, f=field, p=period: decode_periodic(row[f + "_sin"], row[f + "_cos"], p),
        axis=1,
    )

# Duration and Incident are stored as whole numbers.
df_tvae = df_tvae.astype({"Duration": int, "Incident": int})

# Wrap values that fell just outside the valid calendar range.
df_tvae["Month"] = df_tvae["Month"].mask(df_tvae["Month"] < 1, 12)
df_tvae["Day"] = df_tvae["Day"].mask(df_tvae["Day"] > 364, 0)
df_tvae["Hour"] = df_tvae["Hour"].mask(df_tvae["Hour"] > 23, 0)

# Keep Incident within [1, 58] and enforce a minimum Duration of 11.
# NOTE(review): 58 presumably is the number of incident classes and 11 the
# smallest duration in the raw data — confirm.
df_tvae["Incident"] = df_tvae["Incident"].clip(lower=1, upper=58)
df_tvae["Duration"] = df_tvae["Duration"].clip(lower=11)

df_tvae.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Day,Hour
0,575230.290009,6276768.0,51,1,2,59,21
1,571197.537046,6280447.0,45,1,6,169,21
2,573961.11837,6279698.0,99,1,11,152,15
3,572484.883149,6279668.0,65,1,8,115,21
4,574257.750364,6281422.0,45,1,7,210,23


In [12]:
# Sanity check: (rows, columns) of the post-processed TVAE samples.
df_tvae.shape

(160401, 7)

In [13]:
# Write the post-processed TVAE samples to tvae.csv (header row kept, index column omitted).
df_tvae.to_csv("tvae.csv", index=False, header=True)

## GAN

In [14]:
# Preprocessed table handed to the GAN sampler in the next cell.
df_prep_gan = pd.read_csv("./df_prep_gan.csv")

# Keyword arguments for the project's GAN wrapper.
# NOTE(review): presumably these must match the configuration used when
# ./model_gan.pt was trained — confirm against the training notebook.
params_gan = dict(
    epochs=5000,
    batch_size=1024,
    embedding_dim=128,
    hidden_dim=1024,
    generator_lr=1e-4,
    discriminator_lr=1e-4,
    pac=64,
    verbose=True,
    model_path="./model_gan.pt",
    device=device,
)

gan = GAN(**params_gan)

In [15]:
# Create synthetic data and record how long sampling takes.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
# The sampler's result is moved to the CPU and converted to a NumPy array
# inside the timed region.
data = gan.sample(df_prep_gan, num_samples).cpu().detach().numpy()
gan_time = time.perf_counter() - start_time  # elapsed seconds
print(gan_time)

0.13467907905578613


In [16]:
# Label the sampled GAN rows with the column names of the preprocessed table.
df_res = pd.DataFrame(data, columns = df_prep_gan.columns)
# For the GAN, Incident goes through the normalizer together with the
# continuous columns.
cols = ["Coord X", "Coord Y", "Duration", "Incident"]

# Load the pickled normalizer inside a context manager so the file handle is
# closed deterministically instead of being left open.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('normalizer_gan.pkl', 'rb') as normalizer_file:
    normalizer_gan = load(normalizer_file)

# Map the sampled values back through the normalizer.
QT_inv = normalizer_gan.inverse_transform(df_res[cols].values)
df_gan = pd.DataFrame(data = QT_inv, columns = cols)

In [17]:
# Preview the first rows of the inverse-transformed GAN samples.
df_gan.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident
0,570622.25,6284536.5,43.748409,5.794269
1,565095.0,6245780.0,77.060852,1.427055
2,581810.75,6282982.0,119.061272,17.861954
3,563976.125,6282531.0,86.326668,20.384741
4,559966.9375,6264186.5,138.104141,22.770052


In [18]:
# Rebuild the calendar fields from their (sin, cos) pairs. The loop order
# Month, Day, Hour keeps the resulting column order unchanged.
for field, period in (("Month", 12), ("Day", 365), ("Hour", 24)):
    df_gan[field] = df_res.apply(
        lambda row, f=field, p=period: decode_periodic(row[f + "_sin"], row[f + "_cos"], p),
        axis=1,
    )

# Duration and Incident come back as floats; truncate them to whole numbers.
df_gan = df_gan.astype({"Duration": int, "Incident": int})

# Wrap values that fell just outside the valid calendar range.
df_gan["Month"] = df_gan["Month"].mask(df_gan["Month"] < 1, 12)
df_gan["Day"] = df_gan["Day"].mask(df_gan["Day"] > 364, 0)
df_gan["Hour"] = df_gan["Hour"].mask(df_gan["Hour"] > 23, 0)

# Keep Incident within [1, 58] and enforce a minimum Duration of 11.
# NOTE(review): 58 presumably is the number of incident classes and 11 the
# smallest duration in the raw data — confirm.
df_gan["Incident"] = df_gan["Incident"].clip(lower=1, upper=58)
df_gan["Duration"] = df_gan["Duration"].clip(lower=11)

df_gan.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Day,Hour
0,570622.25,6284536.5,43,5,4,110,13
1,565095.0,6245780.0,77,1,3,84,10
2,581810.75,6282982.0,119,17,9,259,1
3,563976.125,6282531.0,86,20,3,66,20
4,559966.9375,6264186.5,138,22,9,249,19


In [19]:
# Sanity check: (rows, columns) of the post-processed GAN samples.
df_gan.shape

(160401, 7)

In [20]:
# Write the post-processed GAN samples to gan.csv (header row kept, index column omitted).
df_gan.to_csv("gan.csv", index=False, header=True)

## CTGAN

In [21]:
# Preprocessed table handed to the CTGAN sampler in the next cell.
df_prep_ctgan = pd.read_csv("./df_prep_ctgan.csv")

# Width shared by both layers of the generator and the discriminator.
hidden_dim = 1024

# Keyword arguments for the project's CTGAN wrapper.
# NOTE(review): presumably these must match the configuration used when
# ./model_ctgan.pt was trained — confirm against the training notebook.
params_ctgan = dict(
    epochs=5000,
    batch_size=1024,  # presumably has to be divisible by `pac` — TODO confirm
    pac=64,
    embedding_dim=128,
    generator_dim=(hidden_dim, hidden_dim),
    discriminator_dim=(hidden_dim, hidden_dim),
    generator_lr=1e-4,
    discriminator_lr=1e-4,
    verbose=True,
    model_path="./model_ctgan.pt",
    device=device,
)

ctgan = CTGAN(**params_ctgan)

In [22]:
# Create synthetic data and record how long sampling takes.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
# Unlike the other samplers, the sample count is the first argument here.
df_res = ctgan.sample(num_samples, df_prep_ctgan, discrete_columns=["Incident"])
ctgan_time = time.perf_counter() - start_time  # elapsed seconds
print(ctgan_time)

16.15375828742981


In [23]:
# Preview the first rows of the CTGAN sampler output.
df_res.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,-0.15881,-0.088226,-0.997757,0.510573,-0.957515,0.752364,-0.874993,-0.511305,2.186812,55
1,0.05899,-1.043592,-0.846184,0.499193,-1.070287,0.910246,-0.861138,-0.249163,0.261335,1
2,-0.803435,-0.399267,0.0112,-0.879717,0.901494,-0.400242,-0.9733,-0.935516,1.60223,33
3,-0.974465,0.433919,-0.500337,0.472989,1.034555,-0.262288,-0.527049,-0.712911,1.914972,4
4,-2.805794,2.72106,0.853833,-0.518954,0.860503,-0.019266,-0.844639,0.499185,0.674882,1


In [24]:
# Columns that are passed through the normalizer's inverse transform.
cols = ["Coord X", "Coord Y", "Duration"]

# Load the pickled normalizer inside a context manager so the file handle is
# closed deterministically instead of being left open.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('normalizer_ctgan.pkl', 'rb') as normalizer_file:
    normalizer_ctgan = load(normalizer_file)

# Map the sampled values back through the normalizer.
QT_inv = normalizer_ctgan.inverse_transform(df_res[cols].values)
df_ctgan = pd.DataFrame(data = QT_inv, columns = cols)
# Incident is not part of `cols`, so it is copied over unchanged.
df_ctgan["Incident"] = df_res["Incident"]

In [25]:

# Rebuild the calendar fields from their (sin, cos) pairs. The order
# Month, Hour, Day is kept so the output column order is unchanged.
df_ctgan["Month"] = df_res.apply(lambda row: decode_periodic(row['Month_sin'], row['Month_cos'], 12), axis=1)
df_ctgan["Hour"] = df_res.apply(lambda row: decode_periodic(row['Hour_sin'], row['Hour_cos'], 24), axis=1)
df_ctgan["Day"] = df_res.apply(lambda row: decode_periodic(row['Day_sin'], row['Day_cos'], 365), axis=1)
df_ctgan["Duration"] = df_ctgan["Duration"].astype(int)

# Enforce the same minimum Duration of 11 as the TVAE, GAN, DDPM and TINY
# pipelines. Subtracting the column minimum from the short rows does not
# guarantee that floor: depending on the sign of the minimum it either leaves
# values below 11 or shifts them far above their sampled value.
df_ctgan.loc[df_ctgan["Duration"] < 11, "Duration"] = 11

# Wrap values that fell just outside the valid calendar range.
df_ctgan.loc[df_ctgan["Day"] > 364, "Day"] = 0
df_ctgan.loc[df_ctgan["Month"] < 1, "Month"] = 12
df_ctgan.loc[df_ctgan["Hour"] > 23, "Hour"] = 0
df_ctgan.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Hour,Day
0,571146.260161,6277021.0,217,55,10,16,312
1,572801.1876,6252033.0,91,1,10,17,315
2,562701.451516,6273217.0,162,33,6,15,116
3,556987.598065,6280039.0,185,4,10,14,106
4,500936.936341,6308300.0,109,1,4,20,93


In [26]:
# Sanity check: (rows, columns) of the post-processed CTGAN samples.
df_ctgan.shape

(160401, 7)

In [27]:
# Write the post-processed CTGAN samples to ctgan.csv (header row kept, index column omitted).
df_ctgan.to_csv("ctgan.csv", index=False, header=True)

## DDPM

In [28]:
# Preprocessed table used to build the DDPM dataset in the next cell.
df_prep_ddpm = pd.read_csv("./df_prep_ddpm.csv")

# Keyword arguments for the project's DDPM wrapper.
# NOTE(review): presumably these must match the configuration used when
# ./model_ddpm.pt was trained — confirm against the training notebook.
params_ddpm = dict(
    epochs=20000,
    batch_size=4096,
    num_timesteps=1000,
    layers=1024,
    lr=0.0025,
    dim_t=128,
    weight_decay=0,  # alternative noted by the author: 1e-05
    model_name="mlp",
    gaussian_loss_type="mse",  # options: "mse", "kl"
    multinomial_loss_type='vb_stochastic',  # options: 'vb_stochastic', 'vb_all'
    parametrization='x0',  # options: 'x0', 'direct'
    scheduler="cosine",  # options: "cosine", "linear"
    is_y_cond=True,
    verbose=True,
    model_path="./model_ddpm.pt",
    device=device,
)

ddpm = DDPM(**params_ddpm)

In [29]:
# Wrap the preprocessed frame in the dataset object expected by ddpm.sample,
# using "Incident" as the designated column.
# NOTE(review): the empty list is presumably the list of categorical feature
# columns — confirm against raw_dataset_from_df.
dataset = raw_dataset_from_df(
    df_prep_ddpm,
    [],
    dummy=False,
    col="Incident",
)

In [30]:
# Create synthetic data and record how long sampling takes.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
# NOTE(review): 8192 is presumably the sampling batch size — confirm against
# the DDPM wrapper.
X_gen, y_gen = ddpm.sample(dataset, num_samples, 8192)
ddpm_time = time.perf_counter() - start_time  # elapsed seconds
print(ddpm_time)

[0]
9
{'d_in': 9, 'is_y_cond': True, 'num_classes': 58, 'rtdl_params': {'d_layers': [1024, 1024], 'dropout': 0.0}, 'dim_t': 128}
mlp
label embedding Embedding(58, 128)
diffusion ready
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
33.94691610336304


In [31]:
# X_gen holds every preprocessed column except the label, so name its columns
# with the preprocessed header minus "Incident".
cols = df_prep_ddpm.columns.drop("Incident").tolist()

df_res = pd.DataFrame(data=X_gen, columns=cols)
df_res.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration
0,0.154646,0.336658,-0.867323,-0.507067,-0.743823,-0.679941,-0.99445,-0.00863,-0.210881
1,0.983134,0.645484,0.489597,-0.871051,0.766395,-0.633158,-0.569125,-0.814567,0.451769
2,-2.933537,-2.920632,0.864872,0.507547,0.766345,0.664636,-0.864935,-0.51702,1.881066
3,0.775321,-0.182661,1.000373,0.001538,0.969153,0.249724,-0.995229,-0.007608,0.577496
4,0.588292,0.38569,0.998919,-0.002053,0.891295,0.472105,-0.883957,0.480986,-1.129748


In [32]:
# Columns that are passed through the normalizer's inverse transform.
cols = ["Coord X", "Coord Y", "Duration"]

# Load the pickled normalizer inside a context manager so the file handle is
# closed deterministically instead of being left open.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('normalizer_ddpm.pkl', 'rb') as normalizer_file:
    normalizer_ddpm = load(normalizer_file)

# Build the frame directly from the inverse-transformed array, the same way
# the other model pipelines do, instead of assigning into an empty DataFrame.
df_ddpm = pd.DataFrame(
    data=normalizer_ddpm.inverse_transform(df_res[cols].values),
    columns=cols,
)

In [33]:
# Preview the first rows of the inverse-transformed DDPM samples.
df_ddpm.head()

Unnamed: 0,Coord X,Coord Y,Duration
0,573454.288006,6279656.0,73.0
1,577272.02567,6281028.0,98.0
2,499627.099596,6190281.0,183.0
3,576069.944102,6276306.0,105.0
4,575163.935982,6279851.0,41.0


In [34]:
# Rebuild the calendar fields from their (sin, cos) pairs. The loop order
# Day, Month, Hour keeps the resulting column order unchanged.
for field, period in (("Day", 365), ("Month", 12), ("Hour", 24)):
    df_ddpm[field] = df_res.apply(
        lambda row, f=field, p=period: decode_periodic(row[f + "_sin"], row[f + "_cos"], p),
        axis=1,
    )

# Duration is stored as a whole number.
df_ddpm = df_ddpm.astype({"Duration": int})

# Generated labels are shifted up by one.
# NOTE(review): presumably y_gen is 0-based while Incident is 1-based — confirm.
df_ddpm["Incident"] = y_gen + 1

# Wrap values that fell just outside the valid calendar range.
df_ddpm["Month"] = df_ddpm["Month"].mask(df_ddpm["Month"] < 1, 12)
df_ddpm["Day"] = df_ddpm["Day"].mask(df_ddpm["Day"] > 364, 0)
df_ddpm["Hour"] = df_ddpm["Hour"].mask(df_ddpm["Hour"] > 23, 0)

# Enforce a minimum Duration of 11.
df_ddpm["Duration"] = df_ddpm["Duration"].clip(lower=11)

df_ddpm.head()

Unnamed: 0,Coord X,Coord Y,Duration,Day,Month,Hour,Incident
0,573454.288006,6279656.0,73,231,8,18,20
1,577272.02567,6281028.0,98,131,5,14,16
2,499627.099596,6190281.0,183,50,2,16,11
3,576069.944102,6276306.0,105,77,3,18,8
4,575163.935982,6279851.0,41,63,3,20,46


In [35]:
# Sanity check: (rows, columns) of the post-processed DDPM samples.
df_ddpm.shape

(160401, 7)

In [36]:
# Write the post-processed DDPM samples to ddpm.csv (header row kept, index column omitted).
df_ddpm.to_csv("ddpm.csv", index=False, header=True)

## TINY

In [37]:
# Preprocessed table handed to the TINY sampler in the next cell.
df_prep_tiny = pd.read_csv("./df_prep_tiny.csv")

# Keyword arguments for the project's TINY wrapper.
# NOTE(review): presumably these must match the configuration used when
# ./model_tiny.pt was trained — confirm against the training notebook.
params_tiny = dict(
    epochs=5000,
    batch_size=4096,
    num_timesteps=1000,
    lr=0.0025,
    hidden_size=1024,
    hidden_layers=3,
    embedding_size=128,
    time_embedding="sinusoidal",
    input_embedding="sinusoidal",
    scale=2.0,
    verbose=True,
    model_path="./model_tiny.pt",
    device=device,
)

tiny = TINY(**params_tiny)

In [38]:
# Create synthetic data and record how long sampling takes.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
np_tiny = tiny.sample(df_prep_tiny, num_samples)
tiny_time = time.perf_counter() - start_time  # elapsed seconds
print(tiny_time)

56.26358413696289


In [39]:
# Label the sampled TINY rows with the column names of the preprocessed table.
df_res = pd.DataFrame(data=np_tiny, columns=df_prep_tiny.columns)
df_res.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,0.025609,0.765042,0.588014,0.820482,-0.004171,0.956793,-0.944957,0.456141,0.226766,0.176215
1,0.67873,0.230481,-0.144383,-0.990454,0.069694,-0.969885,-0.40151,0.946785,0.019423,-5.211408
2,-1.059536,1.308915,0.163597,-0.971429,0.384114,-0.897508,-0.385272,0.969143,-0.374526,-5.205877
3,0.705752,-0.880287,0.661969,-0.745572,0.803733,-0.532698,-0.524185,0.865052,0.994665,-0.549261
4,-0.846328,-0.845924,0.949507,-0.271846,0.99349,0.086267,-0.724325,-0.651495,0.866932,-5.217647


In [40]:
# For TINY, Incident goes through the normalizer together with the continuous
# columns.
cols = ["Coord X", "Coord Y", "Duration", "Incident"]

# Load the pickled normalizer inside a context manager so the file handle is
# closed deterministically instead of being left open.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('normalizer_tiny.pkl', 'rb') as normalizer_file:
    normalizer_tiny = load(normalizer_file)

# Map the sampled values back through the normalizer.
QT_inv = normalizer_tiny.inverse_transform(df_res[cols].values)
df_tiny = pd.DataFrame(data = QT_inv, columns = cols)
df_tiny.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident
0,572548.072144,6281871.0,89.0,8.0
1,575538.127135,6279145.0,80.0,1.0
2,554554.674245,6288340.0,66.0,1.0
3,575679.7,6260031.0,128.0,2.0
4,561728.896463,6261589.0,121.0,1.0


In [41]:
# Rebuild the calendar fields from their (sin, cos) pairs. The loop order
# Month, Day, Hour keeps the resulting column order unchanged.
for field, period in (("Month", 12), ("Day", 365), ("Hour", 24)):
    df_tiny[field] = df_res.apply(
        lambda row, f=field, p=period: decode_periodic(row[f + "_sin"], row[f + "_cos"], p),
        axis=1,
    )

# Duration and Incident come back as floats; truncate them to whole numbers.
df_tiny = df_tiny.astype({"Duration": int, "Incident": int})

# Wrap values that fell just outside the valid calendar range.
df_tiny["Month"] = df_tiny["Month"].mask(df_tiny["Month"] < 1, 12)
df_tiny["Day"] = df_tiny["Day"].mask(df_tiny["Day"] > 364, 0)
df_tiny["Hour"] = df_tiny["Hour"].mask(df_tiny["Hour"] > 23, 0)

# Incident is only floored at 1 here.
# NOTE(review): the GAN pipeline also caps Incident at 58; confirm that the
# missing upper cap is intentional for TINY.
df_tiny["Incident"] = df_tiny["Incident"].clip(lower=1)

# Enforce a minimum Duration of 11.
df_tiny["Duration"] = df_tiny["Duration"].clip(lower=11)

df_tiny.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Day,Hour
0,572548.072144,6281871.0,89,8,1,0,20
1,575538.127135,6279145.0,80,1,6,178,22
2,554554.674245,6288340.0,66,1,6,159,23
3,575679.7,6260031.0,128,2,5,125,22
4,561728.896463,6261589.0,121,1,4,86,15


In [42]:
# Sanity check: (rows, columns) of the post-processed TINY samples.
df_tiny.shape

(160401, 7)

In [43]:
# Write the post-processed TINY samples to tiny.csv (header row kept, index column omitted).
df_tiny.to_csv("tiny.csv", index=False, header=True)