# Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os
from math import sqrt
from pickle import load

# Imported explicitly: torch.device / torch.cuda are used right after this cell
# and must not depend on a star import happening to re-export `torch`.
import torch
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer

from data import *
from generation import *

from tvae import TVAE
from gan import GAN
from ctgan import CTGAN
from ddpm import DDPM
from tiny import TINY

import time

In [2]:
import gc
# Run a full garbage-collection pass; the number shown as the cell output is
# gc.collect()'s return value (count of unreachable objects found).
gc.collect()

40

In [3]:
# Ask CUDA to run kernel launches synchronously so errors surface at the failing call.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"device is {device}")

device is cuda


In [4]:
# Every path used below (CSV inputs, pickled normalizers, model files, CSV outputs)
# is relative to ./Data. Only descend when that folder is visible from the current
# directory, so re-running this cell after the first chdir no longer raises.
if os.path.isdir('./Data'):
    os.chdir('./Data')

# Raw incident records; the synthetic sample size is derived from their count.
df_raw = pd.read_csv("./raw_data.csv")
df_raw.head()

Unnamed: 0,Coord X,Coord Y,Month,Day,Hour,Duration,Incident
0,573603.0,6280852.0,1,0,0,54.0,4
1,558522.0,6263928.0,1,0,0,32.0,18
2,574303.94,6279553.0,1,0,0,25.0,7
3,571710.7,6305442.5,1,0,0,167.0,27
4,569896.0,6265672.0,1,0,0,41.0,1


# Sampling

In [5]:
# Three synthetic rows per raw row (at least x3 is needed for resampling).
num_samples = 3 * len(df_raw)

## TVAE

In [6]:
# Pre-processed frame; handed to tvae.sample() and reused for its column names.
df_prep_tvae = pd.read_csv("./df_prep_tvae.csv")

# TVAE hyper-parameters, forwarded unchanged to the constructor.
params_tvae = dict(
    epochs=5000,
    batch_size=1024,
    embedding_dim=64,
    compress_dims=(256, 512),
    decompress_dims=(256, 512),
    lr=1e-4,
    l2scale=1e-5,
    loss_factor=2,
    verbose=True,
    model_path="./model_tvae.pt",  # presumably the checkpoint used by sample() — TODO confirm
    device=device,
)

tvae = TVAE(**params_tvae)

In [7]:
# Create synthetic data
# Wall-clock time (seconds) covers only the sample() call; model construction
# happened in the previous cell and is not included.
start_time = time.time()
data = tvae.sample(df_prep_tvae, num_samples, discrete_columns=["Incident"])
tvae_time = time.time() - start_time
print(tvae_time)

36.90308427810669


In [8]:
# Wrap the sampled output in a frame that reuses the pre-processed column names.
df_res = pd.DataFrame(data, columns = df_prep_tvae.columns)

In [9]:
# Columns whose scaling is undone by the pickled normalizer.
cols = ["Coord X", "Coord Y", "Duration"]

# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('normalizer_tvae.pkl', 'rb') as normalizer_file:
    normalizer_tvae = load(normalizer_file)

QT_inv = normalizer_tvae.inverse_transform(df_res[cols].values)
df_tvae = pd.DataFrame(data = QT_inv, columns = cols)
# Incident is copied over as sampled; it is not passed through the normalizer.
df_tvae["Incident"] = df_res["Incident"]

In [10]:
# Preview the inverse-transformed columns before the calendar fields are decoded.
df_tvae.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident
0,574828.172208,6276488.0,73.629071,1
1,576104.729337,6263678.0,122.406542,1
2,573592.348023,6279383.0,57.101437,1
3,575324.569656,6280996.0,59.451604,1
4,574474.47285,6277470.0,45.084973,1


In [11]:
# Rebuild each calendar field from its sin/cos pair; the third argument to
# decode_periodic is the cycle length (12 months, 365 days, 24 hours).
for _field, _period in (("Month", 12), ("Day", 365), ("Hour", 24)):
    df_tvae[_field] = df_res.apply(
        lambda row: decode_periodic(row[_field + '_sin'], row[_field + '_cos'], _period),
        axis=1,
    )

# Integer-valued columns in the final table.
df_tvae["Duration"] = df_tvae["Duration"].astype(int)
df_tvae["Incident"] = df_tvae["Incident"].astype(int)

# Wrap decoded values that land just outside the valid calendar range.
df_tvae.loc[df_tvae["Month"].lt(1), "Month"] = 12
df_tvae.loc[df_tvae["Day"].gt(364), "Day"] = 0
df_tvae.loc[df_tvae["Hour"].gt(23), "Hour"] = 0

# Bound Incident to 1..58 and floor Duration at 11.
df_tvae["Incident"] = df_tvae["Incident"].clip(lower=1, upper=58)
df_tvae["Duration"] = df_tvae["Duration"].clip(lower=11)

df_tvae.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Day,Hour
0,574828.172208,6276488.0,73,1,1,26,21
1,576104.729337,6263678.0,122,1,9,291,15
2,573592.348023,6279383.0,57,1,10,295,21
3,575324.569656,6280996.0,59,1,4,123,15
4,574474.47285,6277470.0,45,1,1,39,18


In [12]:
# Sanity check: one row per requested sample, seven output columns.
df_tvae.shape

(160401, 7)

In [13]:
# Persist the decoded TVAE samples (path is relative to the current working directory).
df_tvae.to_csv("tvae.csv", index=False, header=True)

## GAN

In [14]:
# Pre-processed frame; handed to gan.sample() and reused for its column names.
df_prep_gan = pd.read_csv("./df_prep_gan.csv")

# GAN hyper-parameters, forwarded unchanged to the constructor.
params_gan = dict(
    epochs=5000,
    batch_size=1024,
    embedding_dim=128,
    hidden_dim=1024,
    generator_lr=1e-4,
    discriminator_lr=1e-4,
    pac=64,
    verbose=True,
    model_path="./model_gan.pt",  # presumably the checkpoint used by sample() — TODO confirm
    device=device,
)

gan = GAN(**params_gan)

In [15]:
# Create synthetic data
# The timed region includes moving the result to the CPU and converting it to
# a NumPy array, not just the sample() call itself.
start_time = time.time()
data = gan.sample(df_prep_gan, num_samples).cpu().detach().numpy()
gan_time = time.time() - start_time
print(gan_time)

0.2957639694213867


In [16]:
# Sampled output with the pre-processed column names attached.
df_res = pd.DataFrame(data, columns = df_prep_gan.columns)
# Unlike the TVAE section, Incident is also passed through the normalizer here.
cols = ["Coord X", "Coord Y", "Duration", "Incident"]

# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('normalizer_gan.pkl', 'rb') as normalizer_file:
    normalizer_gan = load(normalizer_file)

QT_inv = normalizer_gan.inverse_transform(df_res[cols].values)
df_gan = pd.DataFrame(data = QT_inv, columns = cols)

In [17]:
# Preview the inverse-transformed columns; values are still floats and unclamped here.
df_gan.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident
0,536831.0625,6222450.0,165.234741,8.758404
1,576483.375,6277401.5,151.515472,54.296017
2,565925.9375,6285182.0,-22.262941,15.623742
3,585413.25,6268545.5,100.179436,7.544322
4,586029.5,6263835.0,108.201622,3.263763


In [18]:
# Rebuild each calendar field from its sin/cos pair; the third argument to
# decode_periodic is the cycle length (12 months, 365 days, 24 hours).
for _field, _period in (("Month", 12), ("Day", 365), ("Hour", 24)):
    df_gan[_field] = df_res.apply(
        lambda row: decode_periodic(row[_field + '_sin'], row[_field + '_cos'], _period),
        axis=1,
    )

# Integer-valued columns in the final table (astype(int) truncates toward zero).
df_gan["Duration"] = df_gan["Duration"].astype(int)
df_gan["Incident"] = df_gan["Incident"].astype(int)

# Wrap decoded values that land just outside the valid calendar range.
df_gan.loc[df_gan["Month"].lt(1), "Month"] = 12
df_gan.loc[df_gan["Day"].gt(364), "Day"] = 0
df_gan.loc[df_gan["Hour"].gt(23), "Hour"] = 0

# Bound Incident to 1..58 and floor Duration at 11.
df_gan["Incident"] = df_gan["Incident"].clip(lower=1, upper=58)
df_gan["Duration"] = df_gan["Duration"].clip(lower=11)

df_gan.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Day,Hour
0,536831.0625,6222450.0,165,8,9,266,15
1,576483.375,6277401.5,151,54,4,107,12
2,565925.9375,6285182.0,11,15,7,199,16
3,585413.25,6268545.5,100,7,7,186,16
4,586029.5,6263835.0,108,3,7,192,16


In [19]:
# Sanity check: one row per requested sample, seven output columns.
df_gan.shape

(160401, 7)

In [20]:
# Persist the decoded GAN samples (path is relative to the current working directory).
df_gan.to_csv("gan.csv", index=False, header=True)

## CTGAN

In [21]:
# Pre-processed frame; handed to ctgan.sample() together with the discrete column list.
df_prep_ctgan = pd.read_csv("./df_prep_ctgan.csv")

# Width shared by both generator layers and both discriminator layers.
hidden_dim = 1024

# CTGAN hyper-parameters, forwarded unchanged to the constructor.
params_ctgan = dict(
    epochs=5000,
    batch_size=1024,  # original note "% pac": presumably must be divisible by pac — TODO confirm
    pac=64,  # original note "bs": presumably tied to batch_size — TODO confirm
    embedding_dim=128,
    generator_dim=(hidden_dim, hidden_dim),
    discriminator_dim=(hidden_dim, hidden_dim),
    generator_lr=1e-4,
    discriminator_lr=1e-4,
    verbose=True,
    model_path="./model_ctgan.pt",  # presumably the checkpoint used by sample() — TODO confirm
    device=device,
)

ctgan = CTGAN(**params_ctgan)

In [22]:
# Time the sampling call. Note the argument order: here the sample count comes
# first and the frame second, the reverse of the tvae.sample() / gan.sample() calls.
start_time = time.time()
df_res = ctgan.sample(num_samples, df_prep_ctgan, discrete_columns=["Incident"])
ctgan_time = time.time() - start_time
print(ctgan_time)

34.022122859954834


In [23]:
# Preview the raw samples: continuous columns are still in normalized space.
df_res.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,-0.15787,-1.283922,-0.898838,0.983862,0.158936,0.185772,0.017411,-0.893928,-0.17939,7
1,-1.23002,-3.648718,-1.001573,0.882451,-1.011614,0.622946,-1.024823,-0.239799,3.142198,1
2,0.08203,-2.892666,-1.011121,0.503664,-0.341834,0.961834,0.279856,-0.697425,-1.112262,5
3,0.382181,-0.340718,0.88079,0.860822,1.038474,0.454142,-0.50563,-0.996606,1.951655,1
4,0.829365,0.734491,-0.000864,-0.857602,0.754552,0.829645,-0.010916,-0.014178,-1.580664,11


In [24]:
# Columns whose scaling is undone by the pickled normalizer.
cols = ["Coord X", "Coord Y", "Duration"]

# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('normalizer_ctgan.pkl', 'rb') as normalizer_file:
    normalizer_ctgan = load(normalizer_file)

QT_inv = normalizer_ctgan.inverse_transform(df_res[cols].values)
df_ctgan = pd.DataFrame(data = QT_inv, columns = cols)
# Incident is copied over as sampled; it is not passed through the normalizer.
df_ctgan["Incident"] = df_res["Incident"]

In [25]:

# Rebuild the calendar fields from their sin/cos pairs. The Month, Hour, Day
# insertion order is kept because it determines the column order of ctgan.csv.
df_ctgan["Month"] = df_res.apply(lambda row: decode_periodic(row['Month_sin'], row['Month_cos'], 12), axis=1)
df_ctgan["Hour"] = df_res.apply(lambda row: decode_periodic(row['Hour_sin'], row['Hour_cos'], 24), axis=1)
df_ctgan["Day"] = df_res.apply(lambda row: decode_periodic(row['Day_sin'], row['Day_cos'], 365), axis=1)
df_ctgan["Duration"] = df_ctgan["Duration"].astype(int)
# Floor Duration at 11, as the TVAE, GAN, DDPM and TINY sections do. The earlier
# `-= df_ctgan["Duration"].min()` shifted every sub-11 row by the global minimum,
# which could leave values below 11 (or push them lower when the minimum is
# positive) and made this generator's output incomparable with the others.
df_ctgan.loc[df_ctgan["Duration"] < 11, "Duration"] = 11
# Wrap decoded values that land just outside the valid calendar range.
df_ctgan.loc[df_ctgan["Day"] > 364, "Day"] = 0
df_ctgan.loc[df_ctgan["Month"] < 1, "Month"] = 12
df_ctgan.loc[df_ctgan["Hour"] > 23, "Hour"] = 0
df_ctgan.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Hour,Day
0,571160.257765,6239837.0,73,7,11,12,41
1,544885.132935,6184630.0,820,1,10,17,306
2,573040.671469,6190357.0,42,5,10,11,345
3,574441.862495,6274431.0,190,1,2,14,67
4,576428.020805,6281663.0,32,11,6,15,43


In [26]:
# Sanity check: one row per requested sample, seven output columns.
df_ctgan.shape

(160401, 7)

In [27]:
# Persist the decoded CTGAN samples (path is relative to the current working directory).
df_ctgan.to_csv("ctgan.csv", index=False, header=True)

## DDPM

In [28]:
# Pre-processed frame; turned into a dataset object for ddpm.sample() in the next cell.
df_prep_ddpm = pd.read_csv("./df_prep_ddpm.csv")

# DDPM hyper-parameters, forwarded unchanged to the constructor. Trailing notes
# list the alternative values recorded next to each option.
params_ddpm = dict(
    epochs=20000,
    batch_size=4096,
    num_timesteps=1000,
    layers=1024,
    lr=0.0025,
    dim_t=128,
    weight_decay=0,  # alternative: 1e-05
    model_name="mlp",
    gaussian_loss_type="mse",  # "mse" or "kl"
    multinomial_loss_type='vb_stochastic',  # 'vb_stochastic' or 'vb_all'
    parametrization='x0',  # 'x0' or 'direct'
    scheduler="cosine",  # "cosine" or "linear"
    is_y_cond=True,
    verbose=True,
    model_path="./model_ddpm.pt",  # presumably the checkpoint used by sample() — TODO confirm
    device=device,
)

ddpm = DDPM(**params_ddpm)

In [29]:
# Build the dataset object consumed by ddpm.sample(). "Incident" is passed as
# `col` — presumably the label column the model is conditioned on; TODO confirm.
dataset = raw_dataset_from_df(df_prep_ddpm, [], dummy = False, col = "Incident")

In [30]:
# Create synthetic data
# sample() returns the feature matrix and the labels separately.
# The third argument (8192) is presumably the sampling batch size: 160401 rows
# would need 20 batches, matching the 20 progress lines printed — TODO confirm.

start_time = time.time()
X_gen, y_gen = ddpm.sample(dataset, num_samples, 8192)
ddpm_time = time.time() - start_time
print(ddpm_time)

[0]
9
{'d_in': 9, 'is_y_cond': True, 'num_classes': 58, 'rtdl_params': {'d_layers': [1024, 1024], 'dropout': 0.0}, 'dim_t': 128}
mlp
label embedding Embedding(58, 128)
diffusion ready
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
163.08762979507446


In [31]:
# Feature names for X_gen: every pre-processed column except the label.
cols = df_prep_ddpm.columns.tolist()
cols.remove("Incident")

df_res = pd.DataFrame(X_gen, columns=cols)
df_res.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration
0,1.534313,-0.695147,-0.000437,-0.989545,0.361954,-0.932208,0.589233,-0.800419,0.49158
1,0.367825,0.242242,-0.866449,0.50455,-0.902264,0.445285,0.882317,0.479807,-0.927896
2,-1.237535,-0.861632,0.50397,0.864535,0.398802,0.925277,0.471412,-0.868868,1.121619
3,0.081003,1.105807,0.995095,0.006572,0.920214,0.36347,0.883839,-0.482168,0.634276
4,-1.159341,-0.375547,0.503492,-0.856357,0.675089,-0.717124,-0.671992,0.742278,0.871429


In [32]:
# Columns whose scaling is undone by the pickled normalizer.
cols = ["Coord X", "Coord Y", "Duration"]

# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('normalizer_ddpm.pkl', 'rb') as normalizer_file:
    normalizer_ddpm = load(normalizer_file)

# Build the frame directly from the inverse-transformed array, the same way the
# other generator sections do, instead of assigning into an empty frame.
df_ddpm = pd.DataFrame(
    data=normalizer_ddpm.inverse_transform(df_res[cols].values),
    columns=cols,
)

In [33]:
# Preview the inverse-transformed columns before the calendar fields and labels are added.
df_ddpm.head()

Unnamed: 0,Coord X,Coord Y,Duration
0,584303.40898,6265068.0,100.0
1,574401.163785,6279183.0,48.0
2,544504.557899,6260727.0,135.0
3,573033.476225,6285422.0,107.0
4,550028.964308,6273734.0,121.0


In [34]:
# Rebuild each calendar field from its sin/cos pair. The Day, Month, Hour order
# is kept because it determines the column order of ddpm.csv.
for _field, _period in (("Day", 365), ("Month", 12), ("Hour", 24)):
    df_ddpm[_field] = df_res.apply(
        lambda row: decode_periodic(row[_field + '_sin'], row[_field + '_cos'], _period),
        axis=1,
    )

df_ddpm["Duration"] = df_ddpm["Duration"].astype(int)

# Labels come back separately from the features and are shifted up by one
# (presumably from 0-based class indices to 1-based incident codes — TODO confirm).
df_ddpm["Incident"] = y_gen + 1

# Wrap decoded values that land just outside the valid calendar range.
df_ddpm.loc[df_ddpm["Month"].lt(1), "Month"] = 12
df_ddpm.loc[df_ddpm["Day"].gt(364), "Day"] = 0
df_ddpm.loc[df_ddpm["Hour"].gt(23), "Hour"] = 0

# Floor Duration at 11.
df_ddpm["Duration"] = df_ddpm["Duration"].clip(lower=11)

df_ddpm.head()

Unnamed: 0,Coord X,Coord Y,Duration,Day,Month,Hour,Incident
0,584303.40898,6265068.0,100,161,6,10,1
1,574401.163785,6279183.0,48,300,10,4,7
2,544504.557899,6260727.0,135,24,1,10,3
3,573033.476225,6285422.0,107,69,3,8,9
4,550028.964308,6273734.0,121,139,5,21,16


In [35]:
# Sanity check: one row per requested sample, seven output columns.
df_ddpm.shape

(160401, 7)

In [36]:
# Persist the decoded DDPM samples (path is relative to the current working directory).
df_ddpm.to_csv("ddpm.csv", index=False, header=True)

## TINY

In [37]:
# Pre-processed frame; handed to tiny.sample() and reused for its column names.
df_prep_tiny = pd.read_csv("./df_prep_tiny.csv")

# TINY hyper-parameters, forwarded unchanged to the constructor.
params_tiny = dict(
    epochs=5000,
    batch_size=4096,
    num_timesteps=1000,
    lr=0.0025,
    hidden_size=1024,
    hidden_layers=3,
    embedding_size=128,
    time_embedding="sinusoidal",
    input_embedding="sinusoidal",
    scale=2.0,
    verbose=True,
    model_path="./model_tiny.pt",  # presumably the checkpoint used by sample() — TODO confirm
    device=device,
)

tiny = TINY(**params_tiny)

In [38]:
# Create synthetic data
# Wall-clock time (seconds) covers only the sample() call.

start_time = time.time()
np_tiny = tiny.sample(df_prep_tiny, num_samples)
tiny_time = time.time() - start_time
print(tiny_time)

372.6878125667572


In [39]:
# Wrap the sampled output in a frame that reuses the pre-processed column names;
# Incident is still a continuous, normalized value at this point.
df_res = pd.DataFrame(np_tiny, columns = df_prep_tiny.columns)
df_res.head()

Unnamed: 0,Coord X,Coord Y,Month_sin,Month_cos,Day_sin,Day_cos,Hour_sin,Hour_cos,Duration,Incident
0,1.38806,-0.865267,-0.93706,0.303339,-0.967518,0.282121,-0.68939,-0.743374,0.557614,-0.096988
1,0.790685,-0.04251,0.33934,-0.948256,0.559051,-0.803786,0.128456,0.99406,-1.783188,0.258512
2,0.16254,0.464648,0.456357,0.880662,0.380987,1.031024,0.624572,0.828503,-0.55546,0.714565
3,0.115994,-0.295543,-0.044149,-0.989161,0.390758,-0.887945,0.919854,0.347915,0.858128,0.514191
4,-0.105984,0.870739,0.494334,0.838266,0.18617,1.045652,0.000895,0.985242,0.169729,-5.232302


In [40]:
# Incident is modelled as a continuous column here, so it is inverse-transformed
# together with the coordinates and the duration.
cols = ["Coord X", "Coord Y", "Duration", "Incident"]

# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
# The context manager closes the file handle, which the bare open() call leaked.
with open('normalizer_tiny.pkl', 'rb') as normalizer_file:
    normalizer_tiny = load(normalizer_file)

QT_inv = normalizer_tiny.inverse_transform(df_res[cols].values)
df_tiny = pd.DataFrame(data = QT_inv, columns = cols)
df_tiny.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident
0,581540.401351,6260626.0,105.0,5.0
1,576164.28905,6277264.0,27.0,9.0
2,573494.0,6280154.0,59.0,18.0
3,573232.076536,6275144.0,119.747325,13.0
4,571539.524912,6282859.0,87.0,1.0


In [41]:
# Rebuild the calendar fields from their sin/cos pairs; the third argument to
# decode_periodic is the cycle length (12 months, 365 days, 24 hours).
df_tiny["Month"] = df_res.apply(lambda row: decode_periodic(row['Month_sin'], row['Month_cos'], 12), axis=1)
df_tiny["Day"] = df_res.apply(lambda row: decode_periodic(row['Day_sin'], row['Day_cos'], 365), axis=1)
df_tiny["Hour"] = df_res.apply(lambda row: decode_periodic(row['Hour_sin'], row['Hour_cos'], 24), axis=1)

# Integer-valued columns in the final table (astype(int) truncates toward zero).
df_tiny["Duration"] = df_tiny["Duration"].astype(int)
df_tiny["Incident"] = df_tiny["Incident"].astype(int)

# Wrap decoded values that land just outside the valid calendar range.
df_tiny.loc[df_tiny["Month"] < 1, "Month"] = 12
df_tiny.loc[df_tiny["Day"] > 364, "Day"] = 0
df_tiny.loc[df_tiny["Hour"] > 23, "Hour"] = 0

# Bound Incident to 1..58. The upper bound was missing here although the GAN
# section, which also treats Incident as a continuous output, applies it; any
# value above 58 would be an incident code that the other outputs never contain.
df_tiny.loc[df_tiny["Incident"] <= 0, "Incident"] = 1
df_tiny.loc[df_tiny["Incident"] > 58, "Incident"] = 58

# Floor Duration at 11, as in the other sections.
df_tiny.loc[df_tiny["Duration"] < 11, "Duration"] = 11


df_tiny.head()

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Day,Hour
0,581540.401351,6260626.0,105,5,10,290,15
1,576164.28905,6277264.0,27,9,5,147,0
2,573494.0,6280154.0,59,18,1,21,2
3,573232.076536,6275144.0,119,13,6,158,5
4,571539.524912,6282859.0,87,1,1,10,0


In [42]:
# Sanity check: one row per requested sample, seven output columns.
df_tiny.shape

(160401, 7)

In [43]:
# Persist the decoded TINY samples (path is relative to the current working directory).
df_tiny.to_csv("tiny.csv", index=False, header=True)