# Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os
from math import sqrt

import geopandas

from data import *
from generation import *

In [2]:
# Set the CUDA_LAUNCH_BLOCKING flag before the device is selected.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
# NOTE(review): `torch` is not imported explicitly in this notebook; it presumably
# arrives through one of the star imports above - confirm.
backend_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(backend_name)
print("device is", device)

device is cuda


In [3]:
# Move into the data directory so the relative file names used below resolve.
# The guard keeps this cell idempotent: re-running it in the same kernel would
# otherwise look for ./Data/Data and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'Data':
    os.chdir('./Data')

In [4]:
# Read the area polygons and merge them into zones.
# NOTE(review): `union_iris` comes from a project module not shown here; the meaning
# of the "cis1" key, the boolean flag and the list of names should be confirmed there.
filename = "pdd.geojson"
gdf_areas = geopandas.read_file(filename)
zones = union_iris(
    gdf_areas,
    "cis1",
    True,
    [
        'MONTGISCARD',
        'AUSSONNE',
        'TOULOUSE - ATLANTA',
        'TOULOUSE - CARSALADE',
        'TOULOUSE - DELRIEU',
    ],
)

# Resampling

In [5]:
# Load the real data set and the output of each generative model.
# The order of the paths must match the order of the names being bound.
csv_paths = (
    "./raw_data.csv",
    "./tvae.csv",
    "./gan.csv",
    "./ctgan.csv",
    "./ddpm.csv",
    "./tiny.csv",
)
df_raw, df_tvae, df_gan, df_ctgan, df_ddpm, df_tiny = map(pd.read_csv, csv_paths)

In [6]:
# Show (rows, columns) for the real frame followed by each synthetic frame.
print(*(frame.shape for frame in (df_raw, df_tvae, df_gan, df_ctgan, df_ddpm, df_tiny)))

(53467, 7) (160401, 7) (160401, 7) (160401, 7) (160401, 7) (160401, 7)


In [7]:
# Turn the real records into point geometries (CRS "2154") and tag each record
# with an area name.
# NOTE(review): `get_point_in_area` is a project helper not shown here; presumably it
# returns the name of the zone containing the point - confirm.
real_points = geopandas.points_from_xy(df_raw['Coord X'], df_raw['Coord Y'])
gdf_real = geopandas.GeoDataFrame(df_raw, geometry=real_points, crs="2154")
df_raw['area_name'] = gdf_real["geometry"].apply(get_point_in_area, args=(zones,))

In [8]:
# variability tolerated in the sampled data
var = 0.02

# compute the number of new samples for each sector
df_new_samples = create_df_new_samples(df_raw, var)

# should return 0 when number of new samples = number of old samples
df_new_samples["delta"].sum()

0

In [9]:
# Preview the first five rows of the per-area sampling plan.
df_new_samples.head(n=5)

Unnamed: 0,area_name,count,new_samples,perc.,delta
0,TOULOUSE - VION,8976,8969,-0.077986,-7
1,TOULOUSE - LOUGNON,8790,8846,0.637088,56
2,COLOMIERS,6390,6332,-0.907668,-58
3,RAMONVILLE - BUCHENS,5016,5087,1.41547,71
4,MURET - MASSAT,4086,4021,-1.590798,-65


## TVAE

In [10]:
# Build point geometries from the TVAE samples and tag each sample with an area name.
tvae_points = geopandas.points_from_xy(df_tvae['Coord X'], df_tvae['Coord Y'])
gdf_fake = geopandas.GeoDataFrame(df_tvae, geometry=tvae_points, crs="2154")
df_tvae['area_name'] = gdf_fake["geometry"].apply(get_point_in_area, args=(zones,))

In [11]:
# Resample the TVAE output according to the per-area plan in df_new_samples.
# NOTE(review): `new_df_sample` is a project helper not shown here - confirm its contract.
df_new = new_df_sample(df_new_samples, df_tvae)
df_new.head(n=5)

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Day,Hour,area_name
0,573917.611455,6277998.0,103,1,6,152,13,TOULOUSE - VION
1,572993.952984,6280714.0,90,1,3,300,15,TOULOUSE - VION
2,573795.353245,6280802.0,33,1,7,211,15,TOULOUSE - VION
3,573401.044721,6273532.0,81,1,4,135,21,TOULOUSE - VION
4,573091.453597,6281721.0,37,1,4,104,22,TOULOUSE - VION


In [12]:
# Persist the resampled TVAE data (with a header row, without the index column).
df_new.to_csv(
    path_or_buf="resample_tvae.csv",
    index=False,
    header=True,
)

## GAN

In [13]:
# Build point geometries from the GAN samples and tag each sample with an area name.
# The GeoDataFrame must wrap df_gan itself (not another model's frame) so that the
# attribute columns and the geometry describe the same rows, and so that this cell
# does not depend on another frame happening to have the same length and index.
gan_points = geopandas.points_from_xy(df_gan['Coord X'], df_gan['Coord Y'])
gdf_fake = geopandas.GeoDataFrame(df_gan, geometry=gan_points, crs="2154")
df_gan['area_name'] = gdf_fake["geometry"].apply(get_point_in_area, args=(zones,))

In [14]:
# Resample the GAN output according to the per-area plan in df_new_samples.
# NOTE(review): `new_df_sample` is a project helper not shown here - confirm its contract.
df_new = new_df_sample(df_new_samples, df_gan)
df_new.head(n=5)

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Day,Hour,area_name
0,573429.1,6274373.0,118,16,8,231,11,TOULOUSE - VION
1,571266.6,6280046.5,135,19,12,335,17,TOULOUSE - VION
2,571044.7,6276829.5,11,1,5,157,14,TOULOUSE - VION
3,573795.4,6285673.5,51,10,10,291,17,TOULOUSE - VION
4,571516.75,6278247.5,133,2,5,134,10,TOULOUSE - VION


In [15]:
# Persist the resampled GAN data (with a header row, without the index column).
df_new.to_csv(
    path_or_buf="resample_gan.csv",
    index=False,
    header=True,
)

## CTGAN

In [16]:
# Build point geometries from the CTGAN samples and tag each sample with an area name.
ctgan_points = geopandas.points_from_xy(df_ctgan['Coord X'], df_ctgan['Coord Y'])
gdf_fake = geopandas.GeoDataFrame(df_ctgan, geometry=ctgan_points, crs="2154")
df_ctgan['area_name'] = gdf_fake["geometry"].apply(get_point_in_area, args=(zones,))

In [17]:
# Resample the CTGAN output according to the per-area plan in df_new_samples.
# NOTE(review): `new_df_sample` is a project helper not shown here - confirm its contract.
df_new = new_df_sample(df_new_samples, df_ctgan)
df_new.head(n=5)

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Hour,Day,area_name
0,572452.517724,6278690.0,29,3,1,0,42,TOULOUSE - VION
1,573494.0,6279253.0,105,24,10,22,303,TOULOUSE - VION
2,572608.781368,6279877.0,27,6,3,0,64,TOULOUSE - VION
3,571912.755844,6282510.0,96,12,11,2,309,TOULOUSE - VION
4,573408.434436,6280631.0,36,3,12,18,305,TOULOUSE - VION


In [18]:
# Persist the resampled CTGAN data (with a header row, without the index column).
df_new.to_csv(
    path_or_buf="resample_ctgan.csv",
    index=False,
    header=True,
)

## DDPM

In [19]:
# Build point geometries from the DDPM samples and tag each sample with an area name.
ddpm_points = geopandas.points_from_xy(df_ddpm['Coord X'], df_ddpm['Coord Y'])
gdf_fake = geopandas.GeoDataFrame(df_ddpm, geometry=ddpm_points, crs="2154")
df_ddpm['area_name'] = gdf_fake["geometry"].apply(get_point_in_area, args=(zones,))

In [20]:
# Resample the DDPM output according to the per-area plan in df_new_samples.
# NOTE(review): `new_df_sample` is a project helper not shown here - confirm its contract.
df_new = new_df_sample(df_new_samples, df_ddpm)
df_new.head(n=5)

Unnamed: 0,Coord X,Coord Y,Duration,Day,Month,Hour,Incident,area_name
0,572709.368182,6277876.0,38,52,2,15,10,TOULOUSE - VION
1,572545.550167,6276899.0,38,46,2,10,2,TOULOUSE - VION
2,571918.338052,6274543.0,34,132,5,18,24,TOULOUSE - VION
3,571459.136565,6280166.0,75,202,7,19,4,TOULOUSE - VION
4,572053.806689,6281747.0,59,192,7,12,31,TOULOUSE - VION


In [21]:
# Persist the resampled DDPM data (with a header row, without the index column).
df_new.to_csv(
    path_or_buf="resample_ddpm.csv",
    index=False,
    header=True,
)

In [22]:
# Get the number of iterations needed to reach the desired sample quantity
# from the DDPM output. Note that this rebinds df_new; the resampled DDPM frame
# has already been written to disk by the previous cell.
# NOTE(review): `get_minimal_sampling` is a project helper not shown here - confirm its contract.
ddpm_minimal = get_minimal_sampling(df_new_samples, df_ddpm)
df_new, cpt_samples = ddpm_minimal

In [23]:
# Display the counter returned by get_minimal_sampling for the DDPM samples.
cpt_samples

100968

In [24]:
# Preview the first five rows returned by get_minimal_sampling for DDPM.
df_new.head(n=5)

Unnamed: 0,Coord X,Coord Y,Duration,Day,Month,Hour,Incident,area_name
0,573454.288006,6279656.0,73,231,8,18,20,TOULOUSE - VION
1,577272.02567,6281028.0,98,131,5,14,16,TOULOUSE - LOUGNON
2,499627.099596,6190281.0,183,50,2,16,11,BAGNERES DE LUCHON
3,576069.944102,6276306.0,105,77,3,18,8,TOULOUSE - LOUGNON
4,575163.935982,6279851.0,41,63,3,20,46,TOULOUSE - LOUGNON


## TINY

In [25]:
# Build point geometries from the TINY samples and tag each sample with an area name.
tiny_points = geopandas.points_from_xy(df_tiny['Coord X'], df_tiny['Coord Y'])
gdf_fake = geopandas.GeoDataFrame(df_tiny, geometry=tiny_points, crs="2154")
df_tiny['area_name'] = gdf_fake["geometry"].apply(get_point_in_area, args=(zones,))

In [26]:
# Resample the TINY output according to the per-area plan in df_new_samples.
# NOTE(review): `new_df_sample` is a project helper not shown here - confirm its contract.
df_new = new_df_sample(df_new_samples, df_tiny)
df_new.head(n=5)

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Day,Hour,area_name
0,571300.600098,6280304.0,41,9,10,270,12,TOULOUSE - VION
1,571944.633299,6280653.0,57,1,1,3,12,TOULOUSE - VION
2,571932.185942,6282404.0,59,6,5,151,18,TOULOUSE - VION
3,572989.57683,6278548.0,57,5,1,7,19,TOULOUSE - VION
4,572258.189085,6275734.0,71,7,5,143,19,TOULOUSE - VION


In [27]:
# Persist the resampled TINY data (with a header row, without the index column).
df_new.to_csv(
    path_or_buf="resample_tiny.csv",
    index=False,
    header=True,
)

In [28]:
# Get the number of iterations needed to reach the desired sample quantity
# from the TINY output. Note that this rebinds df_new; the resampled TINY frame
# has already been written to disk by the previous cell.
# NOTE(review): `get_minimal_sampling` is a project helper not shown here - confirm its contract.
tiny_minimal = get_minimal_sampling(df_new_samples, df_tiny)
df_new, cpt_samples = tiny_minimal

In [29]:
# Display the counter returned by get_minimal_sampling for the TINY samples.
cpt_samples

112304

In [30]:
# Preview the first five rows returned by get_minimal_sampling for TINY.
df_new.head(n=5)

Unnamed: 0,Coord X,Coord Y,Duration,Incident,Month,Day,Hour,area_name
0,572548.072144,6281871.0,89,8,1,0,20,TOULOUSE - VION
1,575538.127135,6279145.0,80,1,6,178,22,TOULOUSE - LOUGNON
2,554554.674245,6288340.0,66,1,6,159,23,GRENADE
3,575679.7,6260031.0,128,2,5,125,22,AUTERIVE
4,561728.896463,6261589.0,121,1,4,86,15,MURET - MASSAT
