In [2]:
from pyproj import CRS, Transformer
import pandas as pd
import numpy as np
import csv

Set up CRS transformer.

In [3]:
# EPSG:5179 is "Korea 2000 / Unified CS", a projected CRS with metre axes
# (see the repr displayed below).
crs = CRS.from_epsg(5179)

The projected CRS: coordinates are in meters (Northing and Easting).

In [4]:
crs

<Projected CRS: EPSG:5179>
Name: Korea 2000 / Unified CS
Axis Info [cartesian]:
- X[north]: Northing (metre)
- Y[east]: Easting (metre)
Area of Use:
- name: Republic of Korea (South Korea) - onshore and offshore.
- bounds: (122.71, 28.6, 134.28, 40.27)
Coordinate Operation:
- name: Korea Unified Belt
- method: Transverse Mercator
Datum: Geocentric datum of Korea
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

The corresponding geographic CRS: coordinates are latitude and longitude in degrees.

In [5]:
crs.geodetic_crs

<Geographic 2D CRS: EPSG:4737>
Name: Korea 2000
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: Republic of Korea (South Korea) - onshore and offshore.
- bounds: (122.71, 28.6, 134.28, 40.27)
Datum: Geocentric datum of Korea
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

In [6]:
# Forward transformer: geographic CRS (degrees) -> projected EPSG:5179 (metres).
# always_xy is not set, so axis order presumably follows each CRS's declared
# order: (lat, lon) in, (northing, easting) out — matches how it is called below.
proj = Transformer.from_crs(crs.geodetic_crs, crs)

In [7]:
# Inverse transformer: projected EPSG:5179 (metres) -> geographic CRS (degrees).
# Same axis-order convention as `proj`, reversed: (northing, easting) in,
# (lat, lon) out.
proj_inv = Transformer.from_crs(crs, crs.geodetic_crs)

Import patient route data and test the transformation.

In [26]:
# Patient route records; only the latitude/longitude columns are used here,
# as known-good input for the round-trip check.
routes = pd.read_csv("../Data/PatientRoute.csv")

In [31]:
# Despite their names, trans_lats/trans_longs hold the projected coordinates
# returned by `proj` (metres in EPSG:5179), not degrees.
trans_lats, trans_longs = proj.transform(routes.latitude, routes.longitude)

In [32]:
# Round trip: convert the projected values back to degrees so they can be
# compared with routes.latitude / routes.longitude.
lats, longs = proj_inv.transform(trans_lats, trans_longs)

In [35]:
lats

array([37.6152464, 37.5672412, 37.5925601, ..., 35.3369444, 35.3357574,
       35.3369444])

In [36]:
longs

array([126.7156325, 127.0056589, 127.0170483, ..., 129.0263889,
       129.0250031, 129.0263889])

In [34]:
routes

Unnamed: 0,patient_id,global_num,date,province,city,type,latitude,longitude
0,1000000001,2.0,1/22/2020,Gyeonggi-do,Gimpo-si,airport,37.615246,126.715632
1,1000000001,2.0,1/24/2020,Seoul,Jung-gu,hospital,37.567241,127.005659
2,1000000002,5.0,1/25/2020,Seoul,Seongbuk-gu,etc,37.592560,127.017048
3,1000000002,5.0,1/26/2020,Seoul,Seongbuk-gu,store,37.591810,127.016822
4,1000000002,5.0,1/26/2020,Seoul,Seongdong-gu,public_transportation,37.563992,127.029534
...,...,...,...,...,...,...,...,...
8087,6100000090,,3/24/2020,Seoul,Gangseo-gu,airport,37.558655,126.794474
8088,6100000090,,3/24/2020,Busan,Gangseo-gu,airport,35.173220,128.946459
8089,6100000090,,3/25/2020,Gyeongsangnam-do,Yangsan-si,store,35.336944,129.026389
8090,6100000090,,3/25/2020,Gyeongsangnam-do,Yangsan-si,hospital,35.335757,129.025003


The test results look correct: the round trip reproduces the original latitude and longitude values.

***

## Back transform the cleaned data.

In [8]:
# Cleaned data set. Its latitude/longitude columns are standardized values,
# not degrees; they are de-standardized and back-transformed below.
orig = pd.read_csv("../Data/cleaned_data.csv")

Need to reverse the standardization, then backtransform into latitude and longitude.

In [9]:
# Means and standard deviations used to reverse the standardization:
# row 0 is applied to the latitude column and row 1 to the longitude column.
stan_stats = pd.read_csv("../Data/standardization_stats.csv")

In [10]:
stan_stats

Unnamed: 0,means,stds
0,1862904.0,100990.921761
1,1012072.0,80816.765932


In [69]:
# Undo the standardization (value * std + mean) with row 0 of stan_stats.
# At this point `lat` is the de-standardized value fed to proj_inv, not degrees.
lat = orig.latitude * stan_stats.stds[0] + stan_stats.means[0]

In [70]:
# Undo the standardization (value * std + mean) with row 1 of stan_stats.
# At this point `long` is the de-standardized value fed to proj_inv, not degrees.
long = orig.longitude * stan_stats.stds[1] + stan_stats.means[1]

In [71]:
# Rebinds lat/long to the output of proj_inv (degrees). Re-running this cell
# on its own would feed the already-converted values through proj_inv again.
lat, long = proj_inv.transform(lat, long)

In [72]:
# NOTE: overwrites orig's coordinate columns in place. From here on `orig`
# holds degrees rather than standardized values, so it must be reloaded from
# cleaned_data.csv before being de-standardized again.
orig[['latitude', 'longitude']] = pd.DataFrame([lat, long]).T

In [73]:
orig

Unnamed: 0,latitude,longitude,sex,age,state
0,37.615246,126.715632,male,50s,0
1,37.567241,127.005659,male,50s,0
2,37.592560,127.017048,male,30s,0
3,37.591810,127.016822,male,30s,0
4,37.563992,127.029534,male,30s,0
...,...,...,...,...,...
6768,37.558655,126.794474,male,40s,0
6769,35.173220,128.946459,male,40s,0
6770,35.336944,129.026389,male,40s,0
6771,35.335757,129.025003,male,40s,0


In [11]:
def backtransform(transformed_data, stan_stats, geo_transformer):
    """Undo the standardization of the coordinate columns and convert them.

    Parameters
    ----------
    transformed_data : pandas.DataFrame
        Frame with standardized ``latitude`` and ``longitude`` columns. It is
        not modified; all other columns are carried over unchanged.
    stan_stats : pandas.DataFrame
        Frame with ``means`` and ``stds`` columns. The first row is applied to
        ``latitude`` and the second row to ``longitude``.
    geo_transformer : object
        Anything exposing ``transform(a, b)`` that returns the two converted
        coordinate arrays (in this notebook, a ``pyproj.Transformer``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``transformed_data`` (same index and columns) whose
        ``latitude`` and ``longitude`` columns hold the values returned by
        ``geo_transformer``.
    """
    # Read the statistics by position so the result does not depend on how
    # stan_stats happens to be indexed.
    means = stan_stats["means"].to_numpy()
    stds = stan_stats["stds"].to_numpy()

    # Reverse the standardization: value = standardized * std + mean.
    first = transformed_data["latitude"].to_numpy() * stds[0] + means[0]
    second = transformed_data["longitude"].to_numpy() * stds[1] + means[1]
    lat, long = geo_transformer.transform(first, second)

    # Work on a copy so the caller's frame keeps its standardized values, and
    # assign plain arrays so rows are matched by position rather than by index
    # label (a label-aligned assignment yields NaN for a non-default index).
    restored = transformed_data.copy()
    restored["latitude"] = np.asarray(lat)
    restored["longitude"] = np.asarray(long)
    return restored

In [76]:
# Re-read the standardized data instead of reusing `orig`: the exploratory
# cells above overwrite orig's latitude/longitude with degrees, and
# back-transforming those a second time would produce meaningless coordinates.
orig_data = backtransform(pd.read_csv("../Data/cleaned_data.csv"), stan_stats, proj_inv)

In [77]:
# index=False leaves the row index out of the file.
# NOTE(review): the "Unnamed: 0" column seen in the cleaned data is an ordinary
# column and is still written — confirm it is wanted in the output.
orig_data.to_csv("../Data/original_unstandardized.csv", index=False)

***

Backtransform a synthetic data set.

In [12]:
# Reload a fresh, still-standardized copy of the cleaned data; the earlier
# `orig` had its coordinate columns overwritten with degrees.
orig = pd.read_csv("../Data/cleaned_data.csv")

In [None]:
# TODO: "../Data/" is a directory, not a CSV file — append the synthetic data
# set's file name before running this cell.
synth = pd.read_csv("../Data/")