## [github](https://www.kaggle.com/kimjihoo/coronavirusdataset)에서 가져온 데이터 전처리

In [1]:
import sys
import os
from pathlib import Path

# The project root is two directory levels above the current working
# directory; append it to the module search path.
working_dir = Path.cwd()
root = working_dir.parent.parent
sys.path.append(os.fspath(root))

In [2]:
import pandas as pd
import re

### PatientRoute.csv

In [3]:
# Load the raw patient route records stored under data/raw/github.
route_csv = root / 'data' / 'raw' / 'github' / 'PatientRoute.csv'
patient_route = pd.read_csv(route_csv)
patient_route.head()

Unnamed: 0,patient_id,global_num,date,province,city,type,latitude,longitude
0,1000000001,2.0,2020-01-22,Gyeonggi-do,Gimpo-si,airport,37.615246,126.715632
1,1000000001,2.0,2020-01-24,Seoul,Jung-gu,hospital,37.567241,127.005659
2,1000000002,5.0,2020-01-25,Seoul,Seongbuk-gu,etc,37.59256,127.017048
3,1000000002,5.0,2020-01-26,Seoul,Seongbuk-gu,store,37.59181,127.016822
4,1000000002,5.0,2020-01-26,Seoul,Seongdong-gu,public_transportation,37.563992,127.029534


### PatientInfo.csv

In [4]:
# Load the raw per-patient records stored under data/raw/github.
info_csv = root / 'data' / 'raw' / 'github' / 'PatientInfo.csv'
patient_info = pd.read_csv(info_csv)
patient_info.head()

Unnamed: 0,patient_id,global_num,sex,birth_year,age,country,province,city,disease,infection_case,infection_order,infected_by,contact_number,symptom_onset_date,confirmed_date,released_date,deceased_date,state
0,1000000001,2.0,male,1964.0,50s,Korea,Seoul,Gangseo-gu,,overseas inflow,1.0,,75.0,2020-01-22,2020-01-23,2020-02-05,,released
1,1000000002,5.0,male,1987.0,30s,Korea,Seoul,Jungnang-gu,,overseas inflow,1.0,,31.0,,2020-01-30,2020-03-02,,released
2,1000000003,6.0,male,1964.0,50s,Korea,Seoul,Jongno-gu,,contact with patient,2.0,2002000000.0,17.0,,2020-01-30,2020-02-19,,released
3,1000000004,7.0,male,1991.0,20s,Korea,Seoul,Mapo-gu,,overseas inflow,1.0,,9.0,2020-01-26,2020-01-30,2020-02-15,,released
4,1000000005,9.0,female,1992.0,20s,Korea,Seoul,Seongbuk-gu,,contact with patient,2.0,1000000000.0,2.0,,2020-01-31,2020-02-24,,released


In [5]:
# Keep only the identifier columns and the infection case.
kept_columns = ['patient_id', 'global_num', 'infection_case']
patient_info = patient_info[kept_columns]
patient_info.head()

Unnamed: 0,patient_id,global_num,infection_case
0,1000000001,2.0,overseas inflow
1,1000000002,5.0,overseas inflow
2,1000000003,6.0,contact with patient
3,1000000004,7.0,overseas inflow
4,1000000005,9.0,contact with patient


### MergedRoute.csv

In [6]:
# Join patient info with the route records. No key is given, so pandas
# joins on the columns the two frames have in common (inner join by default).
merged_route = patient_info.merge(patient_route)
merged_route.head()

Unnamed: 0,patient_id,global_num,infection_case,date,province,city,type,latitude,longitude
0,1000000001,2.0,overseas inflow,2020-01-22,Gyeonggi-do,Gimpo-si,airport,37.615246,126.715632
1,1000000001,2.0,overseas inflow,2020-01-24,Seoul,Jung-gu,hospital,37.567241,127.005659
2,1000000002,5.0,overseas inflow,2020-01-25,Seoul,Seongbuk-gu,etc,37.59256,127.017048
3,1000000002,5.0,overseas inflow,2020-01-26,Seoul,Seongbuk-gu,store,37.59181,127.016822
4,1000000002,5.0,overseas inflow,2020-01-26,Seoul,Seongdong-gu,public_transportation,37.563992,127.029534


In [7]:
# latitude, longitude -> grid cell indices
def to_grid(lat, lon, rows=256, cols=256):
    """Map a coordinate onto a grid laid over the Seoul bounding box.

    The box spans from (37.698098, 126.799791) to (37.472494, 127.142928).
    Row 0 lies on the start (highest) latitude and the row index grows as
    the latitude decreases; column 0 lies on the start (lowest) longitude
    and the column index grows as the longitude increases.

    Args:
        lat: Latitude in decimal degrees.
        lon: Longitude in decimal degrees.
        rows: Number of grid rows; indices range over 0 .. rows - 1.
        cols: Number of grid columns; indices range over 0 .. cols - 1.

    Returns:
        A ``(row, col)`` tuple of ints, or ``(-1, -1)`` when the coordinate
        lies outside the bounding box or is NaN.
    """
    seoul_start_lat = 37.698098
    seoul_start_lon = 126.799791
    seoul_end_lat = 37.472494
    seoul_end_lon = 127.142928

    # A chained comparison is False for NaN, so a missing coordinate is
    # reported as "outside" instead of reaching int() and raising.
    if not (seoul_end_lat <= lat <= seoul_start_lat):
        return -1, -1
    if not (seoul_start_lon <= lon <= seoul_end_lon):
        return -1, -1

    lat_distance = seoul_end_lat - seoul_start_lat
    lon_distance = seoul_end_lon - seoul_start_lon

    # Scale the offset from the start corner to the index range; int()
    # truncates the fractional part.
    row = (lat - seoul_start_lat) * (rows - 1) / lat_distance
    col = (lon - seoul_start_lon) * (cols - 1) / lon_distance

    return int(row), int(col)

# Compute the grid cell of every visited location in a single pass over the
# coordinate columns, then store the results as the 'row' / 'col' columns.
# Building the columns from lists avoids per-cell iloc writes and gives the
# columns an integer dtype instead of object.
grid_cells = [
    to_grid(lat, lon)
    for lat, lon in zip(merged_route['latitude'], merged_route['longitude'])
]
merged_route['row'] = [cell[0] for cell in grid_cells]
merged_route['col'] = [cell[1] for cell in grid_cells]

merged_route.head()

Unnamed: 0,patient_id,global_num,infection_case,date,province,city,type,latitude,longitude,row,col
0,1000000001,2.0,overseas inflow,2020-01-22,Gyeonggi-do,Gimpo-si,airport,37.615246,126.715632,-1,-1
1,1000000001,2.0,overseas inflow,2020-01-24,Seoul,Jung-gu,hospital,37.567241,127.005659,147,152
2,1000000002,5.0,overseas inflow,2020-01-25,Seoul,Seongbuk-gu,etc,37.59256,127.017048,119,161
3,1000000002,5.0,overseas inflow,2020-01-26,Seoul,Seongbuk-gu,store,37.59181,127.016822,120,161
4,1000000002,5.0,overseas inflow,2020-01-26,Seoul,Seongdong-gu,public_transportation,37.563992,127.029534,151,170


In [8]:
# Drop visits outside Seoul (marked with row == -1) and keep only the
# columns that are still needed.
inside_seoul = merged_route['row'] != -1
output_columns = [
    'patient_id', 'city', 'infection_case', 'date', 'type',
    'latitude', 'longitude', 'row', 'col',
]
merged_route = merged_route[inside_seoul][output_columns]
merged_route.head()

Unnamed: 0,patient_id,city,infection_case,date,type,latitude,longitude,row,col
1,1000000001,Jung-gu,overseas inflow,2020-01-24,hospital,37.567241,127.005659,147,152
2,1000000002,Seongbuk-gu,overseas inflow,2020-01-25,etc,37.59256,127.017048,119,161
3,1000000002,Seongbuk-gu,overseas inflow,2020-01-26,store,37.59181,127.016822,120,161
4,1000000002,Seongdong-gu,overseas inflow,2020-01-26,public_transportation,37.563992,127.029534,151,170
5,1000000002,Seongbuk-gu,overseas inflow,2020-01-26,public_transportation,37.59033,127.015221,121,160


In [9]:
# Normalize the city column: keep the text before '-gu' in lower case and
# drop every row whose city does not contain '-gu'.
gu_pattern = re.compile(r'.*(?=-gu)')


def _district_name(city):
    """Return the lower-cased text preceding '-gu', or None if there is none.

    Non-string values (e.g. NaN for a missing city) also yield None so that
    they are dropped instead of raising inside the regex search.
    """
    if not isinstance(city, str):
        return None
    match = gu_pattern.search(city)
    if match is None:
        return None
    return match.group().lower()


# assign() builds a new frame, so nothing is written into a frame derived
# from an earlier slice; rows without a district are removed in one step.
merged_route = merged_route.assign(city=merged_route['city'].map(_district_name))
merged_route = merged_route.dropna(subset=['city']).reset_index(drop=True)
merged_route

Unnamed: 0,patient_id,city,infection_case,date,type,latitude,longitude,row,col
0,1000000001,jung,overseas inflow,2020-01-24,hospital,37.567241,127.005659,147,152
1,1000000002,seongbuk,overseas inflow,2020-01-25,etc,37.592560,127.017048,119,161
2,1000000002,seongbuk,overseas inflow,2020-01-26,store,37.591810,127.016822,120,161
3,1000000002,seongdong,overseas inflow,2020-01-26,public_transportation,37.563992,127.029534,151,170
4,1000000002,seongbuk,overseas inflow,2020-01-26,public_transportation,37.590330,127.015221,121,160
...,...,...,...,...,...,...,...,...,...
2502,6004000023,yeongdeungpo,etc,2020-02-24,hospital,37.499847,126.911006,224,82
2503,6004000025,jung,etc,2020-02-21,etc,37.566535,126.977969,148,132
2504,6011000010,mapo,contact with patient,2020-02-19,store,37.551936,126.921668,165,90
2505,6023000011,dongjak,,2020-02-21,hospital,37.498990,126.937719,225,102


In [10]:
# Save the result as data/extracted/MergedRoute.csv.
path = str(root / 'data' / 'extracted' / 'MergedRoute.csv')
merged_route.to_csv(path, index=False, encoding='utf-8-sig')