## 방문지 종류, 환자 일련번호 이용해 나머지 정보 추출

- `data\extracted\merged_route_check_03_02.csv`에 있는 정보 이용, 환자 일련번호 이용해 감염경로 알아내고, 방문지 유형과 동 정보 이용해 방문지 위/경도 정보 추출
- 추출한 데이터는 `data\extracted\merged_route_check_04~.csv`에 저장

In [3]:
import sys
import os

from os import listdir
from os.path import isfile, join
from pathlib import Path

root = Path(os.getcwd()).parent.parent
sys.path.append(str(root))

In [23]:
import pandas as pd
import numpy as np
import re

from src.preprocess.api import GeoCoder
from src.preprocess.parser import Parser

#### 환자 일련번호 이용해 감염경로 추출

In [34]:
status_file_path = join(root, 'data', 'extracted', 'corona_status.csv')
status_data = pd.read_csv(status_file_path)

merged_file_path = join(root, 'data', 'extracted', 'merged_route_check_03_02.csv')
merged_data = pd.read_csv(merged_file_path)

names_path = join(root, 'data', 'raw', 'names.csv')
names = pd.read_csv(names_path)

In [35]:
indices_to_del = []

for index, row in merged_data.iterrows():
    region = row['from']
    patient_id = row['patient_id']
    
    # 해당 구청인 경우
    if re.search(r'^\d+$', patient_id) is not None:
        number = int(patient_id)
        region = names.loc[names['english'] == region]['korean'].tolist()
        if len(region) == 0:
            indices_to_del.append(index)
            continue
        else: region = region[0]
        
        # 구마다 구하면
        region_df = status_data.loc[status_data['거주지'].isin([region, region + '구'])]
        region_df.index = np.arange(1, len(region_df) + 1)
        
        if number > len(region_df.index):
            indices_to_del.append(index)
            continue
        
        # 접촉력으로
        reason = region_df.loc[number, '접촉력']
        merged_data.loc[index, 'reason'] = reason
        
    # 타시도 확진자인 경우
    else:
        region = re.search(r'[가-힣]*', patient_id).group()
        
        # 있는지 검사
        if len(names.loc[names['korean'] == region].index) == 0:
            indices_to_del.append(index)
            continue
        
        # 구마다 구하면
        region_df = status_data.loc[status_data['거주지'].isin([region, region + '구'])]
        region_df.index = np.arange(1, len(region_df) + 1)
        
        if number > len(region_df.index):
            indices_to_del.append(index)
            continue
        
        reason = region_df.loc[number, '접촉력']
        merged_data.loc[index, 'reason'] = reason
        
for index in indices_to_del:
    merged_data = merged_data.drop(index=index)
    
merged_data.reset_index()
merged_data

Unnamed: 0,confirmed_date,date,reason,route,from,location_address,location_region,location_type,patient_id,region,local_id,personal_info,address
0,,2020-08-22,강남구 역삼동 모임,,dongdaemoon,,휘경1동,의료기관,107,휘경1동,,,
1,,2020-08-22,강남구 역삼동 모임,,dongdaemoon,,휘경1동,약국,107,휘경1동,,,
2,,2020-08-22,강남구 역삼동 모임,,dongdaemoon,,휘경1동,마트,107,휘경1동,,,
3,,2020-08-22,강남구 사무실 관련,,dongdaemoon,,용신동,생활용품판매점,104,용신동,,,
4,,2020-08-22,강남구 사무실 관련,,dongdaemoon,,장안2동,이동통신기기판매점,103,장안2동,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,,2020-08-12,기타 확진자 접촉,,jungnang,지하철청담역–태릉입구역-봉화산역,,지하철,75,,,,
1563,,2020-08-12,기타 확진자 접촉,,jungnang,지하철청담역–태릉입구역-봉화산역,,지하철,75,,,,
1564,,2020-08-12,기타 확진자 접촉,,jungnang,버스(2234번)봉화산역-신내8단지아파트,,버스,75,,,,
1565,,2020-08-12,기타 확진자 접촉,,jungnang,버스(2234번)봉화산역-신내8단지아파트,,버스,75,,,,


필요없는 columns 제거

In [37]:
new_columns = ['date', 'reason', 'from', 'location_address', 'location_type', 'patient_id', 'region']
merged_data = merged_data[new_columns]

중간 저장

In [38]:
path = join(root, 'data', 'extracted', 'merged_route_check_04_01.csv')
merged_data.to_csv(path, encoding='utf-8-sig', index=False)

#### 대중교통으로 이동한 row 각 방문지별로 세분화
- `merged_route_check_04_01.csv` 불러와서 작업 후 `merged_route_check_04_02.csv`에 저장 
- `merged_route_check_04_02.csv`에서 수작업으로 세분화된 방문지 location_type 조정 후 저장

중간 불러오기

In [5]:
path = join(root, 'data', 'extracted', 'merged_route_check_04_01.csv')
merged_route = pd.read_csv(path)

In [6]:
merged_route['added'] = ''

In [7]:
new_rows = []
indicies_to_del = []

for index, row in merged_route.iterrows():
    if not isinstance(row['location_address'], str): continue
        
    if '→' in row['location_address']:
        routes = row['location_address'].split('→')
        
        new_row = row
        for route in routes:
            new_row['location_address'] = route
            new_row['added'] = 'True'
            new_rows.append(new_row)
        
        indicies_to_del.append(index)
        

for new_row in new_rows:
    merged_route = merged_route.append(new_row, ignore_index=True)

merged_route.drop(index=indicies_to_del)
merged_route = merged_route.reset_index(drop=True)

merged_route

Unnamed: 0,date,reason,from,location_address,location_type,patient_id,region,added
0,2020-08-22,강남구 역삼동 모임,dongdaemoon,,의료기관,107,휘경1동,
1,2020-08-22,강남구 역삼동 모임,dongdaemoon,,약국,107,휘경1동,
2,2020-08-22,강남구 역삼동 모임,dongdaemoon,,마트,107,휘경1동,
3,2020-08-22,강남구 사무실 관련,dongdaemoon,,생활용품판매점,104,용신동,
4,2020-08-22,강남구 사무실 관련,dongdaemoon,,이동통신기기판매점,103,장안2동,
...,...,...,...,...,...,...,...,...
1405,2020-08-21,동대문구 관련,dongdaemoon,청량리역,지하철,119,제기동,True
1406,2020-08-21,동대문구 관련,dongdaemoon,영주역,기차,119,제기동,True
1407,2020-08-21,동대문구 관련,dongdaemoon,영주역,기차,119,제기동,True
1408,2020-08-30,해외 접촉 추정,dongdaemoon,보건소,자전거,118,전농2동,True


중간 저장

In [8]:
path = join(root, 'data', 'extracted', 'merged_route_check_04_02.csv')
merged_route.to_csv(path, encoding='utf-8-sig', index=False)

#### API 이용해서 위경도 추출
- `merged_route_check_04_02.csv` 불러와서 작업 후 `merged_route_check_04_03.csv`에 저장

중간 불러오기

In [40]:
path = join(root, 'data', 'extracted', 'merged_route_check_04_02.csv')
merged_data = pd.read_csv(path)

위도/경도/type columns 추가

In [41]:
merged_data = merged_data.reindex(columns=merged_data.columns.tolist()+['lat', 'lng', 'type'])
merged_data.dtypes

date                 object
reason               object
from                 object
location_address     object
location_type        object
patient_id           object
region               object
lat                 float64
lng                 float64
type                float64
dtype: object

In [42]:
columns_to_change = ['location_address', 'region', 'from', 'location_type']
for column in columns_to_change:
    merged_data[column] = merged_data[column].astype('string')

merged_data.dtypes

date                 object
reason               object
from                 string
location_address     string
location_type        string
patient_id           object
region               string
lat                 float64
lng                 float64
type                float64
dtype: object

api 호출

In [47]:
geo_api = GeoCoder(key='AIzaSyCeRYD2V3a2R6AO_OcHp7Ar0b2bzhE8NUM')

for index, row in merged_data.iterrows():
    if not pd.isnull(row['lat']): continue
    
    if not pd.isnull(row['region']):
        region = row['region']
    else:
        region = row['from']
        
    if not pd.isnull(row['location_address']):
        keyword = '%s %s' % (region, row['location_address'])
    else:
        keyword = '%s %s' % (region, row['location_type'])
        
    geo_information = geo_api.get_information(keyword)
    
    if geo_information.empty: continue

    merged_data.loc[index, 'lat'] = geo_information.loc[0, 'lat']
    merged_data.loc[index, 'lng'] = geo_information.loc[0, 'lng']
    merged_data.loc[index, 'type'] = geo_information.loc[0, 'type']
    
merged_data

Unnamed: 0,date,reason,from,location_address,location_type,patient_id,region,lat,lng,type
0,2020-08-22,강남구 역삼동 모임,dongdaemoon,,의료기관,107,휘경1동,37.587895,127.065321,health
1,2020-08-22,강남구 역삼동 모임,dongdaemoon,,약국,107,휘경1동,37.590728,127.062461,health
2,2020-08-22,강남구 역삼동 모임,dongdaemoon,,마트,107,휘경1동,37.594651,127.066109,food
3,2020-08-22,강남구 사무실 관련,dongdaemoon,,생활용품판매점,104,용신동,37.577396,127.031297,home_goods_store
4,2020-08-22,강남구 사무실 관련,dongdaemoon,,이동통신기기판매점,103,장안2동,37.586383,127.043078,point_of_interest
...,...,...,...,...,...,...,...,...,...,...
1582,2020-08-25,타시도 확진자 접촉(추정),dongdaemoon,종로3가),지하철,115,휘경1동,37.570480,126.989720,street_address
1583,2020-08-25,타시도 확진자 접촉(추정),dongdaemoon,종로3가),지하철,115,휘경1동,37.570480,126.989720,street_address
1584,2020-08-25,타시도 확진자 접촉(추정),dongdaemoon,종로3가),지하철,115,휘경1동,37.570480,126.989720,street_address
1585,2020-08-25,타시도 확진자 접촉(추정),dongdaemoon,종로3가),지하철,115,휘경1동,37.570480,126.989720,street_address


In [49]:
indicies_to_del = []

for index, row in merged_data.iterrows():
    if pd.isnull(row['lat']):
        indicies_to_del.append(index)
        
for index in indicies_to_del:
    merged_data = merged_data.drop(index=index)

In [52]:
path = join(root, 'data', 'extracted', 'merged_route_check_04_03.csv')
merged_data.to_csv(path, encoding='utf-8-sig', index=False)