# 데이터 전처리


## 1) 좌표 변환

KATEC => GEO 

* KT 유동인구 데이터는 KATEC 좌표계를 사용하기에 일반 좌표계로 변환이 필요하다.
* 좌표 변환에 사용한 GeoConverter.py는 https://github.com/wan2land/python-geo-converter 의 코드를 참조했다.

In [4]:
# 필요한 라이브러리 호출
import pandas as pd
import GeoConverter

In [3]:
# Load the Cheonan floating-population CSV exports; the file names indicate
# one March-April window per year (2019 and 2020).
df_2019 = pd.read_csv(filepath_or_buffer="data/CHEONAN_201903_201904.csv")
df_2020 = pd.read_csv(filepath_or_buffer="data/CHEONAN_202003_202004.csv")

In [7]:
def convert_geo(df, x_col=1, y_col=2):
    """Convert every row's KATEC coordinate pair to the GEO coordinate system.

    Args:
        df: DataFrame holding one coordinate pair per row.
        x_col: Positional index of the column passed as the first
            ``GeoPoint`` argument. Defaults to 1.
        y_col: Positional index of the column passed as the second
            ``GeoPoint`` argument. Defaults to 2.

    Returns:
        A list with one ``[getY(), getX()]`` pair per input row, in row
        order. This notebook later labels the pair as latitude, longitude.
    """
    # Nothing to convert; also avoids a positional lookup on a frame that
    # has no columns at all.
    if len(df) == 0:
        return []

    # Extract each coordinate column once. Indexing ``df.iloc[i].iloc[j]``
    # per cell materialises a throwaway row Series on every iteration, which
    # makes the conversion impractically slow on large frames.
    xs = df.iloc[:, x_col].tolist()
    ys = df.iloc[:, y_col].tolist()

    location = []
    for x, y in zip(xs, ys):
        pt = GeoConverter.GeoPoint(x, y)
        output = GeoConverter.convert(GeoConverter.KATEC, GeoConverter.GEO, pt)
        location.append([output.getY(), output.getX()])
    return location

In [8]:
# Run the KATEC -> GEO conversion on each year's frame. The two calls are kept
# as separate statements so that an interrupted second call still leaves the
# first result bound.
location_2019 = convert_geo(df=df_2019)
location_2020 = convert_geo(df=df_2020)

KeyboardInterrupt: 

In [44]:
# 2019: wrap the converted pairs in a two-column frame ("위도" = latitude,
# "경도" = longitude), attach them to the source rows by index, and save.
final_df_2019 = pd.DataFrame(data=location_2019, columns=["위도", "경도"])
real_final_2019 = df_2019.merge(
    final_df_2019,
    how="outer",
    left_index=True,
    right_index=True,
)
real_final_2019.to_csv(path_or_buf="final_dataset_2019.csv", index=False)

# 2020: same steps for the second year's frame.
final_df_2020 = pd.DataFrame(data=location_2020, columns=["위도", "경도"])
real_final_2020 = df_2020.merge(
    final_df_2020,
    how="outer",
    left_index=True,
    right_index=True,
)
real_final_2020.to_csv(path_or_buf="final_dataset_2020.csv", index=False)

[[36.796231819970856, 127.09576965100713]]

## 2) 시간대 조절

* 천안시 유동인구 데이터는 2019년과 2020년의 03월부터 04월까지의 데이터이다.
* PatientInfo.csv 데이터는 특정 날짜에 발생한 확진자의 정보를 가지고 있는 데이터이다.

|patient_id|global_num|sex|birth_year|age|country|province|city|disease|infection_case|infection_order|infected_by|contact_number|symptom_onset_date|confirmed_date|released_date|deceased_date|state|
|----------|----------|---|----------|---|-------|--------|----|-------|--------------|---------------|-----------|--------------|------------------|--------------|-------------|-------------|-----|
|1000000001|2|male|1964|50s|Korea|Seoul|Gangseo-gu||overseas inflow|1||75|2020-01-22|2020-01-23|2020-02-05||released|

* Weather.csv 데이터는 특정 날짜의 날씨 정보를 가지고 있는 데이터이다.

|code|province|date|avg_temp|min_temp|max_temp|precipitation|max_wind_speed|most_wind_direction|avg_relative_humidity|
|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|
|10000|Seoul|2016-01-01|1.2|-3.3|4|0|3.5|90|73|

**PatientInfo.csv와 Weather.csv에서 city를 천안시로 한정하고 confirmed_date를 03월에서 04월로 한정해 데이터셋을 만든다.**

In [None]:
# Load the confirmed-patient records and the daily weather records.
patientInfo = pd.read_csv(filepath_or_buffer="data/patientInfo.csv")
weather = pd.read_csv(filepath_or_buffer="data/Weather.csv")

In [None]:
# Restrict both datasets to the study region and date windows.
# Dates are compared as "YYYY-MM-DD" strings, which order chronologically;
# Series.between is inclusive on both ends, matching the >= / <= bounds.

# Patients: city is Cheonan-si, confirmed between 2020-03-01 and 2020-04-30.
patientInfo = patientInfo[patientInfo['city'] == "Cheonan-si"]
patientInfo = patientInfo[patientInfo['confirmed_date'].between("2020-03-01", "2020-04-30")]

# Weather: the table is keyed by province (it has no city column), so select
# Chungcheongnam-do and keep March-April of both 2019 and 2020.
weather = weather[weather['province'] == "Chungcheongnam-do"]
in_2020_window = weather['date'].between("2020-03-01", "2020-04-30")
in_2019_window = weather['date'].between("2019-03-01", "2019-04-30")
# Element-wise | is required here; Python's `or`/`and` raise ValueError on a
# boolean Series because its truth value is ambiguous.
weather = weather[in_2020_window | in_2019_window]

In [None]:
# Persist the filtered frames.
# NOTE(review): these are the same paths the frames were loaded from, so the
# unfiltered source files are overwritten — confirm that is intended.
patientInfo.to_csv("data/patientInfo.csv")
weather.to_csv("data/Weather.csv")

## 3) 요일 변동

In [None]:
# 1) 요일 변경
import datetime
from tqdm import tqdm_notebook

def print_whichday(year, month, day):
    """Return the Korean weekday name for the given calendar date.

    Despite its name, this function prints nothing; it returns the name.

    Args:
        year: Calendar year as an int.
        month: Month number, 1-12.
        day: Day of the month.

    Returns:
        One of '월요일' (Monday) through '일요일' (Sunday).

    Raises:
        ValueError: If the arguments do not form a valid date
            (raised by ``datetime.date``).
    """
    # Ordered to line up with date.weekday(), where Monday is 0 and Sunday is 6.
    weekday_names = ('월요일', '화요일', '수요일', '목요일', '금요일', '토요일', '일요일')
    return weekday_names[datetime.date(year, month, day).weekday()]

def _weekday_names(ymd_values):
    """Return the weekday name for each YYYYMMDD value, in input order."""
    names = []
    for ymd in ymd_values:
        text = str(ymd)
        # Split YYYYMMDD into its year / month / day parts.
        names.append(print_whichday(int(text[:4]), int(text[4:6]), int(text[6:])))
    return names


final_list_2019 = _weekday_names(df_2019['etl_ymd'])
final_list_2020 = _weekday_names(df_2020['etl_ymd'])

# Attach the weekday names before saving; previously the lists were computed
# but never added, so the "*_with_day.csv" files held no weekday column.
# assign() returns a new frame, leaving df_2019 / df_2020 themselves unchanged.
# NOTE(review): the column name "요일" (weekday) is a reviewer choice — rename
# if downstream code expects a different header.
# NOTE(review): the output paths are machine-specific absolute Windows paths.
df_2019.assign(**{"요일": final_list_2019}).to_csv("D:\\2020-2\\데이터사이언스및시각화\\project\\data\\2019_with_day.csv")
df_2020.assign(**{"요일": final_list_2020}).to_csv("D:\\2020-2\\데이터사이언스및시각화\\project\\data\\2020_with_day.csv")