1. 일단 데이터를 날짜별로 자른다.
2. 모든 피처를 다 만드는 경우
  1) 나이대 10개
  2) 성별 2개
  3) 감염 원인 11개
  4) 방문 요일 7개
     총 30개 배열

In [1]:
import pandas as pd
import scipy.stats as st
from datetime import datetime
from datetime import timedelta
import math
import numpy as np
from PIL import Image
from scipy import stats
import copy

### 1. 루트 생성

In [2]:
# Load the raw CSVs; only the patient attributes used as features are kept.
patient_route = pd.read_csv('covid19/PatientRoute.csv')
patient_info = pd.read_csv('covid19/PatientInfo.csv')[
    ["patient_id", "global_num", "sex", "age", "infection_case"]
]
club_route = pd.read_csv('club.csv')

In [3]:
# Join patient attributes onto their routes, then keep Seoul rows that have
# both sex and age filled in.
merge_route = pd.merge(patient_info, patient_route)
merge_route = merge_route[
    (merge_route["province"] == "Seoul")
    & merge_route['sex'].notna()
    & merge_route['age'].notna()
]
# Club routes need coordinates; the 2020-04-20 rows are excluded.
club_route = club_route.dropna(subset=['latitude', 'longitude'])
club_route = club_route[club_route["date"] != "2020-04-20"]

In [4]:
# Distinct, non-null infection cases in the order drop_duplicates(keep='last')
# leaves them.
causes = merge_route['infection_case'].drop_duplicates(keep='last').dropna().tolist()

# Every case outside these three is folded into "community infection";
# dict.fromkeys removes the resulting duplicates while keeping first-seen order.
causes_stay = ["contact with patient", "overseas inflow", "etc"]
causes = list(dict.fromkeys(
    name if name in causes_stay else "community infection"
    for name in causes
))
print(causes)

['community infection', 'etc', 'contact with patient', 'overseas inflow']


### 3. 클래스

In [5]:
class GridMap:
    """Convert latitude/longitude to cells of a 32x32 grid and back.

    The constants and formulas follow a two-standard-parallel conic
    projection (presumably Lambert conformal conic -- confirm against the
    original source of the formulas). Distances are expressed in grid cells
    of ``grid`` kilometres.
    """

    Re = 6371.00877  # map (Earth) radius, km
    grid = 1.28  # grid spacing, km
    slat1 = 30.0  # standard latitude 1 (degrees)
    slat2 = 60.0  # standard latitude 2 (degrees)
    olon = 127.0175205  # reference point longitude (degrees)
    olat = 37.523124  # reference point latitude (degrees)
    xo = 18.4 / grid  # reference point X coordinate, in cells
    yo = 21.6 / grid  # reference point Y coordinate, in cells
    row = 32
    column = 32
    # Bounding box accepted by isSeoul (start = north-west, end = south-east).
    seoul_start_lat = 37.698098
    seoul_start_lon = 126.799791
    seoul_end_lat = 37.34815
    seoul_end_lon = 127.23525

    def __init__(self):
        """Precompute the projection constants (sn, sf, ro) in radians/cells.

        The class-level degree values are shadowed by instance attributes
        holding the same quantities in radians.
        """
        self.PI = math.asin(1.0) * 2.0
        self.DEGRAD = self.PI / 180.0
        self.RADDEG = 180.0 / self.PI

        self.re = self.Re / self.grid
        self.slat1 = self.slat1 * self.DEGRAD
        self.slat2 = self.slat2 * self.DEGRAD
        self.olon = self.olon * self.DEGRAD
        self.olat = self.olat * self.DEGRAD

        self.sn = math.tan(self.PI * 0.25 + self.slat2 * 0.5) / \
                  math.tan(self.PI * 0.25 + self.slat1 * 0.5)
        self.sn = math.log(math.cos(self.slat1) /
                           math.cos(self.slat2)) / math.log(self.sn)
        self.sf = math.tan(self.PI * 0.25 + self.slat1 * 0.5)
        self.sf = math.pow(self.sf, self.sn) * math.cos(self.slat1) / self.sn
        self.ro = math.tan(self.PI * 0.25 + self.olat * 0.5)
        self.ro = self.re * self.sf / math.pow(self.ro, self.sn)

    def toGrid(self, lat, lon):
        """Return the integer grid cell ``(x, y)`` for a lat/lon in degrees.

        ``y`` is flipped with ``(int(y + 1.5) - 31) * -1`` so that larger
        latitudes give smaller ``y``. The result is NOT clamped: points far
        enough from the reference point yield negative or >31 indices.
        """
        lat = float(lat)
        lon = float(lon)
        ra = math.tan(self.PI * 0.25 + lat * self.DEGRAD * 0.5)
        ra = self.re * self.sf / pow(ra, self.sn)
        theta = lon * self.DEGRAD - self.olon
        # Wrap the longitude difference into [-pi, pi].
        if theta > self.PI:
            theta -= 2.0 * self.PI
        if theta < self.PI * -1:
            theta += 2.0 * self.PI
        theta *= self.sn
        x = (ra * math.sin(theta)) + self.xo
        y = (self.ro - ra * math.cos(theta)) + self.yo
        x = int(x + 1.5)
        y = (int(y + 1.5) - 31) * -1
        return x, y

    def toCoordinate(self, x, y):
        """Return ``(lat, lon)`` in degrees for grid coordinates ``(x, y)``.

        NOTE(review): this does not undo the y flip applied by toGrid, so it
        is not an exact inverse of toGrid's output -- confirm intended use.
        """
        x = x - 1
        y = y - 1
        xn = x - self.xo
        yn = self.ro - y + self.yo
        ra = math.sqrt(xn * xn + yn * yn)
        if self.sn < 0.0:
            ra = -ra
        alat = math.pow((self.re * self.sf / ra), (1.0 / self.sn))
        alat = 2.0 * math.atan(alat) - self.PI * 0.5
        if math.fabs(xn) <= 0.0:
            theta = 0.0
        else:
            if math.fabs(yn) <= 0.0:
                theta = self.PI * 0.5
                if xn < 0.0:
                    theta = -theta
            else:
                theta = math.atan2(xn, yn)
        alon = theta / self.sn + self.olon
        lat = alat * self.RADDEG
        lon = alon * self.RADDEG
        return lat, lon

    def isSeoul(self, lat, lon):
        """Return True when (lat, lon) lies inside the Seoul bounding box.

        Chained comparisons are used so that NaN coordinates are rejected
        (every comparison with NaN is False) instead of slipping through.
        """
        return (self.seoul_end_lat <= lat <= self.seoul_start_lat
                and self.seoul_start_lon <= lon <= self.seoul_end_lon)

    def generateMap(self, raw_data):
        """Count the rows of ``raw_data`` per grid cell.

        Args:
            raw_data: DataFrame with ``latitude`` and ``longitude`` columns.

        Returns:
            A ``(row, column)`` float array indexed ``[y, x]``. Rows whose
            coordinates are non-numeric, NaN, outside the Seoul bounding box,
            or that project outside the grid are skipped.
        """
        grid_array = np.zeros((self.row, self.column))
        for _, data in raw_data.iterrows():
            try:
                lat = float(data.latitude)
                lon = float(data.longitude)
            except (TypeError, ValueError):
                continue
            if not self.isSeoul(lat, lon):
                continue
            x, y = self.toGrid(lat, lon)
            # Explicit bounds check: a negative index would otherwise wrap
            # around to the opposite edge of the array.
            if 0 <= y < self.row and 0 <= x < self.column:
                grid_array[y, x] += 1
        return grid_array

In [6]:
class Feature:
    """Accumulate categorical route features into a per-date 32x32 tensor.

    ``counts`` gives the number of channels reserved for each feature, in the
    order age, sex, infection_case, type, date. The channels of all features
    are concatenated along the last axis of ``self.tensor``.
    """

    def __init__(self, names, counts, move_types, input_route, input_dates):
        """Store the configuration and allocate an empty tensor.

        Args:
            names: feature names (stored, not otherwise used here).
            counts: channel count per feature, same order as the categories
                written by put_all_categories.
            move_types: ordered list of visit types; the position of a type
                is its channel within the 'type' feature.
            input_route: unused; kept for call compatibility.
            input_dates: sequence of dates; one tensor frame per date.
        """
        self.names = names
        self.counts = counts
        self.move_types = move_types
        # One frame per date (not per route row, which only wasted memory).
        self.clear(len(input_dates))

    def clear(self, input_dates):
        """Reset the tensor to zeros with ``input_dates`` frames.

        Despite its name, ``input_dates`` is the NUMBER of frames.
        """
        all_count = sum(self.counts)
        # np.float was removed in NumPy 1.24; float64 is the same dtype.
        self.tensor = np.zeros((input_dates, 32, 32, all_count), dtype=np.float64)
        self.features = [np.zeros((32, 32)) for _ in range(all_count)]

    def age_category(self, ages):
        """Map strings such as '20s' to their decade index (0..10)."""
        return [int(age[:-1]) // 10 for age in ages]

    def sex_category(self, sexes):
        """Map 'male' to 0 and anything else to 1."""
        return [0 if sex == 'male' else 1 for sex in sexes]

    def infection_case_category(self, infection_cases):
        """Map each case to its index in the module-level ``causes`` list.

        Cases not present in ``causes`` count as "community infection".
        """
        categories = []
        for infection_case in infection_cases:
            if infection_case not in causes:
                infection_case = "community infection"
            categories.append(causes.index(infection_case))
        return categories

    def type_category(self, types):
        """Map each visit type to its index in ``move_types``.

        Raises:
            ValueError: if a type is not listed in ``move_types``.
        """
        return [self.move_types.index(purpose) for purpose in types]

    def day_category(self, days):
        """Map 'YYYY-MM-DD' strings to weekday numbers (Monday is 0)."""
        return [datetime.strptime(day, "%Y-%m-%d").weekday() for day in days]

    def put_all_categories(self, order, ages, sexes, infection_cases, types, days, lats, lons):
        """Add one count per feature for every route row of frame ``order``.

        A category of -1 means "do not record this feature for this row".
        The channel offset advances by the feature's width regardless, so a
        skipped feature never shifts the features that follow it.
        Rows that project outside the grid are skipped.
        """
        converter = GridMap()
        per_feature = (
            self.age_category(ages),
            self.sex_category(sexes),
            self.infection_case_category(infection_cases),
            self.type_category(types),
            self.day_category(days),
        )
        size_x, size_y = self.tensor.shape[1], self.tensor.shape[2]

        for i in range(len(ages)):
            x, y = converter.toGrid(lats[i], lons[i])
            # Negative indices would silently wrap and large ones would raise.
            if not (0 <= x < size_x and 0 <= y < size_y):
                continue
            # NOTE(review): indexed [x][y] here, whereas GridMap.generateMap
            # indexes [y, x]; orientation left unchanged -- confirm.
            cell = self.tensor[order][x][y]  # view into the tensor
            offset = 0
            for width, categories in zip(self.counts, per_feature):
                category = categories[i]
                if category != -1:
                    cell[offset + category] += 1
                offset += width

### 4. 메서드, date 변수

In [7]:
def generate_images(feature_class, input_route, input_dates):
    """Fill ``feature_class.tensor`` with one frame per date and return it.

    The feature object is cleared first; then, for each date, the matching
    rows of ``input_route`` are passed column by column to
    ``feature_class.put_all_categories``.
    """
    feature_class.clear(len(input_dates))
    # Order matches put_all_categories' positional parameters.
    columns = ('age', 'sex', 'infection_case', 'type', 'date', 'latitude', 'longitude')
    for order, day in enumerate(input_dates):
        day_routes = input_route.loc[input_route['date'] == day]
        feature_class.put_all_categories(
            order, *(day_routes[name].tolist() for name in columns))

    return feature_class.tensor

In [8]:
# Sorted list of the distinct dates present in the patient routes.
route_dates = sorted(merge_route['date'].drop_duplicates(keep='last').tolist())

In [9]:
# Sorted list of the distinct dates present in the club routes.
club_dates = sorted(club_route['date'].drop_duplicates(keep='last').tolist())
club_dates

['2020-05-01',
 '2020-05-02',
 '2020-05-03',
 '2020-05-04',
 '2020-05-05',
 '2020-05-06',
 '2020-05-07',
 '2020-05-08',
 '2020-05-09']

### 5. 피처에 값 대입
##### 방문목적지별 방문 개수
- hospital(970)
- etc(648)
- public transportation(259)
- store(217)
- restaurant(203)
- pharmacy(75)
- church(64)
- cafe(45)
- airport(28)
- pc_cafe(24)
- lodging(24)
- real_estate_agency(16)
- bank(11)
- school(11)
- bar(9)
- beauty_salon(6)
- post_office(5)
- bakery(6)
- gym(3)
- gas_station(3)
- karaoke(2)

##### 피처 병합 계획
- move types의 수를 줄인다.
  - store(217)로 분류될 수 있는 속성들은 store로 대체한다.
    - bakery(6), pharmacy(75) → store
  - restaurant로 분류될 수 있는 속성들은 restaurant(203)로 대체한다.
    - cafe(45), bar(9)
  - 실내 인구 밀집도가 높은 지역은 'high_population'으로 대체한다.
    - karaoke(2), gym(3), pc_cafe(24), church(64)
  - 통계적으로 다른 방문지보다 방문 횟수가 적은 곳은 'etc'로 대체한다.
    - gas_station(3), beauty_salon(6), school(11), real_estate_agency(16), airport(28), bank(11), post_office(5)
  - 남은 것
    - lodging(24), public transportation(259), etc(648), hospital(970), store

In [10]:
# Result collectors: one tensor per experiment, for patient and club routes.
images, club_images = [], []

### 5. 모든 피처 반영

모든 categorical values를 1개의 피처로 반영

In [11]:
# Baseline configuration: every feature at full resolution.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 4, 21, 7]
# The position of a visit type in this list is its channel within 'type'.
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [12]:
# Baseline tensors for the patient routes.
feature = Feature(names, counts, move_types, merge_route, route_dates)
image0 = generate_images(feature, merge_route, route_dates)
images.append(image0)

# Same configuration applied to the club routes.
feature = Feature(names, counts, move_types, club_route, club_dates)
club0 = generate_images(feature, club_route, club_dates)
club_images.append(club0)

### 5. 피처 변형1 -  요일(1)

- 월~금, 토~일

In [13]:
# Day-of-week variant 1: the 'date' feature is reduced to 2 channels.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 4, 21, 2]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [14]:
class FeatureDay1(Feature):
    """Feature variant whose day feature only separates weekend from weekday."""

    def day_category(self, days):
        """Return 0 for Saturday/Sunday and 1 for Monday-Friday."""
        return [
            0 if datetime.strptime(text, "%Y-%m-%d").weekday() >= 5 else 1
            for text in days
        ]

In [15]:
# Weekend/weekday variant on the patient routes.
feature = FeatureDay1(names, counts, move_types, merge_route, route_dates)
image1 = generate_images(feature, merge_route, route_dates)
images.append(image1)

# Same variant on the club routes.
feature = FeatureDay1(names, counts, move_types, club_route, club_dates)
club1 = generate_images(feature, club_route, club_dates)
club_images.append(club1)

### 5. 피처 변형1 - 요일(2)

- 요일 삭제

In [16]:
# Day-of-week variant 2: the 'date' feature gets 0 channels (removed).
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 4, 21, 0]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [17]:
class FeatureDay2(Feature):
    """Feature variant that drops the day-of-week feature."""

    def day_category(self, days):
        """Return -1 (not recorded) for every entry of ``days``."""
        return [-1 for _ in days]

In [18]:
# No-day-feature variant on the patient routes.
feature = FeatureDay2(names, counts, move_types, merge_route, route_dates)
image2 = generate_images(feature, merge_route, route_dates)
images.append(image2)

# Same variant on the club routes.
feature = FeatureDay2(names, counts, move_types, club_route, club_dates)
club2 = generate_images(feature, club_route, club_dates)
club_images.append(club2)

### 5. 피처 변형2 - 나이대(1)

- 0~1/2~3/4~5/6~7/8~10

In [19]:
# Age variant 1: the 'age' feature is reduced to 5 channels.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [5, 2, 4, 21, 7]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [20]:
class FeatureAge1(Feature):
    """Feature variant that pairs age decades into 5 groups.

    Decade groups: 0-1 / 2-3 / 4-5 / 6-7 / 8-10.
    """

    def age_category(self, ages):
        """Map strings such as '20s' to a group index 0..4.

        The method must be named ``age_category`` to override the base class;
        the grouping works on the decade index (age // 10), not the raw age.
        """
        categories = []
        for age in ages:
            decade = int(age[:-1]) // 10
            # Two decades per group; decades 8, 9 and 10 share the last group.
            categories.append(min(decade // 2, 4))
        return categories

In [21]:
# Five-age-group variant on the patient routes.
feature = FeatureAge1(names, counts, move_types, merge_route, route_dates)
image3 = generate_images(feature, merge_route, route_dates)
images.append(image3)

# Same variant on the club routes.
feature = FeatureAge1(names, counts, move_types, club_route, club_dates)
club3 = generate_images(feature, club_route, club_dates)
club_images.append(club3)

### 피처 변형2 - 나이대(2)

- 0/1~3/4~6/7~10

In [22]:
# Age variant 2: the 'age' feature is reduced to 4 channels.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [4, 2, 4, 21, 7]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [23]:
class FeatureAge2(Feature):
    """Feature variant that groups age decades into 4 groups.

    Decade groups: 0 / 1-3 / 4-6 / 7-10.
    """

    def age_category(self, ages):
        """Map strings such as '20s' to a group index 0..3.

        Ranges are tested on the decade index (age // 10); the earlier
        ``>= ... or < ...`` conditions were always true, which put every
        non-zero age into group 1.
        """
        categories = []
        for age in ages:
            decade = int(age[:-1]) // 10
            if decade == 0:
                category = 0
            elif decade < 4:
                category = 1
            elif decade < 7:
                category = 2
            else:
                category = 3
            categories.append(category)
        return categories

In [24]:
# Four-age-group variant on the patient routes.
feature = FeatureAge2(names, counts, move_types, merge_route, route_dates)
image4 = generate_images(feature, merge_route, route_dates)
images.append(image4)

# Same variant on the club routes.
feature = FeatureAge2(names, counts, move_types, club_route, club_dates)
club4 = generate_images(feature, club_route, club_dates)
club_images.append(club4)

### 피처 변형2 - 나이대(3)

- 나이 삭제

In [25]:
# Age variant 3: the 'age' feature gets 0 channels (removed).
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [0, 2, 4, 21, 7]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [26]:
class FeatureAge3(Feature):
    """Feature variant that drops the age feature."""

    def age_category(self, ages):
        """Return -1 (not recorded) for every entry of ``ages``."""
        return [-1 for _ in ages]

In [27]:
# No-age-feature variant on the patient routes.
feature = FeatureAge3(names, counts, move_types, merge_route, route_dates)
image5 = generate_images(feature, merge_route, route_dates)
images.append(image5)

# Same variant on the club routes.
feature = FeatureAge3(names, counts, move_types, club_route, club_dates)
club5 = generate_images(feature, club_route, club_dates)
club_images.append(club5)

### 피처 변형3 - 성별(1)

- 성별 삭제

In [28]:
# Sex variant: the 'sex' feature gets 0 channels (removed).
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 0, 4, 21, 7]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [29]:
class FeatureSex1(Feature):
    """Feature variant that drops the sex feature."""

    def sex_category(self, sexes):
        """Return -1 (not recorded) for every entry of ``sexes``."""
        return [-1 for _ in sexes]

In [30]:
# No-sex-feature variant on the patient routes.
feature = FeatureSex1(names, counts, move_types, merge_route, route_dates)
image6 = generate_images(feature, merge_route, route_dates)
images.append(image6)

# Same variant on the club routes.
feature = FeatureSex1(names, counts, move_types, club_route, club_dates)
club6 = generate_images(feature, club_route, club_dates)
club_images.append(club6)

### 피처 변형4 - 감염 원인(1)

- 해외/접촉+기타/집단
'community infection', 'etc', 'contact with patient', 'overseas inflow'

In [31]:
# Infection-case variant 1: the 'infection_case' feature is reduced to 3 channels.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 3, 21, 7]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [32]:
# Reduced category list; a case's position here is its channel index.
new_infection_case = ['community infection', 'contact with patient', 'overseas inflow']

In [33]:
class FeatureCase1(Feature):
    """Feature variant that merges 'etc' into 'contact with patient'."""

    def infection_case_category(self, infection_cases):
        """Return each case's index in the module-level ``new_infection_case``.

        Cases missing from ``causes`` count as "community infection".
        """
        def collapse(case):
            if case not in causes:
                return "community infection"
            return "contact with patient" if case == "etc" else case

        return [new_infection_case.index(collapse(case)) for case in infection_cases]

In [34]:
# 'etc' merged into contact: patient routes.
feature = FeatureCase1(names, counts, move_types, merge_route, route_dates)
image7 = generate_images(feature, merge_route, route_dates)
images.append(image7)

# Same variant on the club routes.
feature = FeatureCase1(names, counts, move_types, club_route, club_dates)
club7 = generate_images(feature, club_route, club_dates)
club_images.append(club7)

### 5. 피처 변형 4 - 감염 원인(2)

- 해외/접촉/집단+기타
'community infection', 'etc', 'contact with patient', 'overseas inflow'

In [35]:
# Infection-case variant 2: the 'infection_case' feature is reduced to 3 channels.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 3, 21, 7]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [36]:
# Reduced category list; a case's position here is its channel index.
new_infection_case = ['community infection', 'contact with patient', 'overseas inflow']

In [37]:
class FeatureCase2(Feature):
    """Feature variant that merges 'etc' into 'community infection'."""

    def infection_case_category(self, infection_cases):
        """Return each case's index in the module-level ``new_infection_case``.

        Cases missing from ``causes``, and the case "etc", both count as
        "community infection".
        """
        def collapse(case):
            if case not in causes or case == "etc":
                return "community infection"
            return case

        return [new_infection_case.index(collapse(case)) for case in infection_cases]

In [38]:
# 'etc' merged into community infection: patient routes.
feature = FeatureCase2(names, counts, move_types, merge_route, route_dates)
image8 = generate_images(feature, merge_route, route_dates)
images.append(image8)

# Same variant on the club routes.
feature = FeatureCase2(names, counts, move_types, club_route, club_dates)
club8 = generate_images(feature, club_route, club_dates)
club_images.append(club8)

### 피처 변형4 - 감염 원인(3)

- 해외 삭제
'community infection', 'etc', 'contact with patient', 'overseas inflow'

In [39]:
# Infection-case variant 3: the 'infection_case' feature is reduced to 3 channels.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 3, 21, 7]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [40]:
# Category list without 'overseas inflow'; position is the channel index.
new_infection_case = ['community infection', 'etc', 'contact with patient']

In [41]:
class FeatureCase3(Feature):
    """Feature variant that does not record 'overseas inflow' cases."""

    def infection_case_category(self, infection_cases):
        """Return each case's index in ``new_infection_case``, or -1.

        'overseas inflow' rows get -1 (not recorded). Other cases missing
        from the module-level ``causes`` count as "community infection".
        """
        categories = []
        for infection_case in infection_cases:
            # Compare against the value; a bare string literal is always
            # truthy and used to mark every row as -1.
            if infection_case == 'overseas inflow':
                categories.append(-1)
                continue
            if infection_case not in causes:
                infection_case = "community infection"
            categories.append(new_infection_case.index(infection_case))
        return categories

In [42]:
# Overseas-inflow-excluded variant on the patient routes.
feature = FeatureCase3(names, counts, move_types, merge_route, route_dates)
image9 = generate_images(feature, merge_route, route_dates)
images.append(image9)

# Same variant on the club routes.
feature = FeatureCase3(names, counts, move_types, club_route, club_dates)
club9 = generate_images(feature, club_route, club_dates)
club_images.append(club9)

### 5. 피처 변형4 - 감염 원인(4)

- 감염 원인 삭제

In [43]:
# Infection-case variant 4: the 'infection_case' feature gets 0 channels (removed).
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 0, 21, 7]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [44]:
class FeatureCase4(Feature):
    """Feature variant that drops the infection-case feature."""

    def infection_case_category(self, infection_cases):
        """Return -1 (not recorded) for every entry of ``infection_cases``."""
        return [-1 for _ in infection_cases]

In [45]:
# No-infection-case variant on the patient routes.
feature = FeatureCase4(names, counts, move_types, merge_route, route_dates)
image10 = generate_images(feature, merge_route, route_dates)
images.append(image10)

# Same variant on the club routes.
feature = FeatureCase4(names, counts, move_types, club_route, club_dates)
club10 = generate_images(feature, club_route, club_dates)
club_images.append(club10)

### 5. 피처 변형5 - 방문 목적(1)

방문 목적이 비슷한 항목끼리 묶는다.
- store(217)로 분류될 수 있는 속성들은 store로 대체한다.
    - bakery(6), pharmacy(75) → store
    - 5월 12일까지 약국에서 일어난 2차 감염 사례가 보고되지 않았기 때문에 약국도 store로 분류

In [46]:
# Reload everything from disk so this experiment starts from unmodified data.
patient_route = pd.read_csv('covid19/PatientRoute.csv')
patient_info = pd.read_csv('covid19/PatientInfo.csv')[
    ["patient_id", "global_num", "sex", "age", "infection_case"]
]
club_route = pd.read_csv('club.csv').dropna(subset=['latitude', 'longitude'])
# Seoul routes whose patient has both sex and age recorded.
merge_route = pd.merge(patient_info, patient_route)
merge_route = merge_route[
    (merge_route["province"] == "Seoul")
    & merge_route['sex'].notna()
    & merge_route['age'].notna()
]

In [47]:
# Visit-type variant 1: 'bakery' and 'pharmacy' are absent from move_types,
# leaving 19 type channels.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 4, 19, 7]
move_types = ['karaoke', 'gas_station', 'gym', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'airport']

In [48]:
# Fold bakery and pharmacy visits into 'store'.
# Work on independent copies and assign the replaced column back: a chained
# `df['type'].replace(..., inplace=True)` acts on a temporary Series and does
# not update the frame under pandas Copy-on-Write, and aliasing club_route
# would also rewrite the source frame.
second_route = merge_route.copy()
second_route['type'] = second_route['type'].replace({'bakery': 'store', 'pharmacy': 'store'})
club_second_route = club_route.copy()
club_second_route['type'] = club_second_route['type'].replace({'bakery': 'store', 'pharmacy': 'store'})

In [49]:
# Store-merged variant on the patient routes.
feature = Feature(names, counts, move_types, second_route, route_dates)
image11 = generate_images(feature, second_route, route_dates)
images.append(image11)

# Same variant on the club routes.
feature = Feature(names, counts, move_types, club_second_route, club_dates)
club11 = generate_images(feature, club_second_route, club_dates)
club_images.append(club11)

### 5. 피처 변형5 - 방문 목적(2)

방문 목적이 비슷한 항목끼리 묶는다.
- restaurant로 분류될 수 있는 속성들은 restaurant(203)로 대체한다.
  - cafe(45), bar(9)

In [50]:
# Reload everything from disk so this experiment starts from unmodified data.
patient_route = pd.read_csv('covid19/PatientRoute.csv')
patient_info = pd.read_csv('covid19/PatientInfo.csv')[
    ["patient_id", "global_num", "sex", "age", "infection_case"]
]
club_route = pd.read_csv('club.csv').dropna(subset=['latitude', 'longitude'])
# Seoul routes whose patient has both sex and age recorded.
merge_route = pd.merge(patient_info, patient_route)
merge_route = merge_route[
    (merge_route["province"] == "Seoul")
    & merge_route['sex'].notna()
    & merge_route['age'].notna()
]

In [51]:
# Visit-type variant 2: 'cafe' and 'bar' are absent from move_types,
# leaving 19 type channels.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 4, 19, 7]
move_types = ['karaoke', 'gas_station', 'gym', 'bakery', 'pc_cafe',
              'beauty_salon', 'school', 'church', 'bank',
              'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [52]:
# Fold cafe and bar visits into 'restaurant'.
# Work on independent copies and assign the replaced column back: a chained
# `df['type'].replace(..., inplace=True)` acts on a temporary Series and does
# not update the frame under pandas Copy-on-Write, and aliasing club_route
# would also rewrite the source frame.
third_route = merge_route.copy()
third_route['type'] = third_route['type'].replace({'cafe': 'restaurant', 'bar': 'restaurant'})
club_third_route = club_route.copy()
club_third_route['type'] = club_third_route['type'].replace({'cafe': 'restaurant', 'bar': 'restaurant'})

In [53]:
# Restaurant-merged variant on the patient routes.
feature = Feature(names, counts, move_types, third_route, route_dates)
image12 = generate_images(feature, third_route, route_dates)
images.append(image12)

# Same variant on the club routes.
feature = Feature(names, counts, move_types, club_third_route, club_dates)
club12 = generate_images(feature, club_third_route, club_dates)
club_images.append(club12)

### 5. 피처 변형5 - 방문 목적(3)

실내 환경이 비슷한 항목끼리 묶는다.
- 실내 인구 밀집도가 높고 폐쇄된 지역은 'high_population'으로 대체한다.
  - karaoke(2), gym(3), pc_cafe(24), church(64)

In [54]:
# Reload everything from disk so this experiment starts from unmodified data.
patient_route = pd.read_csv('covid19/PatientRoute.csv')
patient_info = pd.read_csv('covid19/PatientInfo.csv')[
    ["patient_id", "global_num", "sex", "age", "infection_case"]
]
club_route = pd.read_csv('club.csv').dropna(subset=['latitude', 'longitude'])
# Seoul routes whose patient has both sex and age recorded.
merge_route = pd.merge(patient_info, patient_route)
merge_route = merge_route[
    (merge_route["province"] == "Seoul")
    & merge_route['sex'].notna()
    & merge_route['age'].notna()
]

In [55]:
# Visit-type variant 3: karaoke, gym, pc_cafe and church are represented by
# the single 'high_population' entry, which leaves 18 visit types.
# `counts` holds the channel width of each feature, in the same order as
# `names`; the 'type' width must equal len(move_types) (it was 19, which
# reserved one channel that no type could ever fill).
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 4, 18, 7]
move_types = ['gas_station', 'bakery', 'high_population',
              'beauty_salon', 'school', 'bank', 'cafe',
              'bar', 'post_office', 'real_estate_agency', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy', 'airport']

In [56]:
# Fold the crowded indoor venues (karaoke, gym, pc_cafe, church) into
# 'high_population'. 'gym' used to be mapped to 'restaurant', contradicting
# the grouping described for this experiment.
# Work on independent copies and assign the replaced column back: a chained
# `df['type'].replace(..., inplace=True)` acts on a temporary Series and does
# not update the frame under pandas Copy-on-Write, and aliasing club_route
# would also rewrite the source frame.
forth_route = merge_route.copy()
forth_route['type'] = forth_route['type'].replace({'karaoke': 'high_population', 'gym': 'high_population', 'pc_cafe': 'high_population', 'church': 'high_population'})
club_forth_route = club_route.copy()
club_forth_route['type'] = club_forth_route['type'].replace({'karaoke': 'high_population', 'gym': 'high_population', 'pc_cafe': 'high_population', 'church': 'high_population'})

In [57]:
# High-population-merged variant on the patient routes.
feature = Feature(names, counts, move_types, forth_route, route_dates)
image13 = generate_images(feature, forth_route, route_dates)
images.append(image13)

# Same variant on the club routes.
feature = Feature(names, counts, move_types, club_forth_route, club_dates)
club13 = generate_images(feature, club_forth_route, club_dates)
club_images.append(club13)

### 5. 피처 변형5 - 방문 목적(4)

- 통계적으로 다른 방문지보다 방문 횟수가 적은 곳은 'etc'로 대체한다.
  - gas_station(3), beauty_salon(6), school(11), real_estate_agency(16), airport(28), bank(11), post_office(5)

In [58]:
# Reload everything from disk so this experiment starts from unmodified data.
patient_route = pd.read_csv('covid19/PatientRoute.csv')
patient_info = pd.read_csv('covid19/PatientInfo.csv')[
    ["patient_id", "global_num", "sex", "age", "infection_case"]
]
club_route = pd.read_csv('club.csv').dropna(subset=['latitude', 'longitude'])
# Seoul routes whose patient has both sex and age recorded.
merge_route = pd.merge(patient_info, patient_route)
merge_route = merge_route[
    (merge_route["province"] == "Seoul")
    & merge_route['sex'].notna()
    & merge_route['age'].notna()
]

In [59]:
# Visit-type variant 4: rarely visited types are absent from move_types,
# leaving 14 type channels.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 4, 14, 7]
move_types = ['karaoke', 'gym', 'bakery', 'pc_cafe',
              'church', 'cafe',
              'bar', 'lodging',
              'public_transportation', 'restaurant', 'etc', 'store',
              'hospital', 'pharmacy']

In [60]:
# Fold the rarely visited types into 'etc'.
# Work on independent copies and assign the replaced column back: a chained
# `df['type'].replace(..., inplace=True)` acts on a temporary Series and does
# not update the frame under pandas Copy-on-Write, and aliasing club_route
# would also rewrite the source frame.
fifth_route = merge_route.copy()
fifth_route['type'] = fifth_route['type'].replace({'gas_station': 'etc', 'beauty_salon': 'etc', 'school': 'etc', 'real_estate_agency': 'etc', 'airport': 'etc', 'bank': 'etc', 'post_office': 'etc'})
club_fifth_route = club_route.copy()
club_fifth_route['type'] = club_fifth_route['type'].replace({'gas_station': 'etc', 'beauty_salon': 'etc', 'school': 'etc', 'real_estate_agency': 'etc', 'airport': 'etc', 'bank': 'etc', 'post_office': 'etc'})

In [61]:
# Rare-types-merged variant on the patient routes.
feature = Feature(names, counts, move_types, fifth_route, route_dates)
image14 = generate_images(feature, fifth_route, route_dates)
images.append(image14)

# Same variant on the club routes.
feature = Feature(names, counts, move_types, club_fifth_route, club_dates)
club14 = generate_images(feature, club_fifth_route, club_dates)
club_images.append(club14)

### 5. 피처 변형5 - 방문 목적(5)

- 방문 목적 전체 삭제

In [62]:
# Reload everything from disk so this experiment starts from unmodified data.
patient_route = pd.read_csv('covid19/PatientRoute.csv')
patient_info = pd.read_csv('covid19/PatientInfo.csv')[
    ["patient_id", "global_num", "sex", "age", "infection_case"]
]
club_route = pd.read_csv('club.csv').dropna(subset=['latitude', 'longitude'])
# Seoul routes whose patient has both sex and age recorded.
merge_route = pd.merge(patient_info, patient_route)
merge_route = merge_route[
    (merge_route["province"] == "Seoul")
    & merge_route['sex'].notna()
    & merge_route['age'].notna()
]
# The 2020-04-20 club rows are excluded.
club_route = club_route[club_route["date"] != "2020-04-20"]

In [63]:
# Visit-type variant 5: the 'type' feature gets 0 channels (removed), so no
# visit types are listed.
# `counts` holds the channel width of each feature, in the same order as `names`.
names = ['age', 'sex', 'infection_case', 'type', 'date']
counts = [11, 2, 4, 0, 7]
move_types = []

In [64]:
# No type rewriting is needed for this variant; these names refer to the
# same frames as merge_route / club_route (no copy is made).
sixth_route = merge_route
club_sixth_route = club_route

In [65]:
class FeatureType1(Feature):
    """Feature variant that drops the visit-type feature."""

    def type_category(self, types):
        """Return -1 (not recorded) for every entry of ``types``."""
        return [-1 for _ in types]

In [66]:
# No-visit-type variant on the patient routes.
feature = FeatureType1(names, counts, move_types, sixth_route, route_dates)
image15 = generate_images(feature, sixth_route, route_dates)
images.append(image15)

# Same variant on the club routes. The images are generated from
# club_sixth_route, the frame the feature object was built for (the previous
# experiment's club_fifth_route was passed here by mistake).
feature = FeatureType1(names, counts, move_types, club_sixth_route, club_dates)
club15 = generate_images(feature, club_sixth_route, club_dates)
club_images.append(club15)

### 6. ConvLSTM

In [72]:
from keras.models import Sequential, load_model
from keras.layers.convolutional import Conv3D
from keras.layers.convolutional_recurrent import ConvLSTM2D
from keras.layers.normalization import BatchNormalization
import keras.backend.tensorflow_backend as K
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pylab as plt

Using TensorFlow backend.

Bad key "text.kerning_factor" on line 4 in
C:\Users\Yejin Kim\anaconda3\envs\AutoCOVID19\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [73]:
# rs: side length of the square grid (also used as the ConvLSTM filter
# count in get_model); n_step: number of frames per model input window.
rs, n_step, n_test = 32, 3, 1

In [74]:
def get_model(channel):
    """Build and compile the stacked ConvLSTM model.

    The network takes movies of shape (n_step, rs, rs, channel) and returns
    a movie of identical shape: four ConvLSTM2D layers, each followed by
    BatchNormalization, and a final sigmoid Conv3D layer that maps back to
    ``channel`` output channels. ``n_step`` and ``rs`` are read from the
    module-level globals at call time.

    Args:
        channel: Number of channels per input frame; also the number of
            filters of the final Conv3D layer.

    Returns:
        The compiled ``Sequential`` model (MSE loss, RMSprop optimizer).
    """
    with K.tf_ops.device('/device:GPU:0'):
        seq = Sequential()
        # Only the first layer declares the input shape.
        seq.add(ConvLSTM2D(filters=rs, kernel_size=(3, 3),
                           input_shape=(n_step, rs, rs, channel),
                           padding='same', return_sequences=True))
        seq.add(BatchNormalization())

        # Three further identical ConvLSTM + BatchNormalization stages.
        for _ in range(3):
            seq.add(ConvLSTM2D(filters=rs, kernel_size=(3, 3),
                               padding='same', return_sequences=True))
            seq.add(BatchNormalization())

        seq.add(Conv3D(filters=channel, kernel_size=(3, 3, 3),
                       activation='sigmoid',
                       padding='same', data_format='channels_last'))

        # The original call passed the two names swapped
        # (loss='rmsprop', optimizer='mean_squared_error'); neither is valid
        # in that position. An earlier alternative used
        # loss='binary_crossentropy' with optimizer='adadelta'.
        seq.compile(loss='mean_squared_error', optimizer='rmsprop')
    return seq

#### train/test

In [None]:
# Train one model per entry of `images` and save each of them as
# model/train_seq_image<j>.h5.
for j, image in enumerate(images):
    n, channel = image.shape[0], image.shape[3]

    # The training windows never reach into the last 2 * n_step frames.
    n_test = 2 * n_step
    n_train = n - n_test

    train = image

    # Every sample pairs the n_step frames before index s (input) with the
    # n_step frames starting at s (target).
    window_starts = range(n_step, n_train - n_step)
    X_train = np.array([train[s - n_step:s, :, :] for s in window_starts])
    y_train = np.array([train[s:s + n_step, :, :] for s in window_starts])

    seq = get_model(channel)
    seq.fit(X_train, y_train, epochs=200, batch_size=32)
    seq.save('model/train_seq_image%d.h5' % j)

In [None]:
---

#### Result

In [None]:
# Evaluate one saved model per entry of `images` against the matching club
# image array and print the per-step test RMSE.
for i, image in enumerate(images):
    n = image.shape[0]
    channel = image.shape[3]
    rs = 32
    n_step = 14

    n_test = n_step * 2
    n_train = n - n_test

    seq = load_model('model/seq_image%d_2.h5' % i)
    m_max = 300

    # Model input: the last n_step frames of tests[i], with a batch axis.
    # NOTE(review): `tests` is not defined in the visible part of the
    # notebook; confirm it is built before this cell runs.
    test = tests[i]
    X_test = test[-n_step:, :, :]
    X_test = X_test.reshape((1, n_step, rs, rs, channel))
    # Ground truth: the last n_step frames of the matching club images.
    club_image = club_images[i]
    y_test = club_image[-n_step:, :, :]
    print(X_test.shape)
    print(y_test.shape)

    # prediction
    pred = seq.predict(X_test)
    pred *= m_max  # inverse scaling
    print(pred.shape)
    pred = pred.reshape(n_step, rs, rs, channel)

    real_vv = []
    pre_vv = []
    for t in range(n_step):
        tmp = y_test[t].reshape((rs, rs, channel))
        # Element-wise mask of the non-NaN entries. The original took the
        # row/column indices of a 3-D argwhere, which repeated every cell
        # once per channel and still selected NaNs from other channels.
        valid = ~np.isnan(tmp)
        real_vv.append(tmp[valid])
        pre_vv.append(pred[t][valid])

    real_vv = np.array(real_vv)
    pre_vv = np.array(pre_vv)
    print(real_vv.shape)
    print(pre_vv.shape)

    # A separate index name keeps the outer loop's `i` intact; iterate over
    # n_step instead of a hard-coded 14.
    for step in range(n_step):
        rmse = np.sqrt(mean_squared_error(real_vv[step], pre_vv[step]))
        print(step + 1, " : ", 'Test RMSE: %.3f' % rmse)

In [None]:
---

In [None]:
# Single-channel evaluation of the current `seq` model.
# NOTE(review): `cimdata` is not defined in the visible part of the
# notebook; confirm where it comes from before running this cell.
model = seq
m_max = 300

# The window length follows n_step everywhere; the original hard-coded 12
# in the reshape and in the RMSE loop while slicing n_step frames.
X_test = test[-n_step:, :, :]
X_test = X_test.reshape((1, n_step, rs, rs, 1))
y_test = cimdata[-n_step:, :, :]
print(X_test.shape)
print(y_test.shape)

# prediction
pred = model.predict(X_test)
pred *= m_max  # inverse scaling
print(pred.shape)
pred = pred.reshape(n_step, rs, rs)

real_vv = []
pre_vv = []
for t in range(n_step):
    tmp = y_test[t].reshape((rs, rs))
    # One boolean mask replaces the two identical argwhere calls; it selects
    # the same non-NaN cells in the same row-major order.
    valid = ~np.isnan(tmp)
    real_vv.append(tmp[valid])
    pre_vv.append(pred[t][valid])

real_vv = np.array(real_vv)
pre_vv = np.array(pre_vv)
print(real_vv.shape)
print(pre_vv.shape)

for i in range(n_step):
    rmse = np.sqrt(mean_squared_error(real_vv[i], pre_vv[i]))
    print(i + 1, " : ", 'Test RMSE: %.3f' % rmse)

In [None]:
# Display the shape of image10 as the cell output.
image10.shape

In [None]:
# Split image10 along its first axis: the last 2 * n_step frames form the
# test part, everything before them the training part.
n = image10.shape[0]
rs, n_step = 32, 14

n_test = 2 * n_step
n_train = n - n_test

train, test = image10[:n_train, :, :], image10[n_train:, :, :]

print(train.shape, test.shape, sep='\n')

In [None]:
# Build (input, target) window pairs from `train`: the n_step frames before
# index s are the input, the n_step frames starting at s are the target.
window_starts = range(n_step, n_train - n_step)
X_train = np.array([train[s - n_step:s, :, :] for s in window_starts])
y_train = np.array([train[s:s + n_step, :, :] for s in window_starts])

print(X_train.shape, y_train.shape, sep='\n')

In [None]:
# Size of axis 3 of image10; passed to get_model as the channel count.
channel = image10.shape[3]

In [None]:
# get_model takes a single argument (the channel count); the extra leading
# 0 in the original call raised a TypeError.
seq = get_model(channel)

In [None]:
# Train the model on the window pairs for 50 epochs with batch size 32.
seq.fit(X_train, y_train, epochs = 50, batch_size = 32)

In [None]:
# Save the trained model to seq_image10.h5 in the working directory.
seq.save('seq_image10.h5')

#### Result

In [None]:
# Multi-channel evaluation of the current `seq` model on the last n_step
# frames of `test`.
model = seq
m_max = 300

# The window length follows n_step everywhere; the original hard-coded 14
# in the reshape and 12 in the RMSE loop.
X_test = test[-n_step:, :, :]
X_test = X_test.reshape((1, n_step, rs, rs, channel))
# y_test=cimdata[-n_step:,:,:]
# NOTE(review): y_test is not assigned in this cell (the line above is
# commented out), so the loop below uses whatever y_test an earlier cell
# left behind. Confirm that this is the intended ground truth.
print(X_test.shape)
# print(y_test.shape)

# prediction
pred = model.predict(X_test)
pred *= m_max  # inverse scaling
print(pred.shape)
pred = pred.reshape(n_step, rs, rs, channel)

real_vv = []
pre_vv = []
for t in range(n_step):
    tmp = y_test[t].reshape((rs, rs, channel))
    # Element-wise mask of the non-NaN entries. The original took the
    # row/column indices of a 3-D argwhere, which repeated every cell once
    # per channel and still selected NaNs from other channels.
    valid = ~np.isnan(tmp)
    real_vv.append(tmp[valid])
    pre_vv.append(pred[t][valid])

real_vv = np.array(real_vv)
pre_vv = np.array(pre_vv)
print(real_vv.shape)
print(pre_vv.shape)

for i in range(n_step):
    rmse = np.sqrt(mean_squared_error(real_vv[i], pre_vv[i]))
    print(i + 1, " : ", 'Test RMSE: %.3f' % rmse)

In [None]:
# Reshape `train` to (samples, n_step, rs, rs, 45) and `test` to
# (samples, rs, rs, 1), printing each resulting shape.
# NOTE(review): the channel counts 45 and 1 are hard-coded and differ from
# each other; confirm they match the arrays this cell is run on.
train=train.reshape((train.shape[0],n_step,rs,rs,45))
print(train.shape)
test=test.reshape((test.shape[0],rs,rs,1))
print(test.shape)

In [None]:
---
# (n_frames, width, height, channels) and returns a movie

(17484, 12, 50, 50)
(17484, 1, 50, 50)

(17484, 12, 50, 50, 1)
(17484, 50, 50, 1)

In [None]:
# Reshape X_train to (samples, 14, 256, 256, 45) and build y_train as
# (samples, 256, 256, 45), printing each resulting shape.
# NOTE(review): y_train is rebuilt from `y_train3`, which is not defined in
# the visible part of the notebook, and the sizes 14 / 256 / 45 are
# hard-coded rather than taken from n_step / rs / channel; confirm both.
X_train=X_train.reshape((X_train.shape[0],14,256,256,45))
print(X_train.shape)
y_train=y_train3.reshape((y_train.shape[0],256,256,45))
print(y_train.shape)

In [None]:
# get_model takes a single argument (the channel count); the extra leading
# 0 in the original call raised a TypeError.
# NOTE(review): other cells read the channel count from axis 3
# (image.shape[3]); confirm that axis 2 is right for image1.
seq = get_model(image1.shape[2])

In [None]:
# Print the layer-by-layer summary of the model.
seq.summary()

In [None]:
# Fit on the first 1000 samples, holding out 5% of them for validation, and
# keep the returned training history.
# NOTE(review): `noisy_movies` and `shifted_movies` are not defined in the
# visible part of the notebook; confirm they exist before running.
history = seq.fit(noisy_movies[:1000], shifted_movies[:1000], batch_size=10,
        epochs=50, validation_split=0.05)

In [None]:
# `os` is not imported in any visible cell, so import it here.
import os

# Create the output directory if it does not exist yet. makedirs with
# exist_ok=True replaces the separate exists() check followed by mkdir().
model_dir = './model'
os.makedirs(model_dir, exist_ok=True)

# NOTE(review): `number` is not defined in the visible part of the notebook;
# confirm it is set before this cell runs.
model_path = model_dir + '/covid%d.h5' % number
seq.save(model_path)

In [None]:
# Print element 1 and element 0 of the evaluate() result, labelled as
# accuracy and loss respectively.
# NOTE(review): evaluate() is called twice and without any data, and the
# compile call in get_model passes no metrics, so the result may not be
# indexable; confirm the intended inputs and metrics.
print("정확도 : %.4f" % (model.evaluate()[1]))
print("손실 : %.4f" % (model.evaluate()[0]))