# GPS data preprocessing

## 0. 원본(raw) 데이터는 ETRI AIfactory 사이트에서 획득할 수 있습니다.

>https://nanum.etri.re.kr/share/schung1/ETRILifelogDataset2020?lang=ko_KR

## 1. Installing libraries and importing packages

In [23]:
pip install datatable

Defaulting to user installation because normal site-packages is not writeable
Collecting datatable
  Downloading datatable-1.0.0-cp36-cp36m-manylinux_2_12_x86_64.whl (96.6 MB)
     |████████████████████████████████| 96.6 MB 94 kB/s              
[?25hInstalling collected packages: datatable
Successfully installed datatable-1.0.0
Note: you may need to restart the kernel to use updated packages.


In [49]:
pip install haversine

Defaulting to user installation because normal site-packages is not writeable
Collecting haversine
  Downloading haversine-2.8.0-py2.py3-none-any.whl (7.7 kB)
Installing collected packages: haversine
Successfully installed haversine-2.8.0
Note: you may need to restart the kernel to use updated packages.


In [700]:
pip install tqdm

Defaulting to user installation because normal site-packages is not writeable
Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
     |████████████████████████████████| 78 kB 1.7 MB/s             
[?25hCollecting importlib-resources
  Downloading importlib_resources-5.4.0-py3-none-any.whl (28 kB)
Collecting zipp>=3.1.0
  Downloading zipp-3.6.0-py3-none-any.whl (5.3 kB)
Installing collected packages: zipp, importlib-resources, tqdm
Successfully installed importlib-resources-5.4.0 tqdm-4.64.1 zipp-3.6.0
Note: you may need to restart the kernel to use updated packages.


In [950]:
import os
import datatable as dt
import pandas as pd
import numpy as np
import re
from haversine import haversine
from datetime import datetime
from tqdm import tqdm
import time
import pandas as pd

## 2. Generating functions for preprocessing

## 2-1. extractAllUserDistanceOfYear(year)

> 년도를 파라미터로 입력하면 해당 년도의 모든 대상자에 대한 gps data를 numpy ndarray 형태로 반환<br>
> 대상자 ID에 포함된 숫자, timestamp에 대하여 오름차순으로 정렬하여 반환

In [460]:
def extractAllUserDistanceOfYear(year, basePath = '/mnt/data/tkeo12') :
    """Load every user's per-minute GPS CSV files for one year into one array.

    Directory layout that is read (everything lives under ``basePath``):

    * 2018 / 2019:
      ``dataset_{year}/dataset_{year}/<userName>/<dayCode>/mGps/<timestamp>.csv``
      with ``<userName>`` being an integer-like string.
    * 2020:
      ``<group>/<group>/<userName>/<dayCode>/mGps/<timestamp>.csv`` where
      ``<group>`` matches ``^user[0-9]+-[0-9]+$``; groups and users are
      ordered by the integer value of the last two characters of their name.

    Users, day folders and CSV files are each visited in ascending numeric
    order, and rows are emitted in that order.

    Parameters
    ----------
    year : int
        2018, 2019 or 2020.
    basePath : str, optional
        Root directory of the dataset folders. Defaults to the location the
        notebook was originally run against.

    Returns
    -------
    numpy.ndarray or None
        A 2-D array with 7 columns: user name, date (``'%Y-%m-%d'``, derived
        from the day folder name), time (``'%H:%M:%S'``, derived from the CSV
        file name), followed by the CSV columns. The seed shapes used below
        fix the width, so every CSV must contribute exactly 4 columns.
        ``None`` is returned (after printing a message) for any other year.

    Notes
    -----
    * ``datetime.fromtimestamp`` is called without a time zone, so dates and
      times are rendered in the local time zone of the machine.
    * The zero-row float arrays that seed every concatenation are kept on
      purpose: they make the dtype of the result the same as it was when the
      arrays were grown one concatenation at a time.
    """

    if year in [2018, 2019] :

        yearDir = f'{basePath}/dataset_{year}/dataset_{year}'
        userNameList = sorted(os.listdir(yearDir), key = int)

        # (user name, absolute user directory) pairs, in processing order.
        userPathList = [(userName, f'{yearDir}/{userName}') for userName in userNameList]

    elif year == 2020 :

        groupPattern = re.compile("^user[0-9]+-[0-9]+$")

        userDirList = [i for i in os.listdir(basePath) if groupPattern.search(i) is not None]
        userDirList.sort(key = lambda x : int(x[-2:]))

        userPathList = []

        for userDir in userDirList :

            groupDir = f'{basePath}/{userDir}/{userDir}'
            userNameList = sorted(os.listdir(groupDir), key = lambda x : int(x[-2:]))

            userPathList.extend((userName, f'{groupDir}/{userName}') for userName in userNameList)

    else :

        print("You should check inputed year value")

        return

    # Compiled once instead of once per user: day folders are all-digit names.
    dayPattern = re.compile("^[0-9]+$")

    # Chunks are collected in lists and concatenated once per level; growing
    # an array with np.concatenate inside the loop copies all previous rows
    # on every iteration.
    userChunks = [np.zeros([0, 7])]

    for userName, userPath in userPathList :

        dayCodeList = [dayCode for dayCode in os.listdir(userPath) if dayPattern.search(dayCode) is not None]
        dayCodeList.sort(key = int)

        dayChunks = [np.zeros([0, 6])]

        for dayCode in dayCodeList :

            date = datetime.fromtimestamp(int(dayCode)).strftime('%Y-%m-%d')
            gpsDir = f'{userPath}/{dayCode}/mGps'

            gpsFileList = [gpsFile for gpsFile in os.listdir(gpsDir) if gpsFile[-4:] == '.csv']
            gpsFileList.sort(key = lambda x : int(x[:-4]))

            minuteChunks = [np.zeros([0, 5])]

            for gpsFile in gpsFileList :

                # The file name (without '.csv') is an epoch timestamp.
                minute = datetime.fromtimestamp(int(gpsFile[:-4])).strftime('%H:%M:%S')

                oneMinuteGpsData = dt.fread(f'{gpsDir}/{gpsFile}').to_numpy()

                # Prepend the time column to every row of this file.
                minuteChunks.append(np.hstack((np.full((oneMinuteGpsData.shape[0], 1), minute), oneMinuteGpsData)))

            oneDayGpsData = np.concatenate(minuteChunks, axis = 0)

            # Prepend the date column to every row of this day.
            dayChunks.append(np.hstack((np.full((oneDayGpsData.shape[0], 1), date), oneDayGpsData)))

        oneUserGpsData = np.concatenate(dayChunks, axis = 0)

        # Prepend the user-name column to every row of this user.
        userChunks.append(np.hstack((np.full((oneUserGpsData.shape[0], 1), userName), oneUserGpsData)))

    return np.concatenate(userChunks, axis = 0)

## 2-2. joiningActivityBoolean(gpsFinal, year)

> extractAllUserDistanceOfYear 반환값과 year를 파라미터로 입력하면 해당 년도의 모든 대상자에 대하여<br>
> data_preprocessing_1 에서 생성한 유저별 activity timestamp에 해당하는 timestamp의 행만을 남김

In [935]:
def joiningActivityBoolean(gpsFinal, year, basePath = '/mnt/data/tkeo12') :
    """Keep only the GPS rows whose date and time occur in the user's activity file.

    For every user found in column 0 of ``gpsFinal`` a per-user CSV is read:

    * 2018: ``{basePath}/exer2018/user<ID zero-padded to 2>_gps.csv``
    * 2019: ``{basePath}/exer2019/user<ID without its first character>_gps.csv``
    * 2020: ``{basePath}/<userName>_gps.csv``

    The first column of that CSV is read as an epoch timestamp and formatted
    as ``'%Y-%m-%d %H:%M:%S'`` (local time zone of the machine). A GPS row is
    kept when ``"<column 1> <column 2>"`` of that row equals one of those
    formatted timestamps.

    Parameters
    ----------
    gpsFinal : numpy.ndarray
        2-D array with user name in column 0; columns 1 and 2 are expected to
        hold the date and time strings produced by
        ``extractAllUserDistanceOfYear``.
    year : int
        2018, 2019 or 2020.
    basePath : str, optional
        Root directory that holds the activity CSV files.

    Returns
    -------
    numpy.ndarray or None
        The retained rows, grouped by user in ascending numeric user order,
        with the same number of columns as ``gpsFinal``. ``None`` is returned
        (after printing a message) for an unsupported year.
    """

    userNameList = np.unique(gpsFinal[:, 0])

    if year in [2018, 2019] :

        userNameList = sorted(userNameList, key = int)

    elif year == 2020 :

        # 2020 names look like 'userNN'; order by the numeric part.
        userNameList = sorted(userNameList, key = lambda x : int(x[4:]))

    else :

        print("You should check inputed year value")

        return

    # Collected per user and concatenated once at the end; the zero-row float
    # seed keeps the result dtype identical to repeated np.append calls.
    keptChunks = [np.zeros([0, gpsFinal.shape[1]])]

    for userName in tqdm(userNameList, desc = 'userName', position = 0, leave = True) :

        userData = gpsFinal[gpsFinal[:, 0] == userName]

        # Join key of each GPS row: 'YYYY-mm-dd HH:MM:SS'.
        gpsKeys = np.char.add(userData[:, 1], np.char.add(' ', userData[:, 2]))

        if year == 2018 :

            activityPath = f'{basePath}/exer{year}/user{userName.zfill(2)}_gps.csv'

        elif year == 2019 :

            activityPath = f'{basePath}/exer{year}/user{userName[1:]}_gps.csv'

        else :

            activityPath = f'{basePath}/{userName}_gps.csv'

        activityMinuteData = dt.fread(activityPath).to_numpy()

        activityKeys = np.array([datetime.fromtimestamp(int(i[0])).strftime('%Y-%m-%d %H:%M:%S') for i in activityMinuteData])

        # np.isin is the supported spelling of the deprecated np.in1d.
        keptChunks.append(userData[np.isin(gpsKeys, activityKeys)])

    return np.concatenate(keptChunks, axis = 0)

## 2-3. haversineApplyByUserDate_ver2(yearData, year, unit = 'm')

> extractAllUserDistanceOfYear 반환값(혹은 joiningActivityBoolean 처리 이후도 가능)과 년도를 파라미터로 넣어주면 <br>
> 해당 년도의 모든 유저들에 대하여 gps data를 기반으로 이동거리를 haversine 방식으로 하루 단위로 산출한 numpy ndarray를 반환

In [939]:
def haversineApplyByUserDate_ver2(yearData, year, unit = 'm') :
    """Sum haversine distances between consecutive GPS rows, per user and date.

    Rows are grouped by user (column 0), then date (column 1), then time
    (column 2). Inside each time group, columns 4 and 5 are converted to
    float and every pair of consecutive rows is passed to ``haversine`` as
    its two points. The per-group sums are added up for the date.

    Distances are accumulated *within* each time group only: the step from
    the last row of one group to the first row of the next is not counted,
    and a group with a single row contributes 0.

    Parameters
    ----------
    yearData : numpy.ndarray
        2-D array as produced by ``extractAllUserDistanceOfYear`` (optionally
        filtered by ``joiningActivityBoolean``). Columns 4 and 5 are
        presumably latitude and longitude -- TODO confirm against the CSV
        schema.
    year : int
        2018, 2019 or 2020; only used to choose the user sort order.
    unit : optional
        Forwarded unchanged as the third argument of ``haversine``.

    Returns
    -------
    numpy.ndarray or None
        One row per (user, date) with the columns user, date and summed
        distance. All three values go through ``np.array([...])`` together,
        so the distance is stored in the array's common dtype rather than as
        a float. ``None`` is returned (after printing a message) for an
        unsupported year.
    """

    users = np.unique(yearData[:, 0])

    if year in [2018, 2019] :

        users = sorted(users, key = int)

    elif year == 2020 :

        # 2020 names look like 'userNN'; order by the numeric part.
        users = sorted(users, key = lambda x : int(x[4:]))

    else :

        print("You should check inputed year value")

        return

    # One (1, 3) chunk per (user, date), concatenated once at the end instead
    # of re-copying the whole result with np.append on every iteration. The
    # zero-row float seed keeps the result dtype unchanged.
    resultRows = [np.zeros([0, 3])]

    for user in tqdm(users, desc = 'user', position = 0, leave = False) :

        user_data = yearData[yearData[:, 0] == user]

        for date in tqdm(np.unique(user_data[:, 1]), desc = 'date', position = 0, leave = False) :

            date_data = user_data[user_data[:, 1] == date]

            resultValue = 0

            for minute in tqdm(np.unique(date_data[:, 2]), desc = 'minute', position = 0, leave = False) :

                coords = date_data[date_data[:, 2] == minute][:, [4, 5]].astype(float)

                # Pair row k with row k + 1; a single-row group yields no pairs.
                stepDistances = np.array([haversine(start, end, unit) for start, end in zip(coords[:-1], coords[1:])])
                resultValue += np.sum(stepDistances)

            resultRows.append(np.array([user, date, resultValue]).reshape(1, 3))

    return np.concatenate(resultRows, axis = 0)

## 3. Generating final activity distance array from gps data

> gps data로부터 activity distance를 유저별 하루단위로 산출하기 위한 세 가지 함수를 모두 적용하고, 산출된 array를 변수에 지정

In [936]:
# 2018 pipeline, step 1 of 3: load all raw GPS rows of the 2018 dataset.
a1 = extractAllUserDistanceOfYear(2018)

In [937]:
b1 = joiningActivityBoolean(a1, 2018)

userName: 100%|██████████| 29/29 [00:00<00:00, 35.95it/s]


In [940]:
c1 = haversineApplyByUserDate_ver2(b1, 2018)

                                                     

In [1088]:
# Rewrite the 2018 IDs in column 0 as 'user' + the ID zero-padded to two
# characters (the output below shows 'user01'); SleepExerMaker merges this
# column against the 'userId' column of the sleep table.
c1[:, 0] = np.char.add('user', np.char.zfill(c1[:, 0], 2))

In [1092]:
c1[:10, :]

array([['user01', '2018-11-15', '0.0'],
       ['user01', '2018-11-16', '0.0'],
       ['user01', '2018-11-17', '0.0'],
       ['user01', '2018-11-18', '0.0'],
       ['user01', '2018-11-19', '0.0'],
       ['user01', '2018-11-21', '0.0'],
       ['user01', '2018-11-22', '0.0'],
       ['user01', '2018-11-23', '0.0'],
       ['user01', '2018-11-24', '0.0'],
       ['user01', '2018-11-25', '0.0']], dtype='<U32')

In [941]:
# 2019 pipeline, step 1 of 3: load all raw GPS rows of the 2019 dataset.
a2 = extractAllUserDistanceOfYear(2019)

In [942]:
b2 = joiningActivityBoolean(a2, 2019)

userName: 100%|██████████| 20/20 [00:00<00:00, 33.85it/s]


In [1118]:
c2 = haversineApplyByUserDate_ver2(b2, 2019)

                                                     

In [1133]:
# Rewrite the 2019 IDs in column 0 as 'user' + the ID with its first
# character dropped (the output below shows 'user01'); presumably the raw
# 2019 IDs carry a one-character prefix -- TODO confirm.
c2[:, 0] = np.array(['user' + i[1:] for i in c2[:, 0]])

In [1136]:
c2[:10, :]

array([['user01', '2020-01-01', '0.0'],
       ['user01', '2020-01-02', '0.0'],
       ['user01', '2020-01-03', '0.0'],
       ['user01', '2020-01-05', '0.0'],
       ['user01', '2020-01-06', '0.0'],
       ['user01', '2020-01-08', '0.0'],
       ['user01', '2020-01-09', '0.0'],
       ['user01', '2020-01-10', '0.0'],
       ['user01', '2020-01-12', '0.0'],
       ['user01', '2020-01-13', '0.0']], dtype='<U32')

In [1154]:
# 2020 pipeline, step 1 of 3: load all raw GPS rows of the 2020 dataset.
a3 = extractAllUserDistanceOfYear(2020)

In [1155]:
b3 = joiningActivityBoolean(a3, 2020)

userName: 100%|██████████| 22/22 [00:24<00:00,  1.12s/it]


In [1156]:
c3 = haversineApplyByUserDate_ver2(b3, 2020)

                                                     

In [1157]:
c3[:10, :]

array([['user01', '2020-09-25', '1157.7430115214522'],
       ['user03', '2020-09-13', '1518.9766046103969'],
       ['user03', '2020-09-27', '1184.1323509402796'],
       ['user04', '2020-08-31', '7569.131564074202'],
       ['user04', '2020-09-01', '5214.384421606917'],
       ['user04', '2020-09-03', '4418.739242164301'],
       ['user04', '2020-09-05', '499.0298040056463'],
       ['user04', '2020-09-07', '1265.359267615462'],
       ['user04', '2020-09-10', '4868.7750492021205'],
       ['user04', '2020-09-11', '5284.602057618277']], dtype='<U32')

## 4. Merging activity distance array to lifelog data table

> 최종 activity distance array 를 라이프로그 데이터 테이블에 병합 후 저장

In [1152]:
def SleepExerMaker(year, gpsArray = None, basePath = '/mnt/data/tkeo12') :
    """Left-join the per-day GPS activity distance onto the yearly sleep table.

    The sleep table is read from ``{basePath}/user_sleep_{year}_exer.csv``;
    for 2020 its ``date_x`` column is renamed to ``date``. The GPS array
    becomes a frame with the columns ``userName``, ``date`` and
    ``gps_activity_distance`` and is merged with ``how='left'`` on
    ``(userId, date)`` of the sleep table versus ``(userName, date)``.

    Parameters
    ----------
    year : int
        2018, 2019 or 2020.
    gpsArray : array-like of shape (n, 3), optional
        Rows of (user name, date, distance). When omitted, the module-level
        variable ``c1`` (2018), ``c2`` (2019) or ``c3`` (2020) is used.
    basePath : str, optional
        Directory containing the ``user_sleep_{year}_exer.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Every row of the sleep table plus ``userName`` and
        ``gps_activity_distance``; rows without a GPS match hold NaN there.

    Raises
    ------
    ValueError
        If ``year`` is not 2018, 2019 or 2020. Other values used to be
        processed silently as if they were 2020.
    """

    gpsGlobalNames = {2018 : 'c1', 2019 : 'c2', 2020 : 'c3'}

    if year not in gpsGlobalNames :
        raise ValueError(f'year must be 2018, 2019 or 2020, got {year!r}')

    Left = pd.read_csv(f'{basePath}/user_sleep_{year}_exer.csv')

    if year == 2020 :
        Left = Left.rename(columns = {'date_x' : 'date'})

    if gpsArray is None :
        # Looked up lazily by name so that only the array of the requested
        # year has to exist at call time.
        gpsArray = globals()[gpsGlobalNames[year]]

    Right = pd.DataFrame(gpsArray).rename(columns = {0 : 'userName', 1 : 'date', 2 : 'gps_activity_distance'})

    # The distance array stores its values as strings; make the merged
    # column numeric so it can be used in calculations directly.
    Right['gps_activity_distance'] = Right['gps_activity_distance'].astype(float)

    sleepExer = Left.merge(Right, how = 'left', left_on = ['userId', 'date'], right_on = ['userName', 'date'])

    return sleepExer

In [1159]:
# Write the merged 2018 table to a new '..._exer1.csv' file. to_csv is called
# with its default index setting, so the row index is written as well.
SleepExerMaker(2018).to_csv('/mnt/data/tkeo12/user_sleep_2018_exer1.csv')

In [1161]:
# Same export for 2019.
SleepExerMaker(2019).to_csv('/mnt/data/tkeo12/user_sleep_2019_exer1.csv')

In [1162]:
# Same export for 2020.
SleepExerMaker(2020).to_csv('/mnt/data/tkeo12/user_sleep_2020_exer1.csv')