In [1]:
import pandas as pd
import numpy as np
import os
import re
from datetime import datetime
import pickle

## Split Original Files

월 단위로 되어있는 데이터를 편의상 일 단위로 변환

In [3]:
# Monthly trace files are named YYYY_MM.csv.
dot_csv = re.compile('[0-9]{4}_[0-9]{2}[.]csv')
# fullmatch() anchors both ends of the name; match() only anchors the start and
# would also accept leftovers such as "2014_01.csv.bak".
# sorted() makes the processing order deterministic (os.listdir order is arbitrary).
csv_files = sorted(
    file for file in os.listdir("traceset1") if dot_csv.fullmatch(file)
)

In [5]:
def split_file(filename):
    """Split one monthly trace CSV into one CSV per calendar day.

    Reads ``traceset1/<filename>``, groups its rows by the calendar date of
    the ``timestamp`` column and writes every group to
    ``trace_by_date/YYYY_MM_DD.csv``. As before, the original row index is
    written as the first (unnamed) column of each output file.

    Parameters
    ----------
    filename : str
        Name of a CSV file inside ``traceset1/`` that has a ``timestamp``
        column parseable by ``pd.to_datetime``.

    Notes
    -----
    * Rows are grouped by full date (year, month, day), not by day-of-month,
      so e.g. 2014-01-01 followed by 2014-02-01 ends up in two files.
    * All rows of the same date go to the same file even if they are not
      contiguous in the input.
    * Rows whose timestamp is missing (NaT) are not written to any file.
    * An input without data rows produces no output files.
    """
    df = pd.read_csv("traceset1/" + filename)

    # Parse the whole column in one vectorized call; converting row by row in
    # a Python loop is far slower on month-sized traces.
    days = pd.to_datetime(df["timestamp"]).dt.normalize().rename("day")

    os.makedirs("trace_by_date", exist_ok=True)
    for day, day_rows in df.groupby(days, sort=False):
        day_rows.to_csv("trace_by_date/" + day.strftime("%Y_%m_%d") + ".csv")

In [6]:
# Break every monthly trace file into per-day files.
for monthly_csv in csv_files:
    split_file(monthly_csv)

KeyboardInterrupt: 

## Change Time unit

기존 데이터는 시간을 '년-월-일 시:분:초' 형식의 문자열로 저장함.

이를 날짜별로 해당 날짜의 자정을 기준으로 경과한 시간(초 단위)으로 변환

In [2]:
# Load the sample trace file for 2014-01-01; the cells below prototype the
# column transformations on this frame.
df = pd.read_csv("sample/2014_01_01.csv")

In [3]:
# Reference instant: midnight at the start of the sample day.
t0 = datetime(year=2014, month=1, day=1)
# Seconds elapsed between that midnight and each record's timestamp.
df["sec"] = pd.to_datetime(df["timestamp"]).sub(t0).dt.total_seconds()

In [8]:
# Automated version - input: dataframe
def to_sec(df):
    """Add a ``sec`` column holding seconds elapsed since midnight.

    Each value is measured from midnight of that row's own calendar date, so
    the function works for a frame of any day (the previous implementation
    always measured from 2014-01-01 because the file name was hard-coded).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column parseable by ``pd.to_datetime``.
        It is modified in place: a float ``sec`` column is added or
        overwritten.

    Returns
    -------
    None
    """
    timestamps = pd.to_datetime(df["timestamp"])
    # normalize() floors every timestamp to 00:00:00 of the same date.
    df["sec"] = (timestamps - timestamps.dt.normalize()).dt.total_seconds()

## Client Mapping

기존 데이터는 고객을 임의의 string으로 구분함.

이를 숫자로 변환.

In [4]:
# Map every distinct client string to a small integer, numbered in order of
# first appearance.
dic = {}
c_id_lst = []
for client in df.client:
    # setdefault returns the existing ID, or stores and returns the next free
    # one (len(dic) is evaluated before the insertion).
    c_id_lst.append(dic.setdefault(client, len(dic)))

# Next unused ID == number of distinct clients seen so far.
c_id = len(dic)

df["ID"] = c_id_lst

In [28]:
# Automated version - input: dataframe
def to_ID(df):
    """Add an integer ``ID`` column that numbers the clients in ``df``.

    The client -> ID mapping and the next free ID are shared between calls
    through two pickle files in the working directory,
    ``ID_dictionary.pickle`` and ``ID_cnt.pickle``, so a client keeps the
    same ID across frames. Both files are rewritten at the end of the call.

    If the state files do not exist yet (first call), the mapping starts
    empty. If only the counter file is missing, the counter resumes after
    the largest ID already in the mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``client`` column. Modified in place: an ``ID`` column
        is added or overwritten.

    Returns
    -------
    None
    """
    # The dictionary and the last count must be shared between calls: pickle.
    # NOTE(security): pickle.load can execute arbitrary code; only use state
    # files that were written by this function.
    try:
        with open("ID_dictionary.pickle", "rb") as fr:
            dic = pickle.load(fr)
    except FileNotFoundError:
        dic = {}
    try:
        with open("ID_cnt.pickle", "rb") as fr:
            c_id = pickle.load(fr)
    except FileNotFoundError:
        c_id = max(dic.values()) + 1 if dic else 0

    c_id_lst = []
    for client in df.client:
        if client not in dic:
            # First time this client is seen: hand out the next free ID.
            dic[client] = c_id
            c_id += 1
        c_id_lst.append(dic[client])
    df["ID"] = c_id_lst

    with open("ID_dictionary.pickle", "wb") as fw:
        pickle.dump(dic, fw)
    with open("ID_cnt.pickle", "wb") as fw:
        pickle.dump(c_id, fw)

In [5]:
# Display the frame to inspect the derived "sec" and "ID" columns.
df

Unnamed: 0,timestamp,client,AP,sec,ID
0,2014-01-01 00:00:24,b7f22f3abd9f367af65bd70b4b328e0c35f7cfac,Bldg11AP21,24.0,0
1,2014-01-01 00:00:30,336501aeaedff0462f3cd55089fa433dbbfff493,Bldg44AP3,30.0,1
2,2014-01-01 00:00:35,4b912f490ddc1f289f35def68a1662bcc5089a1a,Bldg48AP65,35.0,2
3,2014-01-01 00:00:40,8e10f9d5aa9c229baae2f745abd25b9c9522ed64,Bldg25AP19,40.0,3
4,2014-01-01 00:00:51,680fc4c2bd6b252fd6e1a67ad7aa69cdbd3d6e0f,Bldg25AP5,51.0,4
...,...,...,...,...,...
12724,2014-01-01 23:58:36,34a2fe9462c797cfc9916b7aa84bbe4a7b42327a,Bldg44AP15,86316.0,423
12725,2014-01-01 23:59:03,ec438383187d2fb4a00d4d7cd235494729d05198,Bldg44AP4,86343.0,42
12726,2014-01-01 23:59:14,2a9553acf404141b23ce2361d2d58e0ae7b72bb4,Bldg44AP64,86354.0,33
12727,2014-01-01 23:59:35,ca2a3933771ce90b5368dfdb95d1959f66fe0150,Bldg14AP12,86375.0,85


## AP -> AP_ID

다루기 쉽게 string으로 된 아이디를 숫자로 변환

In [None]:
# Load the AP location table, replace missing values with 0 so that "floor"
# can be stored as an integer, and use the row position as the numeric AP_ID.
df = (
    pd.read_csv("traceset1/APlocations.csv")
    .fillna(0)
    .astype({"floor": "int64"})
    .assign(AP_ID=lambda ap_table: ap_table.index)
)
# Overwrites the source file with the augmented table.
df.to_csv("traceset1/APlocations.csv", index=False)

## Access Point

ID로 저장된 Access Point를 x,y좌표가 적힌 텍스트 파일을 이용해 통합

In [None]:
# Implemented outside this notebook: see add_location(df) in preprocessing.py