In [1]:
import pandas as pd
import numpy as np
import glob
import os
import gc
import json 
# Root folder of the competition data; the 'train' and 'test' sub-folders used
# by the export cells are resolved relative to it.
base_path = '../input/indoor-location-navigation/'


In [2]:
# Pull out all the buildings actually used in the test set; given the current
# method we don't need the other ones.
ssubm = pd.read_csv('../input/indoor-location-navigation/sample_submission.csv')

# Only 24 of the total buildings are used in the test set,
# which allows us to greatly reduce the initial size of the dataset.

# Split "site_path_timestamp" into the integer-labelled columns 0 (site),
# 1 (path) and 2 (timestamp). The vectorised string split yields the same
# frame as applying pd.Series row by row, without creating one Series per row.
ssubm_df = ssubm["site_path_timestamp"].str.split("_", expand=True)
used_buildings = sorted(ssubm_df[0].dropna().unique().tolist())

# Dictionary used to map the floor codes to the values used in the submission file.
floor_map = {"B2":-2, "B1":-1, "F1":0, "F2": 1, "F3":2, "F4":3, "F5":4, "F6":5, "F7":6,"F8":7, "F9":8,
             "1F":0, "2F":1, "3F":2, "4F":3, "5F":4, "6F":5, "7F":6, "8F": 7, "9F":8}

In [None]:
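# # Collect, per building, the wifi bssids whose occurrence count reaches a minimum threshold.
# # The threshold can be experimented with; the code below currently uses `>= 0`, i.e. it keeps every bssid.
# # These will be the only ones used when constructing features.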
# bssid = dict()

# for building in used_buildings:
# #     break
#     folders = sorted(glob.glob(os.path.join(base_path,'train/'+building+'/*')))
#     print(building)
#     wifi = list()
#     for folder in folders:
#         floor = floor_map[folder.split('/')[-1]]
#         files = glob.glob(os.path.join(folder, "*.txt"))
#         for file in files:
#             with open(file) as f:
#                 txt = f.readlines()
#                 for e, line in enumerate(txt):
#                     tmp = line.strip().split()
#                     if tmp[1] == "TYPE_WIFI":
#                         wifi.append(tmp)
#     df = pd.DataFrame(wifi)
#     #top_bssid = df[3].value_counts().iloc[:500].index.tolist()
#     value_counts = df[3].value_counts()
#     top_bssid = value_counts[value_counts >= 0].index.tolist()
#     print(len(top_bssid))
#     bssid[building] = top_bssid
#     del df
#     del wifi
#     gc.collect()

In [3]:
# The cache file was written once from the per-building bssid dictionary:
# with open("bssid.json", "w") as f:
#     json.dump(bssid, f)

# Reload the cached building -> bssid-list mapping from disk.
with open("bssid.json") as bssid_file:
    bssid = json.load(bssid_file)

In [4]:
import re
def multi_line_spliter(s):
    """Split a string that holds several "TYPE_" records into one string each.

    The first piece always starts at index 0. Every later piece starts 14
    characters before its "TYPE_" marker (presumably a 13-character timestamp
    plus a tab -- confirm against the data format).

    Args:
        s: the text to split.

    Returns:
        A list of consecutive slices of ``s``; ``[s]`` when ``s`` contains at
        most one "TYPE_" marker.
    """
    marker_starts = [found.start() for found in re.finditer("TYPE_", s)]

    # Cut points: start of the string, the start of each record after the
    # first one, and the end of the string.
    cuts = [0]
    cuts.extend(position - 14 for position in marker_starts[1:])
    cuts.append(len(s))

    return [s[begin:end] for begin, end in zip(cuts, cuts[1:])]
    
    
def load_df(file):
    """Read a text file and return its lines.

    Lines are returned exactly as stored, including their line endings. A line
    that holds several concatenated "TYPE_" records is NOT broken apart here;
    an earlier, disabled variant of this function passed such lines through
    multi_line_spliter before returning them.

    Args:
        file: path of the text file to read.

    Returns:
        list[str]: the lines of the file.
    """
    # Explicit UTF-8 keeps the result independent of the platform's default
    # encoding and matches read_data_file, which also opens its input as UTF-8.
    with open(file, encoding='utf-8') as f:
        return f.readlines()

In [5]:
from dataclasses import dataclass

import numpy as np


@dataclass
class ReadData:
    """Container for the per-record-type arrays returned by read_data_file.

    Every field is the result of ``np.array()`` over the rows collected for one
    record type, so a type that never occurs in the file yields an empty array.
    """
    # TYPE_ACCELEROMETER rows: [timestamp, value, value, value]
    acce: np.ndarray
    # TYPE_ACCELEROMETER_UNCALIBRATED rows, same layout as acce
    acce_uncali: np.ndarray
    # TYPE_GYROSCOPE rows, same layout as acce
    gyro: np.ndarray
    # TYPE_GYROSCOPE_UNCALIBRATED rows, same layout as acce
    gyro_uncali: np.ndarray
    # TYPE_MAGNETIC_FIELD rows, same layout as acce
    magn: np.ndarray
    # TYPE_MAGNETIC_FIELD_UNCALIBRATED rows, same layout as acce
    magn_uncali: np.ndarray
    # TYPE_ROTATION_VECTOR rows, same layout as acce
    ahrs: np.ndarray
    # TYPE_WIFI rows kept as strings: [timestamp, ssid, bssid, rssi, last-seen timestamp]
    wifi: np.ndarray
    # TYPE_BEACON rows kept as strings: [timestamp, "uuid_major_minor", rssi]
    ibeacon: np.ndarray
    # TYPE_WAYPOINT rows: [timestamp, x, y]
    waypoint: np.ndarray


def read_data_file(data_filename):
    """Parse one raw path text file into per-record-type numpy arrays.

    Blank lines and lines starting with ``#`` are skipped. Every other line is
    tab-separated, with the timestamp in field 0 and the record type in field 1.

    A physical line may hold several records glued together without a line
    break. Such a line is cut in front of every embedded
    "<13 digits><tab>TYPE_" sequence so that each record is parsed on its own;
    without the cut the last value of the first record would absorb the next
    timestamp and the following record(s) would be lost.

    Args:
        data_filename: path of the UTF-8 encoded text file to parse.

    Returns:
        ReadData: one array per record type. The seven three-value sensor
        types and the waypoints are numeric rows; wifi and beacon rows are
        kept as strings. A type with no records yields an empty array.

    Raises:
        IndexError: if a record has fewer fields than its type requires.
        ValueError: if a numeric field cannot be converted.
    """
    # Start of a record: 13-digit timestamp, a tab, then the "TYPE_" tag.
    record_start = re.compile(r'\d{13}\tTYPE_')

    # Record types that share the layout [timestamp, type, value, value, value].
    triple_readings = {
        'TYPE_ACCELEROMETER': [],
        'TYPE_ACCELEROMETER_UNCALIBRATED': [],
        'TYPE_GYROSCOPE': [],
        'TYPE_GYROSCOPE_UNCALIBRATED': [],
        'TYPE_MAGNETIC_FIELD': [],
        'TYPE_MAGNETIC_FIELD_UNCALIBRATED': [],
        'TYPE_ROTATION_VECTOR': [],
    }
    wifi = []
    ibeacon = []
    waypoint = []

    with open(data_filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for raw_line in lines:
        raw_line = raw_line.strip()
        if not raw_line or raw_line[0] == '#':
            continue

        # Cut positions of records glued onto this line; a match at index 0 is
        # simply the regular start of the line, not a cut.
        cuts = [found.start() for found in record_start.finditer(raw_line) if found.start() > 0]
        bounds = [0] + cuts + [len(raw_line)]

        for begin, end in zip(bounds, bounds[1:]):
            line_data = raw_line[begin:end].split('\t')
            record_type = line_data[1]

            if record_type in triple_readings:
                triple_readings[record_type].append(
                    [int(line_data[0]), float(line_data[2]), float(line_data[3]), float(line_data[4])])
            elif record_type == 'TYPE_WIFI':
                # Local names deliberately differ from the module-level
                # `bssid` dictionary to avoid shadowing it.
                sys_ts = line_data[0]
                wifi_ssid = line_data[2]
                wifi_bssid = line_data[3]
                wifi_rssi = line_data[4]
                lastseen_ts = line_data[6]
                wifi.append([sys_ts, wifi_ssid, wifi_bssid, wifi_rssi, lastseen_ts])
            elif record_type == 'TYPE_BEACON':
                beacon_ts = line_data[0]
                uuid = line_data[2]
                major = line_data[3]
                minor = line_data[4]
                beacon_rssi = line_data[6]
                ibeacon.append([beacon_ts, '_'.join([uuid, major, minor]), beacon_rssi])
            elif record_type == 'TYPE_WAYPOINT':
                waypoint.append([int(line_data[0]), float(line_data[2]), float(line_data[3])])
            # Any other record type is ignored.

    return ReadData(
        np.array(triple_readings['TYPE_ACCELEROMETER']),
        np.array(triple_readings['TYPE_ACCELEROMETER_UNCALIBRATED']),
        np.array(triple_readings['TYPE_GYROSCOPE']),
        np.array(triple_readings['TYPE_GYROSCOPE_UNCALIBRATED']),
        np.array(triple_readings['TYPE_MAGNETIC_FIELD']),
        np.array(triple_readings['TYPE_MAGNETIC_FIELD_UNCALIBRATED']),
        np.array(triple_readings['TYPE_ROTATION_VECTOR']),
        np.array(wifi),
        np.array(ibeacon),
        np.array(waypoint),
    )


In [6]:
# Generate all the training data: for every building that appears in the test
# set, gather the wifi records and waypoints of all its training paths and
# write them to two CSV files.
# used_buildings[:1]  # slice the loop like this for a quick single-building run
output_dir = '../input/data_abstract/'
# to_csv does not create missing folders; create the target up front so a
# missing folder is not discovered only after a whole building was parsed.
os.makedirs(output_dir, exist_ok=True)

wifi_columns = ['ts_wifi', 'ssid', 'bssid', 'rssi', 'ts_wifi_ls']
waypoint_columns = ['ts_waypoint', 'x', 'y']

for building in used_buildings:
    print(building)
    wifi_frames = []
    waypoint_frames = []
    floor_folders = sorted(glob.glob(os.path.join(base_path, 'train', building + '/*')))
    for folder in floor_folders:
        # os.path.basename works regardless of the path separator glob returns.
        floor_ori = os.path.basename(folder)
        floor = floor_map[floor_ori]
        print(floor)
        for file in glob.glob(os.path.join(folder, "*.txt")):
            path_id = os.path.basename(file).split('.')[0]
            data = read_data_file(file)
            # Wifi and waypoint rows get the same bookkeeping columns.
            for records, columns, frames in (
                (data.wifi, wifi_columns, wifi_frames),
                (data.waypoint, waypoint_columns, waypoint_frames),
            ):
                if len(records) == 0:
                    continue
                frame = pd.DataFrame(records, columns=columns)
                frame['path'] = path_id
                # Files are globbed from <base_path>/train/<building>/<floor>/,
                # so the site of every file is the current building.
                frame['site'] = building
                frame['floor'] = floor
                frame['floor_ori'] = floor_ori
                frames.append(frame)
    building_df_wifi = pd.concat(wifi_frames).reset_index(drop=True)
    building_df_waypoint = pd.concat(waypoint_frames).reset_index(drop=True)
    building_df_wifi.to_csv(os.path.join(output_dir, building + "_train_wifi.csv"))
    building_df_waypoint.to_csv(os.path.join(output_dir, building + "_train_waypoint.csv"))
    
    

5a0546857ecc773753327266
-1
0
1


KeyboardInterrupt: 

In [9]:
# Preview the first two rows of the split submission keys (columns 0, 1 and 2).
ssubm_df.head(2)

Unnamed: 0,0,1,2
0,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9
1,5a0546857ecc773753327266,046cfa46be49fc10834815c6,9017


In [10]:
# Export the wifi records of every test path, one CSV file per building.
ssubm_building_g = ssubm_df.groupby(0)
# Not filled by this cell; kept because other cells may still reference it.
feature_dict = dict()

output_dir = '../input/data_abstract/'
# to_csv does not create missing folders; create the target before parsing.
os.makedirs(output_dir, exist_ok=True)

# gid0 is the site id (column 0 of ssubm_df), gid the path id (column 1).
for gid0, g0 in ssubm_building_g:
    print(gid0)
    test_wifi_frames = []
    for gid, g in g0.groupby(1):
        # Each test path is stored as <base_path>/test/<path id>.txt
        data = read_data_file(os.path.join(base_path, 'test', gid + '.txt'))
        if len(data.wifi) > 0:
            wifi_data = pd.DataFrame(
                data.wifi, columns=['ts_wifi', 'ssid', 'bssid', 'rssi', 'ts_wifi_ls'])
            wifi_data['path'] = gid
            wifi_data['site'] = gid0
            test_wifi_frames.append(wifi_data)
    building_df_wifi = pd.concat(test_wifi_frames).reset_index(drop=True)
    building_df_wifi.to_csv(os.path.join(output_dir, gid0 + "_test_wifi.csv"))
    

5a0546857ecc773753327266
5c3c44b80379370013e0fd2b
5d27075f03f801723c2e360f
5d27096c03f801723c31e5e0
5d27097f03f801723c320d97
5d27099f03f801723c32511d
5d2709a003f801723c3251bf
5d2709b303f801723c327472
5d2709bb03f801723c32852c
5d2709c303f801723c3299ee
5d2709d403f801723c32bd39
5d2709e003f801723c32d896
5da138274db8ce0c98bbd3d2
5da1382d4db8ce0c98bbe92e
5da138314db8ce0c98bbf3a0
5da138364db8ce0c98bc00f1
5da1383b4db8ce0c98bc11ab
5da138754db8ce0c98bca82f
5da138764db8ce0c98bcaa46
5da1389e4db8ce0c98bd0547
5da138b74db8ce0c98bd4774
5da958dd46f8266d0737457b
5dbc1d84c1eb61796cf7c010
5dc8cea7659e181adb076a3f


In [45]:
# Display the wifi frame currently bound to building_df_wifi.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above, so what is shown depends on which export cell ran last -- confirm
# after a fresh Restart & Run All.
building_df_wifi

Unnamed: 0,ts_wifi,ssid,bssid,rssi,ts_wifi_ls,path,site
0,0000000002340,da39a3ee5e6b4b0d3255bfef95601890afd80709,eebf5db207eec2f3e041f92153d789270f346821,-45,1578474544726,046cfa46be49fc10834815c6,5a0546857ecc773753327266
1,0000000002340,b9f0208be00bd8b337be7f12e02e3a3ce846e22b,7805f319f3f591986effe78c5b41143180278f2d,-46,1578474565732,046cfa46be49fc10834815c6,5a0546857ecc773753327266
2,0000000002340,ab150ecf6d972b476aeab16317bed6189d9f7cce,323607d8444900d64151ee06d164738ac727bbce,-46,1578474564279,046cfa46be49fc10834815c6,5a0546857ecc773753327266
3,0000000002340,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,b26914599f6d9ba16b43975394e1eeb9d82f4bab,-47,1578474565725,046cfa46be49fc10834815c6,5a0546857ecc773753327266
4,0000000002340,da39a3ee5e6b4b0d3255bfef95601890afd80709,02a1be3a5dab38320f879489d8a1e0f2a72768b3,-47,1578474547962,046cfa46be49fc10834815c6,5a0546857ecc773753327266
...,...,...,...,...,...,...,...
338901,0000000067545,b6ffe5619e02871fcd04f61c9bb4b5c53a3f46b7,f2fd7c8b3ae74a54ebcd5498b81b513b7c5e564a,-90,1578465380606,ffcd9524c80c0fa5bb859eaf,5a0546857ecc773753327266
338902,0000000067545,b9f0208be00bd8b337be7f12e02e3a3ce846e22b,94887049b5d6072ffd22a5e7de70523931861c2b,-91,1578465380654,ffcd9524c80c0fa5bb859eaf,5a0546857ecc773753327266
338903,0000000067545,b7e6027447eb1f81327d66cfd3adbe557aabf26c,e9f5c01efe9058d460ed3830b2a23b729dea930a,-92,1578465380607,ffcd9524c80c0fa5bb859eaf,5a0546857ecc773753327266
338904,0000000067545,02eb66d35bce69814f108c2f876e600a78ace137,0f5daed11a61e0d6941a1a42ff428ca216d61003,-93,1578465370203,ffcd9524c80c0fa5bb859eaf,5a0546857ecc773753327266
