In [1]:
import configparser

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import os

In [3]:
# Load the dataset root directory from the local config file.
config_parser = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list to fail with a clear message
# instead of an opaque KeyError on the lookup below.
if not config_parser.read("conf.txt"):
    raise FileNotFoundError("conf.txt not found or unreadable in the working directory")
PATH = config_parser['PATH']['path']

In [4]:
# Derive the three dataset sub-directories from the configured root.
train_path, test_path, meta_path = (
    os.path.join(PATH, subdir) for subdir in ("train", "test", "metadata")
)

The train data is divided into multiple building parts. Each part contains several floors:

```train -> building_A -> floor_F2 -> [signal_A.txt, signal_B.txt, ...]```

The accompanying metadata is divided in a similar manner:

```metadata -> building_A -> floor_F2 -> [floor_image.png, floor_info.json, geojson_map.json]```

The test data, however, is not partitioned like that; the records are accessible directly:

```test -> [signal_A.txt, signal_B.txt, ...]```

In [5]:
# os.listdir() returns entries in arbitrary order, so sort them: any
# index-based selection from this list is then the same on every machine and run.
building_sess = sorted(os.listdir(train_path))
print(f"There are total {len(building_sess)} building sessions in the train set")

There are total 204 building sessions in the train set


In [6]:
# Choose one building session (by its position in the listing) to explore in detail.
n = 101
assert len(building_sess) > n
sample_sess_path = os.path.join(train_path, building_sess[n])

In [7]:
# Floor label -> integer level. Basements "B<k>" map to -k; the first
# above-ground floor maps to 0. Levels 1-9 accept both the "F<k>" and "<k>F"
# spellings, while level 10 is only listed as "F10".
floors_map = {
    **{f"B{depth}": -depth for depth in (3, 2, 1)},
    **{
        label: level - 1
        for level in range(1, 10)
        for label in (f"F{level}", f"{level}F")
    },
    "F10": 9,
}

In [8]:
# Column names applied to the tab-separated records: timestamp, sensor type,
# then seven generic value columns v1..v7.
col_names = ["ts", "sensor_type"] + [f"v{idx}" for idx in range(1, 8)]

In [9]:
def get_floors(building_full_path):
    """Return the sorted names of the entries (floors) inside a building directory.

    Parameters
    ----------
    building_full_path : str
        Path of the building directory whose entries are listed.

    Returns
    -------
    list of str
        Entry names in lexicographic order.
    """
    # B before F, 1 before 2
    # List the directory that was passed in rather than a notebook-level
    # global, so the function works for any building.
    return sorted(os.listdir(building_full_path))

In [10]:
def get_line_num_valid(path):
    """Count the leading lines of a file that start with '#'.

    Parameters
    ----------
    path : str
        Path of the text file to inspect.

    Returns
    -------
    int
        Number of consecutive lines at the top of the file that begin with
        '#'; 0 for an empty file or when the first line is not a comment.
    """
    line_num = 0
    # The context manager guarantees the handle is closed, including on errors.
    with open(path, "r") as file:
        for line in file:
            # Stop at the first non-comment line; later '#' lines are not counted.
            if not line.startswith("#"):
                break
            line_num += 1
    return line_num


def read_file_as_df(path):
    """Read one tab-separated record file into a DataFrame.

    The leading '#' lines are skipped, the columns are named after
    ``col_names``, and any remaining row whose ``ts`` value is exactly '#'
    is dropped.
    """
    header_lines = get_line_num_valid(path)
    ## TODO: set dtype={"col_name": int, } for the columns here, since they are mixed
    records = pd.read_csv(path, sep='\t', skiprows=header_lines, names=col_names)
    # Keep only the rows that are not '#' comment lines.
    is_record = records.ts != '#'
    return records[is_record]


Read one sample into a DataFrame

In [11]:
### TODO Try and read some sample here

In [12]:
def get_data_per_floor(site_id_path, floor):
    """Load every session file of one floor into a single DataFrame.

    Parameters
    ----------
    site_id_path : str
        Path of the site (building) directory.
    floor : str
        Name of the floor sub-directory; must be one of the names returned by
        ``get_floors(site_id_path)``.

    Returns
    -------
    pandas.DataFrame
        The rows of all session files, each tagged with a ``session_id``
        column holding the session file name. Row indices of the individual
        files are kept as-is. An empty DataFrame is returned when the floor
        directory contains no sessions.

    Raises
    ------
    Exception
        If ``floor`` is not among the floors of the site.
    """
    floors = get_floors(site_id_path)
    if floor not in floors:
        raise Exception("No such floor at this site")
    path = os.path.join(site_id_path, floor)
    session_frames = []
    for session in os.listdir(path):
        full_path = os.path.join(path, session)
        # assign() returns a new frame carrying the session id, so the
        # (filtered) frame returned by the reader is never written to in place.
        session_frames.append(read_file_as_df(full_path).assign(session_id=session))
    # pd.concat raises on an empty list; keep the "no sessions -> empty frame" result.
    if not session_frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
    # data on every iteration; concatenate once instead.
    return pd.concat(session_frames)

In [13]:
# List the floors of the sample building and load one of them (the third in
# sorted order) as a worked example.
sample_floors = get_floors(sample_sess_path)
print(f"Available floors for {building_sess[n]}: {sample_floors}")
sample_floor = sample_floors[2]
# this might take some time
sample_floor_df = get_data_per_floor(sample_sess_path, sample_floor).assign(
    floor=sample_floor
)

Available floors for 5dbc1d84c1eb61796cf7c010: ['B1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8']


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
# Preview the first three records loaded for the sample floor.
sample_floor_df.iloc[:3]

Unnamed: 0,ts,sensor_type,v1,v2,v3,v4,v5,v6,v7,session_id,floor
0,1574143344486,TYPE_WAYPOINT,158.0061,123.61419,,,,,,5dd3901e27889b0006b76adc.txt,F3
1,1574143344608,TYPE_ACCELEROMETER,-0.3776703,1.3677368,7.491226,2.0,,,,5dd3901e27889b0006b76adc.txt,F3
2,1574143344608,TYPE_MAGNETIC_FIELD,-24.937439,0.592041,-34.19037,3.0,,,,5dd3901e27889b0006b76adc.txt,F3


In [15]:
# Distinct sensor types recorded on this floor, in order of first appearance.
sample_floor_df["sensor_type"].unique()

array(['TYPE_WAYPOINT', 'TYPE_ACCELEROMETER', 'TYPE_MAGNETIC_FIELD',
       'TYPE_GYROSCOPE', 'TYPE_ROTATION_VECTOR',
       'TYPE_MAGNETIC_FIELD_UNCALIBRATED', 'TYPE_GYROSCOPE_UNCALIBRATED',
       'TYPE_ACCELEROMETER_UNCALIBRATED', 'TYPE_BEACON', 'TYPE_WIFI'],
      dtype=object)