In [147]:
import os
import pandas as pd

In [148]:
# INSERT DIRECTORY HERE (CHANGE THIS)
TRAIN_FOLDER = './NG_NEW/Data/training/'
TEST_FOLDER = './NG_NEW/Data/validation/'

In [149]:
# Directories for Train
f_train = [x[0] for x in os.walk(TRAIN_FOLDER)][1:]
print('Train Folders:', f_train)
print()

# Directories for Test
f_test = [x[0] for x in os.walk(TEST_FOLDER)][1:]
print('Test Folders:', f_test)

Train Folders: ['./NG_NEW/Data/training/NGO--2021-08-25-14-54-34--LDL2-2', './NG_NEW/Data/training/NGO--2021-08-25-14-55-13_BElvl2-1', './NG_NEW/Data/training/NGO--2021-08-25-15-09-10--BElvl2-2', './NG_NEW/Data/training/NGO--2021-08-25-15-12-06--L2-JX-R2-Half', './NG_NEW/Data/training/NGO--2021-08-25-15-18-40_LDlvl2C-2', './NG_NEW/Data/training/NGO--2021-08-25-15-24-31_L2-JX-R1-Right', './NG_NEW/Data/training/NGO--2021-08-25-15-24-37_LDlvl2B-1', './NG_NEW/Data/training/NGO--2021-08-25-15-28-11_L2-JX-R2-Right', './NG_NEW/Data/training/NGO--2021-08-25-15-38-59_L2-JX-R1-D', './NG_NEW/Data/training/NGO--2021-08-25-15-49-37_L2-JX-R1-C', './NG_NEW/Data/training/NGO--2021-08-25-15-51-58_LDlvl2D-2', './NG_NEW/Data/training/NGO--2021-08-25-16-39-45_BElvl2-F', './NG_NEW/Data/training/NGO--2021-08-25-16-50-22_L2-JX-F', './NG_NEW/Data/training/NGO--2021-08-25-17-14-41_LDlvl2H2-2', './NG_NEW/Data/training/NGO--2021-08-25-17-44-09_LDlvlB1-1', './NG_NEW/Data/training/NGO--2021-08-25-17-48-57_B1-JX', 

# Wi-Fi Preprocessing
## Helper Functions

In [150]:
# Convert dataframe format from column: timestamp | mac | rssi 
# to dictionary timestamp : mac
def create_dict_timestamp_mac(timestamp, df):
    result = {}
    
    for t in timestamp:
        sub_df = df[df['timestamp'] == t]
        sub_df = sub_df.set_index('mac')
        dict_from_df = sub_df.to_dict()['rssi']
        
        # Remove space infront of mac addr
        keys = list(dict_from_df.keys())
        for k in keys:
            dict_from_df[k[1:]] = dict_from_df[k]
            del dict_from_df[k]
            
        keys = list(dict_from_df.keys())
        result[t] = [keys]
    
    return result

In [151]:
# Convert dictionary format timestamp : mac
# to dataframe column: unique_mac, row: one_hot, index: timestamp
def create_one_hot_mac_df(dt):
    df_of_keys = pd.DataFrame.from_dict(dt).T
    one_hot = pd.get_dummies(df_of_keys[0].apply(lambda x: pd.Series(1, x)) == 1)
    one_hot = one_hot * 1
    return one_hot

In [152]:
# Convert dataframe format from column: unique_mac, row: one_hot, index: timestamp
# to dataframe column: unique_mac, row: rssi, index: timestamp
def insert_rssi_to_one_hot(timestamp, df, one_hot):
    all_mac = set(one_hot.columns)
    
    for t in timestamp:
        sub_df = df[df['timestamp'] == t]
        sub_df = sub_df.set_index('mac')
        dict_from_df = sub_df.to_dict()['rssi']

        keys = list(dict_from_df.keys())
        for k in keys:
            dict_from_df[k[1:]] = dict_from_df[k]
            del dict_from_df[k]

        keys = list(dict_from_df.keys())
        for k in keys:
            one_hot.at[t, k] = dict_from_df[k]

        hot_mac = set(keys)
        cold_mac = all_mac - hot_mac
        cold_keys = list(cold_mac)

        for k in cold_keys:
            one_hot.at[t, k] = -100
            
    return one_hot

## Train Dataset Prep

For train_ap.csv:

| / | mac_1 | mac_2 | ... | mac_n |
| --- | --- | --- | --- | --- |
| 0 | rssi_1 | rssi_2 | ... | rssi_n |

- rssi = -100 means that particular AP is not detected for that timestamp

For train_main.csv:

| / | timestamp | x | y | type |
| --- | --- | --- | --- | --- |
| 0 | timestamp_1 | lat_1 | lng_1 | truth/pseudo |

In [153]:
train_ap = None
train_main = None
for i in range(len(f_train)):
    df = pd.read_csv(f_train[i] + '/wifi.csv', header=None)
    df_loc = pd.read_csv(f_train[i] + '/ground_truth.csv', header=None)
    df.columns = ['timestamp', 'mac', 'rssi']
    df_loc.columns = ['timestamp', 'latitude', 'longitude', 'type']
    timestamp = df['timestamp'].unique()
    dt = create_dict_timestamp_mac(timestamp, df)
    new_df = create_one_hot_mac_df(dt)
    new_df = insert_rssi_to_one_hot(timestamp, df, new_df)
    if("B1" in f_train[i]):
#         print("B1")
#         print(f_train[i])
        new_df["floor_id"] = -1
    elif ("lvl2" in f_train[i] or "Lvl2" in f_train[i] or "L2" in f_train[i]): 
#         print("L2")
#         print(f_train[i])
        new_df["floor_id"] = 2
    elif ("1" in f_train[i]):
#         print("L1")
#         print(f_train[i])   
        new_df["floor_id"] = 1

    if (i == 0):
        train_ap = new_df
        train_main = df_loc

    else: 
        train_ap = pd.concat([train_ap, new_df], axis=0)
        train_main = pd.concat([train_main, df_loc], axis=0)
train_ap = train_ap.fillna(-110)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




## Test Dataset Prep

In [155]:
test_ap = None
test_main = None

for i in range(len(f_test)):
    df = pd.read_csv(f_test[i] + '/wifi.csv', header=None)
    df_loc = pd.read_csv(f_test[i] + '/ground_truth.csv', header=None)
    df.columns = ['timestamp', 'mac', 'rssi']
    df_loc.columns = ['timestamp', 'latitude', 'longitude', 'type']
    timestamp = df['timestamp'].unique()
    dt = create_dict_timestamp_mac(timestamp, df)
    new_df = create_one_hot_mac_df(dt)
    new_df = insert_rssi_to_one_hot(timestamp, df, new_df)
    if("B1" in f_test[i]):
        new_df["floor_id"] = -1
    elif ("lvl2" in f_train[i] or "Lvl2" in f_train[i] or "L2" in f_train[i]): 
        new_df["floor_id"] = 2
    elif ("1" in f_test[i]):
        new_df["floor_id"] = 1
    if (i == 0):
        test_ap = new_df
        test_main = df_loc
    else: 
        test_ap = pd.concat([test_ap, new_df], axis=0)
        test_main = pd.concat([test_main, df_loc], axis=0)
    
test_ap = test_ap.fillna(-110)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [156]:
# Compare AP List for train & test
test_column_set = set(test_ap.columns)
train_column_set = set(train_ap.columns)
diff_column_set = test_column_set.symmetric_difference(train_column_set)
to_add_to_test = list(diff_column_set - test_column_set)
to_remove_from_test = list(diff_column_set - train_column_set)

In [157]:
# Add APs that is Present in train but Absent in test
for m in to_add_to_test:
    test_ap.insert(0, m, [-100] * len(test_ap.index), allow_duplicates=False)
    
# Drop APs that is Present in test but Absent in train
test_ap = test_ap.drop(to_remove_from_test, axis=1)

# Ensure that the Column Order of train and test in the Same
test_ap = test_ap[train_ap.columns] 

In [158]:
test_combined = pd.concat([test_ap,test_main], axis=0,ignore_index=True, join="outer")


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


## WAP tagging + rename column header 


In [159]:
test_mac = list(test_ap.columns.values.tolist())
train_mac = list(train_ap.columns.values.tolist())
test_mac.pop()
train_mac.pop()


ap_number = []
for i in range(len(test_mac)):
    ap_number.append(f"WAP{i}")


d = {"AP":ap_number, "MAC": train_mac}

ap_list = pd.DataFrame(data=d)
print(ap_list)

ap_list.to_csv(index=False, path_or_buf="./combined/AP_index.csv")

         AP                MAC
0      WAP0  00:1e:42:27:86:00
1      WAP1  00:1e:42:2d:25:66
2      WAP2  00:1e:42:2f:d8:64
3      WAP3  00:1e:42:34:2c:8d
4      WAP4  00:1e:42:35:b2:fa
..      ...                ...
842  WAP842  f0:41:c8:e0:43:a6
843  WAP843  f0:9f:c2:3a:f3:55
844  WAP844  f8:d0:27:78:ab:20
845  WAP845  fa:1d:10:cc:2f:dc
846  WAP846  fa:d0:27:68:51:9c

[847 rows x 2 columns]


In [162]:
dict1 = dict(zip(test_mac,ap_number))
test_combined = test_combined.rename(columns=dict1)
train_combined= train_combined.rename(columns=dict1)

In [163]:
test_combined.to_csv(index=False, path_or_buf='./combined/test_combined.csv')
train_combined.to_csv(index=False, path_or_buf='./combined/train_combined.csv')