# Mobility data converter

## Import modules

In [1]:
import sys
sys.path.insert(1, '../src/utils/')
from my_utils import *

import pandas as pd
import numpy as np

In [2]:
# A faltar: time_index, ids das rsus

## Introduction

The goal is to convert data into the following format:

In [3]:
df_example = pd.read_csv('../data/processed/example_5nodes.csv')
df_example.head(2)

Unnamed: 0,timestamp,node_id,latitude,longitude,neighbours,rsu
0,1423807260,101,40.641606,-8.652702,[{'node_id': '102'| 'rssi': '46'| 'rsu': 0}],1
1,1423807260,102,40.641211,-8.653796,[{'node_id': '101'| 'rssi': '46'| 'rsu': 1}|{'...,0


## Variables

In [4]:
# columns from the original dataframe
cols = ['time_index', 'entity_id', 'location', 'speed', 'heading', 'altitude', 'rssi', 'obu_id',
        'road', 'class', 'test', 'receiverid', 'receivertype', 'stationid', 'stationtype',
        'semimajorconf', 'semiminorconf', 'semimajororient', 'altitudeconf', 'headingconf',
        'speedconf', 'vehiclerole', 'drivedirection', 'length', 'width', 'acceleration',
        'accelerationconf', 'curvature', 'curvatureconf', 'yawrate', 'yawrateconf', 'brakepedal',
        'gaspedal', 'emergencybrake', 'collisionwarning', 'accengaged', 'cruisecontrol',
        'speedlimiter', 'specialvehicle']

# selected features
features = ['time_index', 'entity_id', 'location', 'rssi', 'stationid']

In [5]:
# selected id's
rsu_ids = ['p3', 'p5', 'p6', 'p26', 'p19']
obu_ids = [50, 51, 52, 60, 86, 87, 89, 90, 97, 99]

In [6]:
rsu_locations = {'p3':(40.64074, -8.65705), 'p5':(40.64088, -8.65397), 'p6':(40.64161, -8.652827), 'p26':(40.63848, -8.65147), 'p19':(40.64339, -8.65847)}
rsu_locations

{'p3': (40.64074, -8.65705),
 'p5': (40.64088, -8.65397),
 'p6': (40.64161, -8.652827),
 'p26': (40.63848, -8.65147),
 'p19': (40.64339, -8.65847)}

In [7]:
# converts short id's to long id's
rsu_ids = build_posts_ids(rsu_ids)
rsu_ids

['urn:ngsi-ld:Values:aveiro_cam:p3',
 'urn:ngsi-ld:Values:aveiro_cam:p5',
 'urn:ngsi-ld:Values:aveiro_cam:p6',
 'urn:ngsi-ld:Values:aveiro_cam:p26',
 'urn:ngsi-ld:Values:aveiro_cam:p19']

In [8]:
# selected time period
data_init = '2022-03-07 09:25:00.00+00'
data_end = '2022-03-07 09:30:00.00+00'

# 9:25 - 9h30
# 8:35 - 8h40

In [9]:
# file containing the raw data
filename = '../data/raw/cams.part.01.csv'
# file for the processed data
filename_processed = '../data/processed/data_9:25_9:30.csv'

In [10]:
# radius (meters) for two entities being considered neighbours
radius = 50
# time for resample
time = '5s'

## Loading data

In [11]:
df = pd.read_csv(filename, names=cols) # read file

In [12]:
df['rssi'].describe([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.98])

count    1000000.000000
mean      -17322.096087
std        28832.587239
min       -65535.000000
10%       -65535.000000
20%       -65535.000000
30%          -88.000000
40%          -86.000000
50%          -84.000000
60%          -82.000000
70%          -79.000000
80%          -74.000000
90%          -67.000000
98%          -63.000000
max            0.000000
Name: rssi, dtype: float64

## Converting data

In [13]:
df = df[features] # select columns
df = df.loc[df['entity_id'].isin(rsu_ids)] # select only relevant rsu's
df = df.loc[df['stationid'].isin(obu_ids)] # select only relevant obu's
df = df.loc[(df['time_index']>=data_init) & (df['time_index']<data_end)] # include data_init and exclude data_end
df = df.loc[(df['rssi']!=0)] # filter by rssi values
df = df.sort_values(by='time_index')
df['time_index'] = pd.to_datetime(df['time_index'])
df["time_index"] = df["time_index"].dt.round(time) # round time_index columns
df = df.reset_index(drop=True) # reset index

df.head(2)

Unnamed: 0,time_index,entity_id,location,rssi,stationid
0,2022-03-07 09:25:00+00:00,urn:ngsi-ld:Values:aveiro_cam:p6,0101000000E1BB28D5994E21C0EB234B4112524440,-81,60
1,2022-03-07 09:25:00+00:00,urn:ngsi-ld:Values:aveiro_cam:p5,0101000000E1BB28D5994E21C0EB234B4112524440,-82,60


In [14]:
df = df.groupby(['time_index', 'entity_id', 'stationid'], as_index=False).agg({'location':'first', 'rssi':'mean'})
df

Unnamed: 0,time_index,entity_id,stationid,location,rssi
0,2022-03-07 09:25:00+00:00,urn:ngsi-ld:Values:aveiro_cam:p5,60,0101000000E1BB28D5994E21C0EB234B4112524440,-82.666667
1,2022-03-07 09:25:00+00:00,urn:ngsi-ld:Values:aveiro_cam:p6,60,0101000000E1BB28D5994E21C0EB234B4112524440,-80.333333
2,2022-03-07 09:25:05+00:00,urn:ngsi-ld:Values:aveiro_cam:p5,60,0101000000EE10B5238B4E21C080B50F1E12524440,-84.75
3,2022-03-07 09:25:05+00:00,urn:ngsi-ld:Values:aveiro_cam:p6,60,0101000000EE10B5238B4E21C080B50F1E12524440,-76.166667
4,2022-03-07 09:25:10+00:00,urn:ngsi-ld:Values:aveiro_cam:p6,60,0101000000DB954C04604E21C0F6662F3608524440,-77.0
5,2022-03-07 09:25:15+00:00,urn:ngsi-ld:Values:aveiro_cam:p6,60,0101000000BF709D352E4E21C07D9175DD00524440,-79.0
6,2022-03-07 09:25:25+00:00,urn:ngsi-ld:Values:aveiro_cam:p5,60,01010000007AC2B755B54D21C0B14DCF60F8514440,-86.0
7,2022-03-07 09:25:40+00:00,urn:ngsi-ld:Values:aveiro_cam:p26,60,0101000000D739611D224D21C00816D1C0D9514440,-85.666667
8,2022-03-07 09:25:45+00:00,urn:ngsi-ld:Values:aveiro_cam:p26,60,0101000000800640811C4D21C00D26B49BCF514440,-77.8
9,2022-03-07 09:25:50+00:00,urn:ngsi-ld:Values:aveiro_cam:p26,60,0101000000EF512404504D21C0652EBA06C7514440,-76.0


In [15]:
df['gps'] = df['location'].map(convert_wkb_to_lat_lon) # convert location to gps
for n, col in enumerate(['latitude', 'longitude']): # extract lat, lon from gps
    df[col] = df['gps'].apply(lambda location: location[n])
df.head(2)

Unnamed: 0,time_index,entity_id,stationid,location,rssi,gps,latitude,longitude
0,2022-03-07 09:25:00+00:00,urn:ngsi-ld:Values:aveiro_cam:p5,60,0101000000E1BB28D5994E21C0EB234B4112524440,-82.666667,"(40.6411821, -8.6535174)",40.641182,-8.653517
1,2022-03-07 09:25:00+00:00,urn:ngsi-ld:Values:aveiro_cam:p6,60,0101000000E1BB28D5994E21C0EB234B4112524440,-80.333333,"(40.6411821, -8.6535174)",40.641182,-8.653517


In [16]:
df = df.drop(columns=['location'])
df.head(2)

Unnamed: 0,time_index,entity_id,stationid,rssi,gps,latitude,longitude
0,2022-03-07 09:25:00+00:00,urn:ngsi-ld:Values:aveiro_cam:p5,60,-82.666667,"(40.6411821, -8.6535174)",40.641182,-8.653517
1,2022-03-07 09:25:00+00:00,urn:ngsi-ld:Values:aveiro_cam:p6,60,-80.333333,"(40.6411821, -8.6535174)",40.641182,-8.653517


In [17]:
# add rsu location
df['gps_rsu'] = df['entity_id'].str.split(':', expand=True)[4].apply(lambda pid: rsu_locations.get(pid))
df['entity_id'] = df['entity_id'].str.split(':', expand=True)[4].str[1:].astype(int) + 100
df['stationid'] = df['stationid'] + 100
for n, col in enumerate(['latitude_rsu', 'longitude_rsu']): # extract lat, lon from gps
    df[col] = df['gps_rsu'].apply(lambda location: location[n])
df

Unnamed: 0,time_index,entity_id,stationid,rssi,gps,latitude,longitude,gps_rsu,latitude_rsu,longitude_rsu
0,2022-03-07 09:25:00+00:00,105,160,-82.666667,"(40.6411821, -8.6535174)",40.641182,-8.653517,"(40.64088, -8.65397)",40.64088,-8.65397
1,2022-03-07 09:25:00+00:00,106,160,-80.333333,"(40.6411821, -8.6535174)",40.641182,-8.653517,"(40.64161, -8.652827)",40.64161,-8.652827
2,2022-03-07 09:25:05+00:00,105,160,-84.75,"(40.6411779, -8.6534053)",40.641178,-8.653405,"(40.64088, -8.65397)",40.64088,-8.65397
3,2022-03-07 09:25:05+00:00,106,160,-76.166667,"(40.6411779, -8.6534053)",40.641178,-8.653405,"(40.64161, -8.652827)",40.64161,-8.652827
4,2022-03-07 09:25:10+00:00,106,160,-77.0,"(40.6408756, -8.6530763)",40.640876,-8.653076,"(40.64161, -8.652827)",40.64161,-8.652827
5,2022-03-07 09:25:15+00:00,106,160,-79.0,"(40.6406514, -8.6526963)",40.640651,-8.652696,"(40.64161, -8.652827)",40.64161,-8.652827
6,2022-03-07 09:25:25+00:00,105,160,-86.0,"(40.6403924, -8.6517741)",40.640392,-8.651774,"(40.64088, -8.65397)",40.64088,-8.65397
7,2022-03-07 09:25:40+00:00,126,160,-85.666667,"(40.6394578, -8.6506509)",40.639458,-8.650651,"(40.63848, -8.65147)",40.63848,-8.65147
8,2022-03-07 09:25:45+00:00,126,160,-77.8,"(40.6391482, -8.6506081)",40.639148,-8.650608,"(40.63848, -8.65147)",40.63848,-8.65147
9,2022-03-07 09:25:50+00:00,126,160,-76.0,"(40.6388863, -8.6510011)",40.638886,-8.651001,"(40.63848, -8.65147)",40.63848,-8.65147


In [18]:
def rsu_to_node_id(df):
    _df = df.copy()
    _df = _df[['time_index', 'entity_id', 'latitude_rsu', 'longitude_rsu']]
    _df = _df.rename(columns={'entity_id':'node_id', 'latitude_rsu': 'latitude', 'longitude_rsu': 'longitude'})
    _df['neighbours'] = df.apply(lambda x : [(x.stationid, x.rssi, 0)], axis=1)
    _df['rsu'] = [1]*len(_df.index)
    _df = _df.groupby(['time_index', 'node_id', 'latitude', 'longitude'], as_index=False).agg({'neighbours': 'sum', 'rsu': 'first'})
    # 'neighbours': 'sum' -> sum of lists
    return _df

In [19]:
df_rsus = rsu_to_node_id(df)
df_rsus.head()

Unnamed: 0,time_index,node_id,latitude,longitude,neighbours,rsu
0,2022-03-07 09:25:00+00:00,105,40.64088,-8.65397,"[(160, -82.66666666666667, 0)]",1
1,2022-03-07 09:25:00+00:00,106,40.64161,-8.652827,"[(160, -80.33333333333333, 0)]",1
2,2022-03-07 09:25:05+00:00,105,40.64088,-8.65397,"[(160, -84.75, 0)]",1
3,2022-03-07 09:25:05+00:00,106,40.64161,-8.652827,"[(160, -76.16666666666667, 0)]",1
4,2022-03-07 09:25:10+00:00,106,40.64161,-8.652827,"[(160, -77.0, 0)]",1


In [20]:
def obu_to_node_id(df):
    _df = df.copy()
    _df = _df[['time_index', 'stationid', 'latitude', 'longitude']]
    _df = _df.rename(columns={'stationid':'node_id'})
    _df['neighbours'] = df.apply(lambda x : [(x.entity_id, x.rssi, 1)], axis=1)
    _df['rsu'] = [0]*len(_df.index)
    _df = _df.groupby(['time_index', 'node_id', 'latitude', 'longitude'], as_index=False).agg({'neighbours': 'sum', 'rsu': 'first'})
    # 'neighbours': 'sum' -> sum of lists
    return _df

In [21]:
df_obus = obu_to_node_id(df)
df_obus.head()

Unnamed: 0,time_index,node_id,latitude,longitude,neighbours,rsu
0,2022-03-07 09:25:00+00:00,160,40.641182,-8.653517,"[(105, -82.66666666666667, 1), (106, -80.33333...",0
1,2022-03-07 09:25:05+00:00,160,40.641178,-8.653405,"[(105, -84.75, 1), (106, -76.16666666666667, 1)]",0
2,2022-03-07 09:25:10+00:00,160,40.640876,-8.653076,"[(106, -77.0, 1)]",0
3,2022-03-07 09:25:15+00:00,160,40.640651,-8.652696,"[(106, -79.0, 1)]",0
4,2022-03-07 09:25:25+00:00,160,40.640392,-8.651774,"[(105, -86.0, 1)]",0


In [22]:
# Concatenate df_rsus with df_obus
df_nodes = pd.concat([df_rsus, df_obus])
df_nodes = df_nodes.sort_values(by='time_index')
df_nodes.head()

Unnamed: 0,time_index,node_id,latitude,longitude,neighbours,rsu
0,2022-03-07 09:25:00+00:00,105,40.64088,-8.65397,"[(160, -82.66666666666667, 0)]",1
1,2022-03-07 09:25:00+00:00,106,40.64161,-8.652827,"[(160, -80.33333333333333, 0)]",1
0,2022-03-07 09:25:00+00:00,160,40.641182,-8.653517,"[(105, -82.66666666666667, 1), (106, -80.33333...",0
2,2022-03-07 09:25:05+00:00,105,40.64088,-8.65397,"[(160, -84.75, 0)]",1
3,2022-03-07 09:25:05+00:00,106,40.64161,-8.652827,"[(160, -76.16666666666667, 0)]",1


In [23]:
def list_to_neighbour_format(l):
    txt = "["
    for i in range(len(l)):
        t = l[i]
        txt += "{'node_id': '" + str(t[0]) + "'| 'rssi': '" + str(round(abs(t[1]))) + "'| 'rsu': " + str(t[2]) + "}"
        if i<len(l)-1:
            txt += ","
        
    txt += "]"
    return txt

In [24]:
df_nodes['neighbours'] = df_nodes['neighbours'].apply(lambda x :list_to_neighbour_format(x))
df_nodes.head()

Unnamed: 0,time_index,node_id,latitude,longitude,neighbours,rsu
0,2022-03-07 09:25:00+00:00,105,40.64088,-8.65397,[{'node_id': '160'| 'rssi': '83'| 'rsu': 0}],1
1,2022-03-07 09:25:00+00:00,106,40.64161,-8.652827,[{'node_id': '160'| 'rssi': '80'| 'rsu': 0}],1
0,2022-03-07 09:25:00+00:00,160,40.641182,-8.653517,"[{'node_id': '105'| 'rssi': '83'| 'rsu': 1},{'...",0
2,2022-03-07 09:25:05+00:00,105,40.64088,-8.65397,[{'node_id': '160'| 'rssi': '85'| 'rsu': 0}],1
3,2022-03-07 09:25:05+00:00,106,40.64161,-8.652827,[{'node_id': '160'| 'rssi': '76'| 'rsu': 0}],1


In [25]:
df_nodes['timestamp'] = df_nodes['time_index'].astype(np.int64) // 10 ** 9
df_nodes = df_nodes[['timestamp', 'node_id', 'latitude', 'longitude', 'neighbours', 'rsu']]
df_nodes.head()

Unnamed: 0,timestamp,node_id,latitude,longitude,neighbours,rsu
0,1646645100,105,40.64088,-8.65397,[{'node_id': '160'| 'rssi': '83'| 'rsu': 0}],1
1,1646645100,106,40.64161,-8.652827,[{'node_id': '160'| 'rssi': '80'| 'rsu': 0}],1
0,1646645100,160,40.641182,-8.653517,"[{'node_id': '105'| 'rssi': '83'| 'rsu': 1},{'...",0
2,1646645105,105,40.64088,-8.65397,[{'node_id': '160'| 'rssi': '85'| 'rsu': 0}],1
3,1646645105,106,40.64161,-8.652827,[{'node_id': '160'| 'rssi': '76'| 'rsu': 0}],1


In [26]:
df_nodes.node_id.unique()

array([105, 106, 160, 126, 152, 197, 199])

## Saving data

In [27]:
df_nodes.to_csv(filename_processed)