# API QoS Estimation

In [1]:
#First we import the requested modules
import pandas as pd
from pandas.io.json import json_normalize
import json

import math
import numpy as np

import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
pio.templates.default = 'plotly_white'
pd.set_option("display.precision", 3)

import datetime
from datetime import timedelta

from pandarallel import pandarallel
from joblib import Parallel, delayed
import multiprocessing
num_cores = multiprocessing.cpu_count()
pandarallel.initialize()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
#Available colors
colors = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
]

In [3]:
# FUNCTIONS
def str_to_int(string) :
    final_val = 0
    for c in string:
        val = ord(c)
        final_val += val
    return final_val

In [4]:
f = '../Data/'

In [16]:
#Load line_stops_dict
with open(f+'Static/lines_dict.json', 'r') as file:
    lines_dict = json.load(file)

In [17]:
#lines_dict['25']['2']['stops']

## Last week's data

In [55]:
#Read week df
week_df = pd.read_csv(f+'RealTime/buses_data_week_cleaned.csv',
    dtype={
        'line': 'uint16',
        'direction': 'uint16',
        'stop': 'str',
        'bus': 'str',
        'estimateArrive': 'uint16'
    }
)[['line','direction','stop','bus','datetime','estimateArrive']]

#Parse the dates
week_df['datetime'] = pd.to_datetime(week_df['datetime'], errors = 'coerce', format='%Y-%m-%dT%H:%M:%S.%f')

In [56]:
week_df.head()

Unnamed: 0,line,direction,stop,bus,datetime,estimateArrive
0,18,1,490005584K,BF67GMY,2021-01-30 18:54:41.856658700,1095
1,18,1,490005584K,BF67GMY,2021-01-30 18:54:41.856658700,1095
2,18,1,490005584K,LJ66TSU,2021-01-30 18:54:41.856658700,1575
3,18,1,490005584K,BF67GMY,2021-01-30 18:54:41.856658700,1095
4,18,1,490005584K,BF67GLY,2021-01-30 18:54:41.856658700,591


In [57]:
week_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545178 entries, 0 to 545177
Data columns (total 6 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   line            545178 non-null  uint16        
 1   direction       545178 non-null  uint16        
 2   stop            545178 non-null  object        
 3   bus             545178 non-null  object        
 4   datetime        545178 non-null  datetime64[ns]
 5   estimateArrive  545178 non-null  uint16        
dtypes: datetime64[ns](1), object(2), uint16(3)
memory usage: 15.6+ MB


# Analysis of temporal series belonging to a bus
Analyze all the data corresponding to the different trips of a bus to a stop.
We pay attention to the last TH values of the series.

In [58]:
#Number of last ocurrences which form the series we are going to analyze for QoS
TH = 30

In [59]:
th_df = week_df.sort_values(by=['bus','stop','datetime'], ascending = True)
th_df = th_df.drop_duplicates(['bus','stop','datetime'], keep = 'last')
th_df = th_df[th_df.datetime > th_df.datetime.max() - timedelta(seconds=900)]
th_df.tail(5)

Unnamed: 0,line,direction,stop,bus,datetime,estimateArrive
537480,25,1,490013195H,SK20BDV,2021-02-06 18:47:38.795904200,171
538699,25,1,490013195H,SK20BDV,2021-02-06 18:48:38.227376600,111
539706,25,1,490013195H,SK20BDV,2021-02-06 18:49:38.818081000,32
540332,25,1,490013195H,SK20BDV,2021-02-06 18:50:09.027057100,33
540960,25,1,490013195H,SK20BDV,2021-02-06 18:50:39.926769000,3


In [60]:
th_df[th_df.line == 18].bus.unique()

array(['BF67GKE', 'BF67GKG', 'BF67GKN', 'BF67GKP', 'BF67GKU', 'BF67GKV',
       'BF67GKX', 'BF67GKY', 'BF67GKZ', 'BF67GLK', 'BF67GLY', 'BF67GLZ',
       'BF67GME', 'BF67GMO', 'BF67GMU', 'BF67GMV', 'BF67GMZ', 'BF67GNJ',
       'BF67GNK', 'BF67GNN', 'LJ15JYZ', 'LJ15JZA', 'LJ15JZC', 'LJ15JZD',
       'LJ16EWH', 'LJ16EWU', 'LJ16EWV', 'LJ16EWW', 'LJ16EWX', 'LJ16EWZ',
       'LJ66TSO', 'LJ66TSU'], dtype=object)

In [61]:
def build_time_series_graph(th_df,TH,bus_id) :

    graph = go.Figure()
    
    #TH_DF
    series_df = th_df[th_df.datetime > th_df.datetime.max() - timedelta(seconds=TH*30)]
    
    #Loc Bus Appearances
    series_df = series_df[series_df.bus == bus_id]
    
    if series_df.shape[0] < 1 :
        return graph
    line = series_df.line.iloc[0]
    direction = series_df.direction.iloc[0]
    stops_list = lines_dict[str(line)][str(direction)]['stops']
    
    #Set title and layout
    graph.update_layout(
        title='<b>Bus {} : ETA Time Series</b> - Line: {}'.format(bus_id,line),
        legend_title='<b>Destination Stop</b>',
        yaxis = dict(
            title='ETA in Seconds',
            nticks=10,
            zerolinecolor='darkgrey'
        ),
        margin=dict(r=0, l=0, t=40, b=0),
        hovermode='closest'
    )
    
    #Locate unique stops
    unique_stops = series_df.stop.unique().tolist()
    for stop in stops_list :
        if stop not in unique_stops :
            continue
        else : 
            stop_index = stops_list.index(stop)
            
        stop_df = series_df[series_df.stop == stop]
        
        
        #Build stop trace
        graph.add_trace(go.Scatter(
            name= '[' + str(stop_index) + '] ' + str(stop),
            x=stop_df.datetime,
            y=stop_df.estimateArrive,
            mode='lines+markers',
            line=dict(width=3,color=colors[(str_to_int(stop))%len(colors)]),
            text=['<b>Bus : ' + str(bus_id) + '</b> <br>' + \
                    'Stop[' + str(stop_index) + ']: ' + str(stop) + '<br>' + \
                    'Time : ' + row.datetime.strftime("%H:%M:%S") + '<br>' + \
                    'ETA : ' + str(row.estimateArrive) for row in stop_df.itertuples() ],
            hoverinfo='text'
        ))


    return graph

In [62]:
bus_id = 'SK20BDV'
build_time_series_graph(th_df, TH, bus_id).show()

In [65]:
bus_id = 'BF67GKP'
build_time_series_graph(th_df, TH, bus_id).show()

## Datetime attribute
Should be around actual datetime, and greater than last datetime value

## Estimate Arrive Value Time Series Analysis
- Check the estimations dont follow a linearly descendant curve, it should present some irrugularities, as a straight line indicates the data has just been ponderated making use of the last "trustable" estimation.
- Check the estimations do not present high jumps in their values, this are both changes where the new estimation is much higher or much lower than the previous one.
- Check for continuity, the should be a new approximation every 30 seconds in average, if the time between estimations is higher it means that the API has not provided data for that bus.

In [27]:
#Linearity test
def check_series_linearity(th_df, TH, bus_id) : 
    #TH_DF
    series_df = th_df[th_df.datetime > th_df.datetime.max() - timedelta(seconds=TH*30)]
    
    #Loc Bus Appearances
    series_df = series_df[series_df.bus == bus_id]
    
    if series_df.shape[0] < 1 :
        return graph
    line = series_df.line.iloc[0]
    direction = series_df.direction.iloc[0]
    stops_list = lines_dict[str(line)][str(direction)]['stops']
    
    #Slope dict
    slope_dict = {}
    
    #Locate unique stops
    unique_stops = series_df.stop.unique().tolist()
    for stop in stops_list :
        if stop not in unique_stops :
            continue
        else : 
            stop_index = stops_list.index(stop)
            
        stop_df = series_df[series_df.stop == stop]
        
        slope_dict[stop] = {}
        slope_dict[stop]['slopes'] = []
        slope_dict[stop]['quality'] = []
        slope_dict[stop]['time'] = []
        
        i = 0
        for row in stop_df.itertuples() :
            eta = row.estimateArrive
            time = row.datetime
            
            if i > 0 :
                ellapsed_time = int((time-last_time).total_seconds())
                slope = (last_eta - eta)/ellapsed_time
                
                #Slopes between 0.95 and 1.05 are not good, as they provide no new information, they just ponderate
                #the new estimateArrive value using the last estimation and subtracting the ellapsed sections.
                #On the other hand, slopes of values too high or too low imply changes in the speed of the bus above
                #or below the maximum or minimum speed for a bus, respectively, also implying a bad quality of the 
                #estimation received by the API. Therefore, we should penalize both situations.
                
                #Penalize proximity to 1, and the maximum and lowest possible speeds.
                min_slope = 0
                max_slope = 2
                
                if (slope < 1) and (slope > 0) :
                    slope_quality = (1-slope)*slope
                elif (slope > 1) and (slope < 2) : 
                    slope_quality = (2-slope)*(slope-1)
                else : 
                    slope_quality = 0
                
                slope_dict[stop]['slopes'].append(slope)
                slope_dict[stop]['quality'].append(slope_quality)
                slope_dict[stop]['time'].append(time)
            
            last_time = row.datetime
            last_eta = row.estimateArrive
            i += 1
            
    return slope_dict

In [28]:
slope_dict = check_series_linearity(th_df,TH,bus_id)

NameError: name 'graph' is not defined

In [29]:
# Draw slopes vs quality series
graph = go.Figure()

#Set title and layout
graph.update_layout(
    title='<b>Bus {} : ETA Time Series</b>'.format(bus_id),
    legend_title='<b>Destination Stop</b>',
    yaxis = dict(
        nticks=10,
        zerolinecolor='darkgrey'
    ),
    margin=dict(r=0, l=0, t=40, b=0),
    hovermode='closest'
)

for stop in slope_dict.keys() :
    #Build stop traces
    graph.add_trace(go.Scatter(
        name= str(stop) + '-Slope',
        x=slope_dict[stop]['time'],
        y=slope_dict[stop]['slopes'],
        mode='lines+markers',
        line=dict(width=3,color=colors[(str_to_int(stop))%len(colors)]),
    ))
    
    graph.add_trace(go.Scatter(
        name= str(stop) + '-Quality',
        x=slope_dict[stop]['time'],
        y=slope_dict[stop]['quality'],
        mode='lines+markers',
        line=dict(width=3,color=colors[(str_to_int(stop))%len(colors)]),
    ))

graph.show()

NameError: name 'slope_dict' is not defined