# Similarity / Distance between Devices

In [1]:
from aux import *

import pandas as pd
import numpy as np

from thefuzz import fuzz
from joblib import Parallel, delayed
from datetime import datetime, timedelta

import json

import plotly.graph_objects as go
import matplotlib.pyplot as plt
plt.style.use('default')

In [2]:
#Available colors
colors = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
]

## Devices SDF definitions

### Adapt SDF definition data for analysis

In [3]:
# Load SDF files to compare
sdf_manager = SDFManager(path='../iot/sdf/')
sdfs = sdf_manager.get_all_sdfs()

In [4]:
# Turn the SDF files into dataframes - Avoid redundant data
sdfs_trans = {}
columns = ['thing','thing_desc','obj','obj_desc','prop','prop_desc','prop_type','prop_unit']
rows = []
for name, sdf in sdfs.items() :
    if name == 'Auxiliary':
        continue
    for sdfThing, thing_dic in sdf['sdfThing'].items():
        thing_desc = thing_dic['description']
        for sdfObject, object_dic in thing_dic['sdfObject'].items():
            object_desc = object_dic['description']
            for sdfProperty, prop_dic in object_dic['sdfProperty'].items():
                if sdfProperty == 'uuid': continue
                prop_desc = prop_dic['description']
                prop_type = prop_dic['type']
                prop_unit = prop_dic['unit'] if 'unit' in prop_dic else None
                rows.append((sdfThing,thing_desc,sdfObject,object_desc,sdfProperty,prop_desc,prop_type,prop_unit))

sdfs_df = pd.DataFrame(columns=columns,data=rows)

In [5]:
# Air Quality SDF DATAFRAME
sdfs_df[sdfs_df.thing=='AirQualityModified'].iloc[:,2:].reset_index(drop=True)

Unnamed: 0,obj,obj_desc,prop,prop_desc,prop_type,prop_unit
0,temperature_humidity_sensor,Measures environmental temperature and humidity.,temperature,Temperature value,number,Cel
1,temperature_humidity_sensor,Measures environmental temperature and humidity.,humidity,Humidity value,number,%
2,air_quality_sensor,Measures air pollutants.,pm25,PM2.5 value,number,ug/m3
3,air_quality_sensor,Measures air pollutants.,pm10,PM10 value,number,ug/m3


## Devices data

### Load devices data

In [61]:
# Read devices data samples from JSON
with open('devices.json', 'r') as f: devices = json.load(f)

### Adapt devices data for analysis

In [62]:
# Build devices dataframe
rows = []
for dev_uuid, dev in devices.items() :
    # Dev row initialization
    row = {
        'uuid': dev_uuid,
        'dev' : dev['name'],
        'integ': dev['integrated'],
        'period': dev['period']
    }
        
    # Create a row for each module attribute with a column for each value in the buffer
    for mod_uuid, mod in dev['modules'].items() :
        row['mod'] = mod['name']
        for prop_name, values in mod['attribs'].items() :
            row['attrib'] = prop_name
            for i, val in enumerate(values) : row[f'v{i+1}'] = val
            rows.append(row.copy())

# Build devices dataframe
devs_df = pd.DataFrame(rows)

In [63]:
# Drop columns without useful information
devs_df = devs_df.drop(columns=['integ','period'])

## Similarity Analysis
Given an unknown device, determine the 5 closest devices according to SDF definition similarity, and then choose the closest device by including the buffered data. 

**Possible algorithm**

    Given an unknown device(thing) module(object) attribute(property), find 5 closest thing-object-property tuples according to SDF definition.

    Then, for each unknown module attribute, determine which attribute resembles the most to it of those in the 5 closest devices, and vote this device UUID as the candidate to be the closest.

    Finally, the device that has the most attribute votes is chosen as the closest device.

In [64]:
# Value columns names
n_vals = 50
val_cols = [f'v{i+1}' for i in range(n_vals)]

# Turn booleans into integers and cast all data to float
devs_df[val_cols] = devs_df[val_cols].replace('False',0).replace('True',1)
devs_df[val_cols] = devs_df[val_cols].astype(float)

In [65]:
# Air Quality Modified DEVICES DATAFRAME
devs_df[devs_df.dev=='AirQualityModified']

Unnamed: 0,uuid,dev,mod,attrib,v1,v2,v3,v4,v5,v6,...,v51,v52,v53,v54,v55,v56,v57,v58,v59,v60
189,indoors_airqualitymod,AirQualityModified,temperature_humidity_sensor,temperature,19.864309,19.834325,19.893551,19.725407,19.847627,19.740174,...,19.771578,19.826954,19.82218,19.894821,19.749094,19.750906,19.710007,19.701535,19.71251,19.716326
190,indoors_airqualitymod,AirQualityModified,temperature_humidity_sensor,humidity,25.882915,25.818179,25.790794,25.793258,25.777855,25.755883,...,25.256878,25.194089,25.154646,25.193148,25.137304,25.184707,25.23743,25.172923,25.176327,25.176985
191,indoors_airqualitymod,AirQualityModified,air_quality_sensor,pm25,9.949378,10.031571,10.029866,9.933761,9.94494,9.948608,...,10.131974,10.132604,10.128916,10.243327,10.12222,10.12915,10.144447,10.125856,10.109855,10.115735
192,indoors_airqualitymod,AirQualityModified,air_quality_sensor,pm10,16.303309,16.198723,16.225067,16.205637,16.184716,16.281577,...,16.500387,16.532473,16.508134,16.517398,16.516982,16.581446,16.680449,16.608201,16.547875,16.524052


In [80]:
# Unknown device
unknown_dev_name = 'PickUpRobot'

### Determine closest devices by SDF comparison

String similarity of the fields:
- Thing and its description
- Object and its description
- Property and its description

In [81]:
unknown_sdf = sdfs_df[sdfs_df.thing==unknown_dev_name]
known_sdfs = sdfs_df[sdfs_df.thing!=unknown_dev_name]

In [82]:
# Define distance functions
def calc_str_dist(descs, row):
    return fuzz.ratio(descs,row['thing_desc'] + ' ' + row['obj'] + ' ' + row['obj_desc'] + ' ' + row['prop'] + ' ' + row['prop_desc'])

# Compute closest devices by SDF
def get_closest_devs_by_sdf(unknown_sdf,known_sdfs,i) :
    # Create local copies
    unknown_row = unknown_sdf.iloc[i].copy()
    knowns_alike = known_sdfs[known_sdfs.prop_type==unknown_row['prop_type']].copy()

    # Build joint description 
    descs = unknown_row['thing'] + ' ' + unknown_row['thing_desc'] + ' ' + unknown_row['obj'] + ' ' + unknown_row['obj_desc'] + ' ' + unknown_row['prop'] + ' ' + unknown_row['prop_desc']

    # Calc string distances
    knowns_alike['str_dist'] = knowns_alike.apply(lambda x: calc_str_dist(descs,x), axis=1)
    closest_things = knowns_alike[['thing','obj','prop','str_dist']].sort_values(by='str_dist',ascending=False)

    # Give points based on closeness
    score = 3
    vote = {}
    for row in closest_things.itertuples() :
        if score == 0 : break
        if row.thing in vote : continue
        vote[row.thing] = score
        score -= 1
        
    return vote

In [83]:
# Compute distance for each unknown module attribute
votes = (Parallel(n_jobs=12)(delayed(get_closest_devs_by_sdf)(unknown_sdf,known_sdfs,i) for i in range(unknown_sdf.shape[0])))
total_vote_sdf = {}
for vote in votes:
    for candidate, score in vote.items() :
        if candidate not in total_vote_sdf :
            total_vote_sdf[candidate] = score
        else :
            total_vote_sdf[candidate] += score

total_vote_sdf_df = pd.DataFrame(total_vote_sdf.items(),columns=['candidate','score']).sort_values(by='score',ascending=False)

In [84]:
total_vote_sdf_df

Unnamed: 0,candidate,score
1,DrillingRobot,28
2,ClampingRobot,28
0,MillingRobot,22


In [85]:
# Get the top 5 closest device class by SDF comparison*
closest_sdf_devs = total_vote_sdf_df.candidate.iloc[0:5].tolist()

### Compare AirQuality Devices

In [72]:
# Select data to plot
dev1_uuid = 'indoors_airquality'
dev1_df = devs_df[devs_df.uuid == dev1_uuid]
dev1_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev1_uuid]['timestamps']]

dev2_uuid = 'c11c3f56-0f26-415f-a00d-3bb929f5ca20'
dev2_df = devs_df[devs_df.uuid == dev2_uuid]
dev2_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev2_uuid]['timestamps']]

dev3_uuid = 'indoors_airqualitymod'
dev3_df = devs_df[devs_df.uuid == dev3_uuid]
dev3_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev3_uuid]['timestamps']]

devs = [dev1_df,dev2_df,dev3_df]
ts = [dev1_timestamps,dev2_timestamps,dev3_timestamps]

In [73]:
names = []
fig = go.Figure()
for i, (dev, t) in enumerate(zip(devs,ts)) :
    for j, row in dev.iterrows() :
        dev_uuid = row.uuid
        dev_name, mod_name, attrib_name = row['dev'], row['mod'], row['attrib']
        name = row.uuid + '/' + dev_name + '/' + mod_name + '/' + attrib_name
        fig.add_trace(go.Scatter(x=t, y=row[val_cols],
                    mode='lines+markers',
                    name=name,
                    line = dict(color=colors[i])))
        names.append(name)
fig.show()

### Compare PickUpRobot Devices

In [74]:
# Select data to plot
dev1_uuid = 'bodyconfig_pickuprob'
dev1_df = devs_df[devs_df.uuid == dev1_uuid]
dev1_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev1_uuid]['timestamps']]

dev2_uuid = '6625b9ac-55e2-49c8-ab47-d1da21b5f0b5'
dev2_df = devs_df[devs_df.uuid == dev2_uuid]
dev2_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev2_uuid]['timestamps']]

dev3_uuid = 'bodyconfig_pickuprob2'
dev3_df = devs_df[devs_df.uuid == dev3_uuid]
dev3_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev3_uuid]['timestamps']]

devs = [dev1_df,dev2_df,dev3_df]
ts = [dev1_timestamps,dev2_timestamps,dev3_timestamps]

In [93]:
names = []
fig = go.Figure()
for i, (dev, t) in enumerate(zip(devs,ts)) :
    for j, row in dev.iterrows() :
        dev_uuid = row.uuid
        dev_name, mod_name, attrib_name = row['dev'], row['mod'], row['attrib']
        name = row.uuid + '/' + dev_name + '/' + mod_name + '/' + attrib_name
        fig.add_trace(go.Scatter(x=t, y=row[val_cols],
                    mode='lines+markers',
                    name=name,
                    line = dict(color=colors[i+7])))
        names.append(name)
fig.show()

### Determine closest device UUID by buffered values comparison

Get the attribute with the closest values (euclidean distance) to each of the unknown device module attributes

In [86]:
pickup_uuid = 'bodyconfig_pickuprob2'
airqualitymod_uuid = 'indoors_airqualitymod'
unknown_uuid = pickup_uuid # UUID of the unknown device
unknown_dev = devs_df[devs_df.uuid==unknown_uuid]
known_devs = devs_df[devs_df.uuid!=unknown_uuid]

In [87]:
# Compute closest devices by values comparison
def get_closest_devs_by_values(unknown_dev,known_devs,i) :
    # Create local copies
    unknown_row = unknown_dev.iloc[i].copy()
    closest_sdf_devs.append(unknown_row.dev)
    knowns_alike = known_devs[known_devs.dev.isin(closest_sdf_devs)].copy()
    
    # Normalize values
    max_val, min_val = knowns_alike[val_cols].max(), knowns_alike[val_cols].min()
    knowns_alike[val_cols] = (knowns_alike[val_cols] - min_val)/(max_val-min_val)
    unknown_row[val_cols] = (unknown_row[val_cols] - min_val)/(max_val-min_val)

    # Calculate euclidean distance and get closest devices
    knowns_alike['dist'] = ((knowns_alike[val_cols] - unknown_row[val_cols])**2).sum(axis=1)**0.5
    closest_devs = knowns_alike[['uuid','dev','mod','attrib','dist']].sort_values(by='dist',ascending=True)

    # Give points based on closeness
    closest_dev = closest_devs.iloc[0]
    vote = {closest_dev.dev+'/'+closest_dev.uuid : 1}
    return vote

In [88]:
# Compute distance for each unknown module attribute
votes = (Parallel(n_jobs=12)(delayed(get_closest_devs_by_values)(unknown_dev,known_devs,i) for i in range(unknown_dev.shape[0])))
total_vote_dev = {}
for vote in votes:
    for candidate in vote :
        if candidate not in total_vote_dev :
            total_vote_dev[candidate] = vote[candidate]
        else :
            total_vote_dev[candidate] += vote[candidate]
total_vote_dev_df = pd.DataFrame(total_vote_dev.items(),columns=['candidate','score']).sort_values(by='score',ascending=False)

In [89]:
total_vote_dev_df

Unnamed: 0,candidate,score
0,PickUpRobot/bodyconfig_pickuprob,9
1,PickUpRobot/6625b9ac-55e2-49c8-ab47-d1da21b5f0b5,1
2,MillingRobot/5ce94c31-3004-431e-97b3-c8f779fb180d,1
3,DrillingRobot/98247600-c4fe-4728-bda6-ed8fadf8...,1
4,PickUpRobot/ae5e4ad3-bd59-4dc8-b242-e72747d187d4,1


## USE SIMILARITY ANALYSIS TO SOLVE CASES. 
### Case 1. A KNOWN DEVICE DISAPPEARS AND A NEW ONE WITH SIMILAR CHARACTERISTICS APPEARS
### Case 2. A COMPLEMENTARY DEVICE APPEARS IN A TASK

Explain what needs to be done to be able to tackle CASE 3. What others steps should we take?
Additionally, explain the weaknesses of our similarity comparison and how could these be tackled.