# Similarity / Distance between Devices

In [146]:
from aux import *
import pandas as pd
from thefuzz import fuzz
import numpy as np
import json
from joblib import Parallel, delayed
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
plt.style.use('default')

## Devices SDF definitions

### Adapt SDF definition data for analysis

In [147]:
# Load SDF files to compare
sdf_manager = SDFManager(path='../iot/sdf/')
sdfs = sdf_manager.get_all_sdfs()

In [148]:
# Turn the SDF files into dataframes - Avoid redundant data
sdfs_trans = {}
columns = ['thing','thing_desc','obj','obj_desc','prop','prop_desc','prop_type','prop_unit']
rows = []
for name, sdf in sdfs.items() :
    if name == 'Auxiliary':
        continue
    for sdfThing, thing_dic in sdf['sdfThing'].items():
        thing_desc = thing_dic['description']
        for sdfObject, object_dic in thing_dic['sdfObject'].items():
            object_desc = object_dic['description']
            for sdfProperty, prop_dic in object_dic['sdfProperty'].items():
                if sdfProperty == 'uuid': continue
                prop_desc = prop_dic['description']
                prop_type = prop_dic['type']
                prop_unit = prop_dic['unit'] if 'unit' in prop_dic else None
                rows.append((sdfThing,thing_desc,sdfObject,object_desc,sdfProperty,prop_desc,prop_type,prop_unit))

sdfs_df = pd.DataFrame(columns=columns,data=rows)

In [149]:
# Air Quality SDF DATAFRAME
sdfs_df[sdfs_df.thing=='AirQualityModified'].iloc[:,2:].reset_index(drop=True)

Unnamed: 0,obj,obj_desc,prop,prop_desc,prop_type,prop_unit
0,temperature_humidity_sensor,Measures environmental temperature and humidity.,temperature,Temperature value,number,Cel
1,temperature_humidity_sensor,Measures environmental temperature and humidity.,humidity,Humidity value,number,%
2,air_quality_sensor,Measures air pollutants.,pm25,PM2.5 value,number,ug/m3
3,air_quality_sensor,Measures air pollutants.,pm10,PM10 value,number,ug/m3


## Devices data

### Load devices data

In [154]:
# Read devices data samples from JSON
with open('devices.json', 'r') as f: devices = json.load(f)

### Adapt devices data for analysis

In [155]:
# Build devices dataframe
rows = []
for dev_uuid, dev in devices.items() :
    # Dev row initialization
    row = {
        'uuid': dev_uuid,
        'dev' : dev['name'],
        'integ': dev['integrated'],
        'period': dev['period']
    }
        
    # Create a row for each module attribute with a column for each value in the buffer
    for mod_uuid, mod in dev['modules'].items() :
        row['mod'] = mod['name']
        for prop_name, values in mod['attribs'].items() :
            row['attrib'] = prop_name
            for i, val in enumerate(values) : row[f'v{i+1}'] = val
            rows.append(row.copy())

# Build devices dataframe
devs_df = pd.DataFrame(rows)

In [156]:
# Drop columns without useful information
devs_df = devs_df.drop(columns=['integ','period'])

## Similarity Analysis
Given an unknown device, determine the 5 closest devices according to SDF definition similarity, and then choose the closest device by including the buffered data. 

**Possible algorithm**

    Given an unknown device(thing) module(object) attribute(property), find 5 closest thing-object-property tuples according to SDF definition.

    Then, for each unknown module attribute, determine which attribute resembles the most to it of those in the 5 closest devices, and vote this device UUID as the candidate to be the closest.

    Finally, the device that has the most attribute votes is chosen as the closest device.

In [157]:
# Value columns names
n_vals = 50
val_cols = [f'v{i+1}' for i in range(n_vals)]

# Turn booleans into integers and cast all data to float
devs_df[val_cols] = devs_df[val_cols].replace('False',0).replace('True',1)
devs_df[val_cols] = devs_df[val_cols].astype(float)

In [158]:
# Air Quality Modified DEVICES DATAFRAME
devs_df[devs_df.dev=='AirQualityModified']

Unnamed: 0,uuid,dev,mod,attrib,v1,v2,v3,v4,v5,v6,...,v66,v67,v68,v69,v70,v71,v72,v73,v74,v75
189,40189078-ebd2-44b8-b3e2-82a38c33d12e,AirQualityModified,temperature_humidity_sensor,temperature,19.215731,19.029963,18.969984,18.914633,18.912332,18.864083,...,19.213523,19.319408,,,,,,,,
190,40189078-ebd2-44b8-b3e2-82a38c33d12e,AirQualityModified,temperature_humidity_sensor,humidity,25.574835,25.467388,25.629369,25.564758,25.49742,25.429751,...,25.443393,25.665027,,,,,,,,
191,40189078-ebd2-44b8-b3e2-82a38c33d12e,AirQualityModified,air_quality_sensor,pm25,8.360936,8.355479,8.345616,8.327888,8.359649,8.299514,...,8.157054,8.10083,,,,,,,,
192,40189078-ebd2-44b8-b3e2-82a38c33d12e,AirQualityModified,air_quality_sensor,pm10,16.852453,16.909097,16.850088,16.849307,16.766102,16.766724,...,16.59534,16.416574,,,,,,,,


In [159]:
# Unknown device
unknown_dev_name = 'AirQualityModified'

### Determine closest devices by SDF comparison

String similarity of the fields:
- Thing and its description
- Object and its description
- Property and its description

In [160]:
unknown_sdf = sdfs_df[sdfs_df.thing==unknown_dev_name]
known_sdfs = sdfs_df[sdfs_df.thing!=unknown_dev_name]

In [161]:
# Define distance functions
def calc_str_dist(descs, row):
    return fuzz.ratio(descs,row['thing_desc'] + ' ' + row['obj'] + ' ' + row['obj_desc'] + ' ' + row['prop'] + ' ' + row['prop_desc'])

# Compute closest devices by SDF
def get_closest_devs_by_sdf(unknown_sdf,known_sdfs,i) :
    # Create local copies
    unknown_row = unknown_sdf.iloc[i].copy()
    knowns_alike = known_sdfs[known_sdfs.prop_type==unknown_row['prop_type']].copy()

    # Build joint description 
    descs = unknown_row['thing'] + ' ' + unknown_row['thing_desc'] + ' ' + unknown_row['obj'] + ' ' + unknown_row['obj_desc'] + ' ' + unknown_row['prop'] + ' ' + unknown_row['prop_desc']

    # Calc string distances
    knowns_alike['str_dist'] = knowns_alike.apply(lambda x: calc_str_dist(descs,x), axis=1)
    closest_things = knowns_alike[['thing','obj','prop','str_dist']].sort_values(by='str_dist',ascending=False)

    # Give points based on closeness
    score = 5
    vote = {}
    for row in closest_things.itertuples() :
        if score == 0 : break
        if row.thing in vote : continue
        vote[row.thing] = score
        score -= 1
        
    return vote

In [162]:
# Compute distance for each unknown module attribute
votes = (Parallel(n_jobs=12)(delayed(get_closest_devs_by_sdf)(unknown_sdf,known_sdfs,i) for i in range(unknown_sdf.shape[0])))
total_vote_sdf = {}
for vote in votes:
    for candidate, score in vote.items() :
        if candidate not in total_vote_sdf :
            total_vote_sdf[candidate] = score
        else :
            total_vote_sdf[candidate] += score

total_vote_sdf_df = pd.DataFrame(total_vote_sdf.items(),columns=['candidate','score']).sort_values(by='score',ascending=False)

In [163]:
total_vote_sdf_df

Unnamed: 0,candidate,score
0,AirQuality,20
1,NoiseSensor,16
3,RainSensor,10
2,SeismicSensor,7
4,WindSensor,6
5,RepairControl,1


### Compare AirQuality Devices

In [164]:
# Select data to plot
dev1_uuid = 'c11c3f56-0f26-415f-a00d-3bb929f5ca20'
dev1_df = devs_df[devs_df.uuid == dev1_uuid]
dev1_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev1_uuid]['timestamps']]

dev2_uuid = '5362cb80-381d-4d21-87ba-af283640fa98'
dev2_df = devs_df[devs_df.uuid == dev2_uuid]
dev2_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev2_uuid]['timestamps']]

dev3_uuid = devs_df[devs_df.dev=='AirQualityModified'].uuid.iloc[0]
dev3_df = devs_df[devs_df.uuid == dev3_uuid]
dev3_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev3_uuid]['timestamps']]

devs = [dev1_df,dev2_df,dev3_df]
ts = [dev1_timestamps,dev2_timestamps,dev3_timestamps]

In [165]:
names = []
fig = go.Figure()
for i, (dev, t) in enumerate(zip(devs,ts)) :
    for j, row in dev.iterrows() :
        dev_uuid = row.uuid
        dev_name, mod_name, attrib_name = row['dev'], row['mod'], row['attrib']
        name = str(i) + '/' + dev_name + '/' + mod_name + '/' + attrib_name
        fig.add_trace(go.Scatter(x=t, y=row[val_cols],
                    mode='lines+markers',
                    name=name))
        names.append(name)
fig.show()

### Compare PickUpRobot Devices

In [166]:
# Select data to plot
dev1_uuid = '5f3333b9-8292-4371-b5c5-c1ec21d0b652'
dev1_df = devs_df[devs_df.uuid == dev1_uuid]
dev1_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev1_uuid]['timestamps']]

dev2_uuid = 'da0ba61c-a9bf-4e0d-b975-33b7b4c5d2e8'
dev2_df = devs_df[devs_df.uuid == dev2_uuid]
dev2_timestamps = [datetime.strptime(timestamp,"%Y-%m-%dT%H:%M:%S.%f") for timestamp in devices[dev2_uuid]['timestamps']]

devs = [dev1_df,dev2_df]
ts = [dev1_timestamps,dev2_timestamps,dev3_timestamps]

In [167]:
names = []
fig = go.Figure()
for i, (dev, t) in enumerate(zip(devs,ts)) :
    for j, row in dev.iterrows() :
        dev_uuid = row.uuid
        dev_name, mod_name, attrib_name = row['dev'], row['mod'], row['attrib']
        name = str(i) + '/' + dev_name + '/' + mod_name + '/' + attrib_name
        fig.add_trace(go.Scatter(x=t, y=row[val_cols],
                    mode='lines+markers',
                    name=name))
        names.append(name)
fig.show()

### Determine closest device UUID by buffered values comparison

Get the attribute with the closest values (euclidean distance) to each of the unknown device module attributes

In [168]:
unknown_uuid = '' # UUID of the unknown device
unknown_dev = devs_df[devs_df.dev==unknown_dev_name]
known_devs = devs_df[devs_df.dev!=unknown_dev_name]

In [183]:
# Compute closest devices by values comparison
def get_closest_devs_by_values(unknown_dev,known_devs,i) :
    # Create local copies
    unknown_row = unknown_dev.iloc[i].copy()
    knowns_alike = known_devs.copy()

    # Normalize values
    max_val, min_val = knowns_alike[val_cols].max(), knowns_alike[val_cols].min()
    knowns_alike[val_cols] = (knowns_alike[val_cols] - min_val)/(max_val-min_val)
    unknown_row[val_cols] = (unknown_row[val_cols] - min_val)/(max_val-min_val)

    # Calculate euclidean distance and get closest devices
    knowns_alike['dist'] = ((knowns_alike[val_cols] - unknown_row[val_cols])**2).sum(axis=1)**0.5
    closest_things = knowns_alike[['uuid','dev','mod','attrib','dist']].sort_values(by='dist',ascending=True)

    # Give points based on closeness
    score = 5
    vote = {}
    for row in closest_things.itertuples() :
        if score == 0 : break
        if row.uuid in vote : continue
        vote[row.dev+'/'+row.uuid[:8]] = score
        score -= 1
        
    return vote

In [184]:
# Compute distance for each unknown module attribute
votes = (Parallel(n_jobs=12)(delayed(get_closest_devs_by_values)(unknown_dev,known_devs,i) for i in range(unknown_dev.shape[0])))
total_vote_dev = {}
for vote in votes:
    for candidate in vote :
        if candidate not in total_vote_dev :
            total_vote_dev[candidate] = vote[candidate]
        else :
            total_vote_dev[candidate] += vote[candidate]
total_vote_dev_df = pd.DataFrame(total_vote_dev.items(),columns=['candidate','score']).sort_values(by='score',ascending=False)

In [185]:
total_vote_dev_df

Unnamed: 0,candidate,score
0,AirQuality/5362cb80,15
1,AirQuality/c11c3f56,9
2,TagScanner/8a40d136,7
4,RainSensor/70a15d0b,4
3,ProductionControl/3d193d4c,1
