# Similarity / Distance between Devices

In [158]:
from aux import *
import pandas as pd
from thefuzz import fuzz
import numpy as np
import json
from joblib import Parallel, delayed

## Devices SDF definitions

### Adapt SDF definition data for analysis

In [159]:
# Load SDF files to compare.
# SDFManager is not imported by name, so it presumably comes from `from aux import *` — TODO confirm.
# get_all_sdfs() is used below as a mapping: it is iterated by SDF name and indexed with that name.
sdf_manager = SDFManager(path='../iot/sdf/')
sdfs = sdf_manager.get_all_sdfs()

In [160]:
# Flatten the SDF files into a single dataframe: one row per thing/object/property,
# so the descriptions are not kept redundantly in nested dictionaries.
sdfs_trans = {}
columns = ['thing','thing_desc','obj','obj_desc','prop','prop_desc','prop_type','prop_unit']


def _iter_sdf_rows(sdf_files):
    """Yield one tuple per SDF property, with its fields ordered like `columns`."""
    for sdf_name in sdf_files:
        # The 'Auxiliary' entry is not analysed
        if sdf_name == 'Auxiliary':
            continue
        things = sdf_files[sdf_name]['sdfThing']
        for thing_name in things:
            thing = things[thing_name]
            thing_text = thing['description']
            objects = thing['sdfObject']
            for object_name in objects:
                obj = objects[object_name]
                object_text = obj['description']
                properties = obj['sdfProperty']
                for property_name in properties:
                    # The 'uuid' property is skipped
                    if property_name == 'uuid':
                        continue
                    prop = properties[property_name]
                    prop_text = prop['description']
                    kind = prop['type']
                    # The unit is optional in a property definition
                    unit = None
                    if 'unit' in prop:
                        unit = prop['unit']
                    yield (thing_name, thing_text, object_name, object_text,
                           property_name, prop_text, kind, unit)


rows = list(_iter_sdf_rows(sdfs))

sdfs_df = pd.DataFrame(rows, columns=columns)

In [161]:
# SDF rows of the AirQualityModified thing, without the two thing columns and with a fresh 0-based index
sdfs_df.loc[sdfs_df['thing'] == 'AirQualityModified'].iloc[:, 2:].reset_index(drop=True)

Unnamed: 0,obj,obj_desc,prop,prop_desc,prop_type,prop_unit
0,temperature_humidity_sensor,Measures environmental temperature and humidity.,temperature,Temperature value,number,Cel
1,temperature_humidity_sensor,Measures environmental temperature and humidity.,humidity,Humidity value,number,%
2,air_quality_sensor,Measures air pollutants.,pm25,PM2.5 value,number,ug/m3
3,air_quality_sensor,Measures air pollutants.,pm10,PM10 value,number,ug/m3


## Devices data

### Load devices data

In [162]:
# Read devices data samples from JSON.
# The encoding is given explicitly so the file is decoded as UTF-8 on every
# platform instead of with the locale's default encoding.
with open('devices.json', 'r', encoding='utf-8') as f:
    devices = json.load(f)

### Adapt devices data for analysis

In [177]:
# Build devices dataframe: one row per (device, module, property), with the
# buffered samples of the property spread over the columns v1, v2, ...
rows = []
for dev_uuid in devices:
    dev = devices[dev_uuid]

    # Fields shared by every row of this device
    dev_fields = {
        'dev': dev['name'],
        'uuid': dev_uuid,
        'integ': dev['integrated'],
        'timestamp': dev['timestamp'],
        'period': dev['period'],
    }

    for mod_uuid in dev['modules']:
        module = dev['modules'][mod_uuid]
        for prop in module['properties']:
            buffer = module['properties'][prop]
            # Start from a fresh dict for every property: reusing one dict would
            # let a buffer shorter than the previous one inherit its trailing
            # v-columns as if they were its own samples.
            row = {**dev_fields, 'mod': module['name'], 'attrib': prop}
            row.update({f'v{i}': val for i, val in enumerate(buffer, start=1)})
            rows.append(row)

# Missing v-columns (shorter buffers) become NaN in the dataframe
devs_df = pd.DataFrame(rows)

In [178]:
# Drop columns without useful information.
# errors='ignore' keeps the cell re-runnable: once the columns are gone, running
# it again is a no-op instead of raising KeyError.
devs_df = devs_df.drop(columns=['integ','timestamp','period'], errors='ignore')

In [179]:
# Rows (one per module attribute) of the AirQualityModified device
devs_df.loc[devs_df['dev'] == 'AirQualityModified']

Unnamed: 0,dev,uuid,mod,attrib,v1,v2,v3,v4,v5
267,AirQualityModified,5384e1c2-b79b-4b34-b717-f5b630428465,temperature_humidity_sensor,temperature,20.79048,20.45781,20.8583,20.78462,20.94421
268,AirQualityModified,5384e1c2-b79b-4b34-b717-f5b630428465,temperature_humidity_sensor,humidity,29.21872,28.94914,28.72518,28.93833,28.9299
269,AirQualityModified,5384e1c2-b79b-4b34-b717-f5b630428465,air_quality_sensor,pm25,7.34951,7.99065,8.3523,6.84859,8.20554
270,AirQualityModified,5384e1c2-b79b-4b34-b717-f5b630428465,air_quality_sensor,pm10,19.39291,18.99501,19.55293,18.95708,19.51962


## Similarity Analysis
Given an unknown device, determine the 5 closest devices according to the similarity of their SDF definitions, and then choose the closest one by also comparing the buffered data.

**Possible algorithm**

    Given an unknown device, find the 5 closest devices according to their SDF definitions.

    Then, for each module attribute of the unknown device, determine which attribute among those of the 5 closest devices resembles it the most, and vote for that device's UUID as the candidate to be the closest.

    Finally, the device that has the most attribute votes is chosen as the closest device.

In [180]:
# Names of the buffered-value columns: v1 .. v<n_vals>
n_vals = 5
val_cols = [f'v{idx}' for idx in range(1, n_vals + 1)]

# Map the string booleans to 0/1, then make every buffered value a float
devs_df[val_cols] = (
    devs_df[val_cols]
    .replace('False', 0)
    .replace('True', 1)
    .astype(float)
)

# Normalize: divide every value column by its largest absolute value.
# NOTE(review): the maximum is taken over all rows, i.e. over every attribute of
# every device, not per attribute — confirm this global scaling is intended.
devs_df[val_cols] = devs_df[val_cols] / devs_df[val_cols].abs().max()

In [181]:
# Unknown device: name of the device to find the closest match for.
# It is matched against the `thing` column of the SDF dataframe and the `dev`
# column of the devices dataframe to separate it from the known devices.
unknown_dev_name = 'AirQualityModified'

### Determine closest devices by SDF comparison

String similarity of the fields:
- Thing and its description
- Object and its description
- Property and its description

In [182]:
# Split the SDF rows into those of the unknown thing and those of every other thing
is_unknown_thing = sdfs_df['thing'] == unknown_dev_name
unknown_sdf = sdfs_df.loc[is_unknown_thing]
known_sdfs = sdfs_df.loc[~is_unknown_thing]

In [199]:
# Define distance functions
def calc_str_dist(descs, row):
    """Score how alike `descs` and the joint description of an SDF row are.

    The row's text is built from the same fields, in the same order, as the
    unknown property's joint description (thing, thing description, object,
    object description, property, property description), so both strings are
    compared like for like.

    Args:
        descs: Joint description string of the unknown property.
        row: Row of the SDF dataframe (a mapping with the keys 'thing',
            'thing_desc', 'obj', 'obj_desc', 'prop' and 'prop_desc').

    Returns:
        The `fuzz.ratio` score of the two strings. Despite the function's name
        this is a similarity: callers rank candidates by descending score.
    """
    row_descs = ' '.join([
        row['thing'], row['thing_desc'],
        row['obj'], row['obj_desc'],
        row['prop'], row['prop_desc'],
    ])
    return fuzz.ratio(descs, row_descs)

# Compute closest devices by SDF
def get_closest_devs_by_sdf(unknown_sdf,known_sdfs,i,top_n=5) :
    """Vote for the known SDF properties most alike to one unknown property.

    Args:
        unknown_sdf: SDF dataframe rows of the unknown thing.
        known_sdfs: SDF dataframe rows of the known things.
        i: Positional index (iloc) of the unknown property row to compare.
        top_n: Number of candidates that receive points; the best one gets
            `top_n` points, the next one `top_n - 1`, and so on down to 1.

    Returns:
        Dict mapping 'thing/obj/prop' to the points awarded. Empty when
        `top_n` is not positive or no known property has the same type.
    """
    if top_n <= 0 :
        return {}

    unknown_row = unknown_sdf.iloc[i]

    # Only known properties with the same data type are candidates
    knowns_alike = known_sdfs[known_sdfs.prop_type==unknown_row['prop_type']].copy()
    if knowns_alike.empty :
        # Nothing to vote for. Returning here also avoids DataFrame.apply on an
        # empty frame, which yields a frame (not a Series) that cannot be
        # assigned to the single 'str_dist' column.
        return {}

    # Build joint description
    descs = ' '.join(
        unknown_row[field]
        for field in ('thing','thing_desc','obj','obj_desc','prop','prop_desc')
    )

    # Score every candidate and rank them, best score first
    knowns_alike['str_dist'] = knowns_alike.apply(lambda x: calc_str_dist(descs,x), axis=1)
    closest_things = knowns_alike[['thing','obj','prop','str_dist']].sort_values(by='str_dist',ascending=False)

    # Give points based on closeness; each candidate is only counted once
    score = top_n
    vote = {}
    for row in closest_things.itertuples() :
        candidate = row.thing+'/'+row.obj+'/'+row.prop
        if candidate in vote :
            continue
        vote[candidate] = score
        score -= 1
        if score == 0 :
            break

    return vote

In [200]:
# Score every property of the unknown thing in parallel, then add up the points
# each known candidate received across all of them
votes = Parallel(n_jobs=12)(
    delayed(get_closest_devs_by_sdf)(unknown_sdf, known_sdfs, row_idx)
    for row_idx in range(len(unknown_sdf))
)
total_vote_sdf = {}
for vote in votes:
    for candidate, points in vote.items():
        total_vote_sdf[candidate] = total_vote_sdf.get(candidate, 0) + points

In [201]:
# Points per known 'thing/obj/prop', summed over all properties of the unknown thing
total_vote_sdf

{'AirQuality/temperature_sensor/temperature': 9,
 'AirQuality/humidity_sensor/humidity': 13,
 'AirQuality/pressure_sensor/pressure': 8,
 'AirQuality/air_quality_sensor/pm10': 13,
 'AirQuality/air_quality_sensor/pm1': 11,
 'AirQuality/air_quality_sensor/pm25': 6}

### Determine closest device UUID by buffered values comparison

For each module attribute of the unknown device, get the known attributes whose buffered values are closest to it (Euclidean distance).

In [202]:
unknown_uuid = '' # UUID of the unknown device
# Split the device rows into those of the unknown device and those of every other device
is_unknown_dev = devs_df['dev'] == unknown_dev_name
unknown_dev = devs_df.loc[is_unknown_dev]
known_devs = devs_df.loc[~is_unknown_dev]

In [203]:
# Buffered values (after normalization) of each attribute of the unknown device
unknown_dev[val_cols]

Unnamed: 0,v1,v2,v3,v4,v5
267,0.000206,0.000203,0.000207,0.000206,0.000207
268,0.000289,0.000287,0.000284,0.000287,0.000286
269,7.3e-05,7.9e-05,8.3e-05,6.8e-05,8.1e-05
270,0.000192,0.000188,0.000194,0.000188,0.000193


In [204]:
# Compute closest devices by values comparison
def get_closest_devs_by_values(unknown_dev,known_devs,i,value_cols=None,top_n=5) :
    """Vote for the known attributes whose buffered values are closest to one unknown attribute.

    Closeness is the Euclidean distance between the value columns of the
    unknown attribute and those of each known attribute. The comparison uses
    the value columns in which the unknown attribute has a sample; known
    attributes lacking a sample in any of those columns are left out, because a
    distance over fewer samples would be artificially small.

    Args:
        unknown_dev: Devices dataframe rows of the unknown device.
        known_devs: Devices dataframe rows of the known devices.
        i: Positional index (iloc) of the unknown attribute row to compare.
        value_cols: Names of the buffered-value columns. Defaults to the
            notebook-level `val_cols`.
        top_n: Number of candidates that receive points; the closest one gets
            `top_n` points, the next one `top_n - 1`, and so on down to 1.

    Returns:
        Dict mapping 'uuid/dev/mod/attrib' to the points awarded. Empty when
        there is nothing comparable.
    """
    if value_cols is None :
        value_cols = val_cols
    if top_n <= 0 or known_devs.empty :
        return {}

    # Samples of the unknown attribute, restricted to the columns that hold a value
    unknown_vals = unknown_dev.iloc[i][list(value_cols)].astype(float)
    comparable = list(unknown_vals.index[unknown_vals.notna()])
    if not comparable :
        return {}

    # Calc euclidean distances. skipna=False makes the distance NaN for known
    # attributes with missing samples, instead of silently summing fewer terms.
    knowns_alike = known_devs.copy()
    diffs = knowns_alike[comparable].astype(float) - unknown_vals[comparable]
    knowns_alike['dist'] = (diffs**2).sum(axis=1, skipna=False)**0.5
    knowns_alike = knowns_alike[knowns_alike['dist'].notna()]
    closest_things = knowns_alike[['uuid','dev','mod','attrib','dist']].sort_values(by='dist',ascending=True)

    # Give points based on closeness; each candidate is only counted once
    score = top_n
    vote = {}
    for row in closest_things.itertuples() :
        candidate = row.uuid+'/'+row.dev+'/'+row.mod+'/'+row.attrib
        if candidate in vote :
            continue
        vote[candidate] = score
        score -= 1
        if score == 0 :
            break

    return vote

In [205]:
# Score every attribute of the unknown device in parallel, then add up the
# points each known candidate received across all of them
votes = Parallel(n_jobs=12)(
    delayed(get_closest_devs_by_values)(unknown_dev, known_devs, row_idx)
    for row_idx in range(len(unknown_dev))
)
total_vote_dev = {}
for vote in votes:
    for candidate, points in vote.items():
        total_vote_dev[candidate] = total_vote_dev.get(candidate, 0) + points

In [206]:
# Points per known 'uuid/dev/mod/attrib', summed over all attributes of the unknown device
total_vote_dev

{'c11c3f56-0f26-415f-a00d-3bb929f5ca20/AirQuality/temperature_sensor/temperature': 10,
 'c11c3f56-0f26-415f-a00d-3bb929f5ca20/AirQuality/air_quality_sensor/pm10': 8,
 'f342e60b-6a54-4f20-8874-89a550ebc75c/ConveyorBelt/conveyor_belt/rotational_speed': 7,
 '3140ce5c-0d08-4aff-9bb4-14a9e6a33d12/ConveyorBelt/conveyor_belt/rotational_speed': 6,
 'f169a965-bb15-4db3-97cd-49b5b641a9fe/ConveyorBelt/conveyor_belt/rotational_speed': 5,
 'c11c3f56-0f26-415f-a00d-3bb929f5ca20/AirQuality/humidity_sensor/humidity': 5,
 'a6f65d7a-019a-4723-9b81-fb4a163fa23a/ConveyorBelt/conveyor_belt/rotational_speed': 4,
 'c11c3f56-0f26-415f-a00d-3bb929f5ca20/AirQuality/air_quality_sensor/pm25': 5,
 'd7295016-4a54-4c98-a4c1-4f0c7f7614b5/PieceDetector/piece_detection_cam/pitch_orientation': 4,
 '3140ce5c-0d08-4aff-9bb4-14a9e6a33d12/ConveyorBelt/conveyor_belt/weight': 3,
 '70a15d0b-f6d3-4833-b929-74abdff69fa5/RainSensor/rain_sensor/cumdepth': 2,
 'f342e60b-6a54-4f20-8874-89a550ebc75c/ConveyorBelt/conveyor_belt/weight'