# Similarity / Distance between Devices

In [55]:
from aux import *
import pandas as pd
from thefuzz import fuzz
from scipy import spatial
import numpy as np
from joblib import Parallel, delayed

## Devices SDF definitions

### Adapt SDF definition data for analysis

In [2]:
# Load the SDF files to compare from ../iot/sdf/.
# SDFManager is not imported by name in this notebook, so it presumably comes
# from the `from aux import *` star import — TODO confirm.
sdf_manager = SDFManager(path='../iot/sdf/')
# `sdfs` is later iterated by name and indexed as sdfs[name]['sdfThing'].
sdfs = sdf_manager.get_all_sdfs()

In [3]:
# Flatten the SDF definitions into one record per (thing, object, property),
# repeating the thing/object descriptions on every property row.
sdfs_trans = {}
columns = ['thing','thing_desc','obj','obj_desc','prop','prop_desc','prop_type','prop_unit']
rows = []
for name in sdfs:
    # The 'Auxiliary' entry is not part of the table
    if name == 'Auxiliary':
        continue
    for sdfThing, thing_dic in sdfs[name]['sdfThing'].items():
        thing_desc = thing_dic['description']
        for sdfObject, object_dic in thing_dic['sdfObject'].items():
            object_desc = object_dic['description']
            for sdfProperty, prop_dic in object_dic['sdfProperty'].items():
                # The 'uuid' property is left out of the table
                if sdfProperty == 'uuid':
                    continue
                rows.append((
                    sdfThing, thing_desc,
                    sdfObject, object_desc,
                    sdfProperty, prop_dic['description'], prop_dic['type'],
                    prop_dic.get('unit'),  # None when the property declares no unit
                ))

sdfs_df = pd.DataFrame(data=rows, columns=columns)

In [4]:
# Air Quality SDF DATAFRAME
# Preview of the AirQuality rows only. iloc[:, 2:] hides the first two columns
# (thing, thing_desc) and reset_index renumbers the rows from 0 for display.
sdfs_df[sdfs_df.thing=='AirQuality'].iloc[:,2:].reset_index(drop=True)

Unnamed: 0,obj,obj_desc,prop,prop_desc,prop_type,prop_unit
0,temperature_sensor,Measures environmental temperature.,temperature,Temperature value,number,Cel
1,humidity_sensor,Measures environmental humidity.,humidity,Humidity value,number,%
2,pressure_sensor,Measures environmental pressure.,pressure,Pressure value,number,Pa
3,air_quality_sensor,Measures pollutants in the air.,pm1,"PM1 (viruses, exhaust gases...) value",number,ug/m3
4,air_quality_sensor,Measures pollutants in the air.,pm25,"PM2.5 (bacteria, spores, pollen, toner dust......",number,ug/m3
5,air_quality_sensor,Measures pollutants in the air.,pm10,"PM10 (pollen, desert dust...) value",number,ug/m3


### Load devices data

In [5]:
# Read devices data samples from CSV.
# The frame is later filtered on `thing` and merged on ['thing','obj','prop'],
# so the file must provide those three columns next to the sample columns.
devs_data = pd.read_csv('../iot/devs_data.csv')

In [6]:
# Air Quality DEVS DATA DATAFRAME
# Preview of the AirQuality samples. iloc[:, 1:] hides the first CSV column
# (presumably `thing`, since the display starts at `obj` — TODO confirm).
devs_data[devs_data.thing=='AirQuality'].iloc[:,1:].reset_index(drop=True)

Unnamed: 0,obj,prop,v1,v2,v3,v4,v5,v6,v7,v8,...,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20
0,temperature_sensor,temperature,19.6359,19.96069,19.95137,20.31138,20.15922,19.95604,20.13401,19.57217,...,20.30184,20.06558,20.12373,20.03259,19.68972,19.84067,19.76968,19.87122,19.91263,20.32292
1,humidity_sensor,humidity,30.16606,30.02196,29.81765,29.86381,29.80249,29.59743,30.48938,29.73078,...,29.95251,30.46365,29.82392,29.73591,29.87635,30.11619,29.88749,30.28269,30.19477,29.86159
2,pressure_sensor,pressure,101000.4757,101000.02356,100999.77513,100999.9745,101000.0472,101000.01795,101000.59712,101000.16154,...,100999.71849,101000.21537,101000.43343,101000.12959,101000.01488,101000.25356,100999.97212,100999.89998,101000.08383,101000.21428
3,air_quality_sensor,pm1,1.08826,1.35256,0.90329,1.49218,1.19251,0.82273,1.5,0.73132,...,1.29724,0.53571,0.57556,1.06245,0.5,0.78115,0.95743,1.5,1.04886,1.42953
4,air_quality_sensor,pm25,8.40574,8.94146,9.03536,9.37956,9.04236,8.66153,8.8353,9.00673,...,8.84083,8.75241,9.07795,8.37511,8.72918,9.10433,8.95236,9.37119,8.65299,8.93699
5,air_quality_sensor,pm10,17.24421,18.3329,17.82239,17.30163,17.50986,17.8858,18.39376,17.7373,...,18.78169,18.40974,17.65326,17.12828,18.61161,17.66559,18.00635,18.61981,18.23039,18.29304


### Merge data with sdf definition on a single DataFrame

In [7]:
# Merge both information sources.
# Inner join on (thing, obj, prop): only properties present in BOTH the SDF
# definitions and the device samples are kept in `data`.
data = sdfs_df.merge(devs_data,how='inner',on=['thing','obj','prop'])

In [8]:
# Air Quality DATA DATAFRAME
# Preview of the merged AirQuality rows, limited to the first 11 columns:
# the 8 SDF descriptor columns followed by the samples v1..v3.
data[data.thing=='AirQuality'].iloc[:,:11].reset_index(drop=True)

Unnamed: 0,thing,thing_desc,obj,obj_desc,prop,prop_desc,prop_type,prop_unit,v1,v2,v3
0,AirQuality,Monitors air quality through a set of sensors,temperature_sensor,Measures environmental temperature.,temperature,Temperature value,number,Cel,19.6359,19.96069,19.95137
1,AirQuality,Monitors air quality through a set of sensors,humidity_sensor,Measures environmental humidity.,humidity,Humidity value,number,%,30.16606,30.02196,29.81765
2,AirQuality,Monitors air quality through a set of sensors,pressure_sensor,Measures environmental pressure.,pressure,Pressure value,number,Pa,101000.4757,101000.02356,100999.77513
3,AirQuality,Monitors air quality through a set of sensors,air_quality_sensor,Measures pollutants in the air.,pm1,"PM1 (viruses, exhaust gases...) value",number,ug/m3,1.08826,1.35256,0.90329
4,AirQuality,Monitors air quality through a set of sensors,air_quality_sensor,Measures pollutants in the air.,pm25,"PM2.5 (bacteria, spores, pollen, toner dust......",number,ug/m3,8.40574,8.94146,9.03536
5,AirQuality,Monitors air quality through a set of sensors,air_quality_sensor,Measures pollutants in the air.,pm10,"PM10 (pollen, desert dust...) value",number,ug/m3,17.24421,18.3329,17.82239


### Distance between devices
Given an unknown device with its SDF definition and buffered data, determine which already known devices are closest to it.

**Possible algorithm**

    Given an unknown device, iterate over each of its modules' attributes:
    Determine which known module attribute the unknown module attribute resembles the most, and store its parent device name (could also be the parent task).
    Once we have iterated over all module attributes, build a ranking of the closest devices in which each module attribute votes for its candidate.
    The devices with the highest number of votes will be the closest ones.

In [92]:
# Value columns - only the first `n_values` sample columns (v1..vN) are used
n_values = 3
val_cols = ['v' + str(k) for k in range(1, n_values + 1)]

In [93]:
# Encode every distinct prop_unit label as an integer code, numbered in the
# order returned by unique(). prop_type is NOT encoded here and keeps its labels.
prop_units = data.prop_unit.unique()
prop_unit_dic = {unit: code for code, unit in enumerate(prop_units)}
data.prop_unit = data.prop_unit.map(prop_unit_dic)
# Replace the strings 'False'/'True' with 0/1, then store every sample as float
data[val_cols] = data[val_cols].replace({'False': 0, 'True': 1}).astype(float)
# Normalize data: scale each value column by its own largest absolute value
data[val_cols] = data[val_cols] / data[val_cols].abs().max()

In [105]:
# Unknown device: the thing to be matched against all the others
unknown_thing = 'FaultNotifier'
is_unknown = data.thing == unknown_thing
# Only the first 11 columns are kept (8 SDF descriptor columns + v1..v3).
# NOTE(review): 11 is tied to n_values = 3 — adjust it if n_values changes.
unknown = data.loc[is_unknown].iloc[:, :11].reset_index(drop=True)
# Constant 'str_dist' feature on the unknown side, compared with the
# per-row string score of the known devices in the distance computation
unknown['str_dist'] = 1
# Known devices: every row that belongs to any other thing
knowns = data.loc[~is_unknown].iloc[:, :11].reset_index(drop=True)

In [106]:
# Define distance functions
def calc_str_dist(descs, row):
    """Score how well `descs` matches the joint description of `row`.

    The joint description of `row` is its 'thing_desc', 'obj', 'obj_desc',
    'prop' and 'prop_desc' fields joined by single spaces.

    Returns:
        The value of fuzz.ratio(descs, joint description) divided by 100.
    """
    row_fields = (row['thing_desc'], row['obj'], row['obj_desc'], row['prop'], row['prop_desc'])
    row_text = ' '.join(row_fields)
    return fuzz.ratio(descs, row_text) / 100

# Compute closest thing to a given unknown thing module attribute
def get_closest_thing(unknown_row) :
    """Return the known thing whose module attribute is closest to an unknown one.

    Args:
        unknown_row: positional index of a row of the module-level `unknown`
            frame (what the parallel dispatch passes), or an already extracted
            row (pandas Series) holding the description fields, 'prop_type',
            'str_dist' and the value columns.

    Returns:
        The 'thing' name of the closest known row, or None when no known row
        has the same 'prop_type' as the unknown attribute.

    The distance is the Euclidean distance over 'str_dist' plus the value
    columns listed in `val_cols`. Only known rows with the same 'prop_type'
    are considered candidates.
    """
    # Resolve the argument into the row to score. The row actually requested
    # must be used: always taking unknown.iloc[0] would give every attribute
    # the same answer.
    if isinstance(unknown_row, pd.Series):
        unknown_row = unknown_row.copy()
    else:
        unknown_row = unknown.iloc[unknown_row].copy()

    # Build joint description
    # NOTE(review): the thing name is part of this text, while calc_str_dist
    # builds the known-side text without it — confirm this asymmetry is intended.
    desc_fields = ('thing', 'thing_desc', 'obj', 'obj_desc', 'prop', 'prop_desc')
    descs = ' '.join(unknown_row[field] for field in desc_fields)

    # Candidates: known attributes of the same type
    knowns_alike = knowns[knowns.prop_type==unknown_row['prop_type']].copy()
    if knowns_alike.empty:
        # Nothing comparable: no candidate instead of an IndexError further down
        return None

    # Calc string distances
    knowns_alike['str_dist'] = knowns_alike.apply(lambda x: calc_str_dist(descs,x), axis=1)

    # Use the same value columns that were normalized (val_cols) rather than a
    # hard-coded v1..v3, limited to those actually present in the candidates
    feature_cols = ['str_dist'] + [col for col in val_cols if col in knowns_alike.columns]
    # The row extracted from a mixed-type frame has object dtype: cast to float
    deltas = knowns_alike[feature_cols].astype(float) - unknown_row[feature_cols].astype(float)
    knowns_alike['dist'] = (deltas**2).sum(axis=1)**0.5

    closest_thing = knowns_alike[['thing','dist']].sort_values(by='dist').iloc[0]['thing']
    return closest_thing

In [107]:
# Compute the closest known thing for each unknown module attribute:
# one task per row of `unknown`, each receiving that row's positional index
closest_things = Parallel(n_jobs=12)(
    delayed(get_closest_thing)(row_idx) for row_idx in range(len(unknown))
)

In [108]:
# Show the result list: one entry per row of `unknown`, in row order
closest_things

['PieceDetector', 'PieceDetector']