# Similarity / Distance between Devices

In [46]:
from aux import *

import pandas as pd
import numpy as np

from stumpy import mass
from thefuzz import fuzz
from joblib import Parallel, delayed
from datetime import datetime, timedelta

import json

import plotly.graph_objects as go
import matplotlib.pyplot as plt
plt.style.use('default')

In [16]:
# Available colors: (hex code, descriptive name) pairs; only the hex codes are kept
colors = [
    hex_code
    for hex_code, _label in (
        ('#1f77b4', 'muted blue'),
        ('#ff7f0e', 'safety orange'),
        ('#2ca02c', 'cooked asparagus green'),
        ('#d62728', 'brick red'),
        ('#9467bd', 'muted purple'),
        ('#8c564b', 'chestnut brown'),
        ('#e377c2', 'raspberry yogurt pink'),
        ('#7f7f7f', 'middle gray'),
        ('#bcbd22', 'curry yellow-green'),
        ('#17becf', 'blue-teal'),
    )
]

## Devices SDF definitions

### Adapt SDF definition data for analysis

In [17]:
# Read every SDF definition available under the SDF folder
sdf_manager = SDFManager(path='../iot/sdf/')
sdfs, sdfs_dfs = sdf_manager.get_all_sdfs()
# Stack the per-SDF frames into one DataFrame; reset_index() turns the previous
# index into a leading column
sdfs_df = pd.concat([frame for _, frame in sdfs_dfs.items()]).reset_index()

In [18]:
# SDF rows of the AirQualityModified thing, shown without the first two columns
sdfs_df.loc[sdfs_df['thing'] == 'AirQualityModified'].iloc[:, 2:].reset_index(drop=True)

Unnamed: 0,thing_desc,obj,obj_desc,prop,prop_desc,prop_type,prop_unit
0,Monitors air quality through a set of sensors,temperature_humidity_sensor,Measures environmental temperature and humidity.,temperature,Temperature value,number,Cel
1,Monitors air quality through a set of sensors,temperature_humidity_sensor,Measures environmental temperature and humidity.,humidity,Humidity value,number,%
2,Monitors air quality through a set of sensors,air_quality_sensor,Measures air pollutants.,pm25,PM2.5 value,number,ug/m3
3,Monitors air quality through a set of sensors,air_quality_sensor,Measures air pollutants.,pm10,PM10 value,number,ug/m3


## Devices data

### Load devices data

In [58]:
# Read devices data samples from JSON.
# The encoding is explicit so that parsing does not depend on the platform's default locale.
with open('devices.json', 'r', encoding='utf-8') as f:
    devices = json.load(f)

### Adapt devices data for analysis

In [59]:
# Build one record per (device, module, attribute); buffered samples go in columns v1..vN
rows = []
for dev_uuid, dev in devices.items():
    # Fields shared by every attribute row of this device
    dev_fields = {
        'uuid': dev_uuid,
        'dev': dev['name'],
        'integ': dev['integrated'],
        'period': dev['period'],
    }

    for mod in dev['modules'].values():
        for prop_name, values in mod['attribs'].items():
            # Start from a fresh dict for every attribute: reusing a single dict lets
            # v-columns of a longer, earlier buffer leak into a shorter, later one
            row = {**dev_fields, 'mod': mod['name'], 'attrib': prop_name}
            row.update({f'v{pos}': val for pos, val in enumerate(values, start=1)})
            rows.append(row)

# Build devices dataframe (buffers shorter than the longest one are left as NaN)
devs_df = pd.DataFrame(rows)

## Similarity Analysis
Given an unknown device, determine the 5 closest devices according to the similarity of their SDF definitions, and then choose the closest one by also comparing the buffered data.

**Possible algorithm**

    Given an attribute (property) of a module (object) of an unknown device (thing), find the 5 closest thing-object-property tuples according to their SDF definitions.

    Then, for each module attribute of the unknown device, determine which attribute of the 5 closest devices most resembles it, and cast a vote for that device's UUID as the closest candidate.

    Finally, the device that has the most attribute votes is chosen as the closest device.

In [60]:
# Sample columns: everything after the six descriptive columns
# (uuid, dev, integ, period, mod, attrib)
val_cols = devs_df.columns[6:]

# Map textual booleans to 0/1, then store every sample as float
devs_df[val_cols] = (
    devs_df[val_cols]
    .replace('False', 0)
    .replace('True', 1)
    .astype(float)
)

In [61]:
# Buffered samples of the AirQualityModified device, one row per module attribute
devs_df.loc[devs_df['dev'] == 'AirQualityModified']

Unnamed: 0,uuid,dev,integ,period,mod,attrib,v1,v2,v3,v4,...,v350,v351,v352,v353,v354,v355,v356,v357,v358,v359
189,indoors_airqualitymod,AirQualityModified,True,180.421704,temperature_humidity_sensor,temperature,20.072877,20.038974,20.07613,20.181529,...,20.37119,20.355734,20.263135,20.345291,20.245669,20.344987,20.324011,20.351182,20.26147,20.26477
190,indoors_airqualitymod,AirQualityModified,True,180.421704,temperature_humidity_sensor,humidity,25.025036,25.0374,24.963606,25.057288,...,24.513077,24.724111,24.575977,24.6511,24.697084,24.699781,24.675769,24.71214,24.56495,24.506342
191,indoors_airqualitymod,AirQualityModified,True,180.421704,air_quality_sensor,pm25,8.391234,8.41238,8.530494,8.464863,...,8.910303,8.893579,8.866097,8.846951,8.833852,8.826487,8.806323,8.824612,8.811812,8.837786
192,indoors_airqualitymod,AirQualityModified,True,180.421704,air_quality_sensor,pm10,17.360588,17.344263,17.357747,17.404092,...,17.168674,17.131496,17.253328,17.174278,17.174932,17.134031,17.137606,17.118086,17.154203,17.252552


In [62]:
# Unknown device: SDF thing name used to split sdfs_df into the unknown class
# and the remaining (known) classes
name = 'AirQualityModified'

In [63]:
# Compute voting results df
def calc_voting_result_df(votes):
    """Sum the per-candidate scores of all ballots and rank the candidates.

    Parameters
    ----------
    votes : iterable of dict
        Each dict maps a candidate to the score it received in one ballot.

    Returns
    -------
    pandas.DataFrame
        Columns ``candidate`` and ``score``, ordered by descending ``score``.
    """
    tally = {}
    for ballot in votes:
        for contender, points in ballot.items():
            if contender in tally:
                tally[contender] += points
            else:
                tally[contender] = points

    ranking = pd.DataFrame(tally.items(), columns=['candidate', 'score'])
    return ranking.sort_values(by='score', ascending=False)

### Determine closest devices by SDF comparison

String similarity of the fields:
- Thing and its description
- Object and its description
- Property and its description

In [64]:
# SDF rows of the unknown thing vs. SDF rows of every other thing
noninteg_class = sdfs_df.loc[sdfs_df['thing'] == name]
integ_classes = sdfs_df.loc[sdfs_df['thing'] != name]

In [65]:
# Define distance functions
def calc_str_dist(descs, row):
    """Score how closely ``descs`` matches the property text of an SDF row.

    The row text is the row's ``prop`` and ``prop_desc`` joined by a single space.
    The returned value is whatever ``fuzz.ratio`` yields for the two strings; the
    caller in this notebook ranks rows by it in descending order, so despite the
    name a larger value is treated as a closer match.
    """
    row_text = ' '.join((row['prop'], row['prop_desc']))
    return fuzz.ratio(descs, row_text)

# Compute closest classes by comparing SDF descriptions
def get_closest_classes(noninteg_class, integ_classes, i, score=3):
    """Vote for the things whose SDF properties best match one unknown property.

    Row ``i`` of ``noninteg_class`` is compared, through ``calc_str_dist``, with
    every row of ``integ_classes`` that has the same ``prop_type``. Rows are ranked
    by that value in descending order and points are handed out to distinct things:
    ``score`` to the best one, one point fewer to each following one, until the
    points run out.

    Parameters
    ----------
    noninteg_class : pandas.DataFrame
        SDF rows of the unknown thing; needs ``prop``, ``prop_desc``, ``prop_type``.
    integ_classes : pandas.DataFrame
        SDF rows of the candidate things; needs ``thing``, ``obj``, ``prop``,
        ``prop_desc`` and ``prop_type``.
    i : int
        Positional index of the unknown row to evaluate.
    score : int, default 3
        Points given to the best-matching thing.

    Returns
    -------
    dict
        Maps thing name to awarded points. Empty when no candidate row shares the
        unknown row's ``prop_type`` or when ``score`` is not positive.
    """
    target = noninteg_class.iloc[i]

    # Only rows with the same data type are comparable
    candidates = integ_classes[integ_classes.prop_type == target['prop_type']].copy()
    if candidates.empty:
        # apply(axis=1) on an empty frame returns a frame, not a Series, which
        # cannot be stored in the single 'str_dist' column
        return {}

    # Text description of the unknown property
    target_desc = target['prop'] + ' ' + target['prop_desc']

    # Compare it with the text description of every candidate row
    candidates['str_dist'] = candidates.apply(
        lambda candidate: calc_str_dist(target_desc, candidate), axis=1
    )
    # A stable sort keeps tied rows in their original order, so results are reproducible
    ranked = candidates[['thing', 'obj', 'prop', 'str_dist']].sort_values(
        by='str_dist', ascending=False, kind='mergesort'
    )

    # Give points based on closeness; each thing is only counted once
    vote = {}
    points = score
    for thing in ranked['thing']:
        if points <= 0:
            break
        if thing in vote:
            continue
        vote[thing] = points
        points -= 1

    return vote

In [66]:
# One ballot per property row of the unknown class, computed in parallel
votes = Parallel(n_jobs=12)(
    delayed(get_closest_classes)(noninteg_class, integ_classes, i)
    for i in range(len(noninteg_class))
)
voting_result_df = calc_voting_result_df(votes)
# Keep the (at most) five best-ranked classes
closest_classes = voting_result_df['candidate'].head(5).tolist()
voting_result_df

Unnamed: 0,candidate,score
0,AirQuality,12
1,NoiseSensor,8
3,RainSensor,3
2,RepairControl,1


### Compare AirQuality Devices

In [67]:
# Select data to plot
dev1_uuid = 'indoors_airquality'
dev2_uuid = 'c11c3f56-0f26-415f-a00d-3bb929f5ca20'
dev3_uuid = 'indoors_airqualitymod'

# Attribute rows of each selected device
dev1_df, dev2_df, dev3_df = (
    devs_df[devs_df.uuid == selected]
    for selected in (dev1_uuid, dev2_uuid, dev3_uuid)
)

# Timestamps of each selected device, parsed from text
dev1_timestamps, dev2_timestamps, dev3_timestamps = (
    [datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S.%f") for stamp in devices[selected]['timestamps']]
    for selected in (dev1_uuid, dev2_uuid, dev3_uuid)
)

devs = [dev1_df, dev2_df, dev3_df]
ts = [dev1_timestamps, dev2_timestamps, dev3_timestamps]

In [68]:
# Plot every buffered attribute of the selected devices, one colour per device
names = []
fig = go.Figure()
for i, (dev, t) in enumerate(zip(devs, ts)):
    # Wrap around the palette so more devices than colours cannot raise IndexError
    dev_color = colors[i % len(colors)]
    for _, row in dev.iterrows():
        # Deliberately not stored in `name`: that variable holds the unknown device
        # class and is read by the SDF comparison cells
        trace_name = '/'.join((row['uuid'], row['dev'], row['mod'], row['attrib']))
        fig.add_trace(go.Scatter(x=t, y=row[val_cols],
                    mode='lines+markers',
                    name=trace_name,
                    line=dict(color=dev_color)))
        names.append(trace_name)
fig.update_layout(title='AirQuality devices: buffered attribute values',
                  xaxis_title='Timestamp',
                  yaxis_title='Buffered value')
fig.show()

### Compare PickUpRobot Devices

In [69]:
# Select data to plot
dev1_uuid = 'bodyconfig_pickuprob'
dev2_uuid = '6625b9ac-55e2-49c8-ab47-d1da21b5f0b5'
dev3_uuid = 'bodyconfig_pickuprob2'

# Attribute rows of each selected device
dev1_df, dev2_df, dev3_df = (
    devs_df[devs_df.uuid == selected]
    for selected in (dev1_uuid, dev2_uuid, dev3_uuid)
)

# Timestamps of each selected device, parsed from text
dev1_timestamps, dev2_timestamps, dev3_timestamps = (
    [datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S.%f") for stamp in devices[selected]['timestamps']]
    for selected in (dev1_uuid, dev2_uuid, dev3_uuid)
)

devs = [dev1_df, dev2_df, dev3_df]
ts = [dev1_timestamps, dev2_timestamps, dev3_timestamps]

In [70]:
# Plot every buffered attribute of the selected devices, one colour per device
names = []
fig = go.Figure()
for i, (dev, t) in enumerate(zip(devs, ts)):
    # Start at the 8th palette entry; wrap around so extra devices cannot raise IndexError
    dev_color = colors[(i + 7) % len(colors)]
    for _, row in dev.iterrows():
        # Deliberately not stored in `name`: that variable holds the unknown device
        # class and is read by the SDF comparison cells
        trace_name = '/'.join((row['uuid'], row['dev'], row['mod'], row['attrib']))
        fig.add_trace(go.Scatter(x=t, y=row[val_cols],
                    mode='lines+markers',
                    name=trace_name,
                    line=dict(color=dev_color)))
        names.append(trace_name)
fig.update_layout(title='PickUpRobot devices: buffered attribute values',
                  xaxis_title='Timestamp',
                  yaxis_title='Buffered value')
fig.show()

### Determine closest device UUID by buffered values comparison

For each module attribute of the unknown device, find the attribute with the closest values (Euclidean distance).

In [71]:
# UUIDs of the devices that can play the role of the unknown device;
# `uuid` selects the one analysed by the cells below
pickup_uuid = 'bodyconfig_pickuprob2'
airqualitymod_uuid = 'indoors_airqualitymod'
uuid = airqualitymod_uuid # UUID of the unknown device

In [72]:
# Rows of the unknown device vs. rows of every other integrated device
noninteg_dev = devs_df.loc[devs_df['uuid'] == uuid]
integ_devs = devs_df.loc[devs_df['integ'].eq(True) & devs_df['uuid'].ne(uuid)]

In [73]:
# Compute closest devices searching for closest time series pattern
def get_closest_devs(noninteg_dev, integ_devs, closest_classes, i, score=1, query_len=20):
    """Vote for the known device attribute whose buffer best matches an unknown one.

    The first ``query_len`` buffered samples of row ``i`` of ``noninteg_dev`` are
    used as the query. For every row of ``integ_devs`` whose ``dev`` is one of
    ``closest_classes`` (or the unknown row's own ``dev``), the non-normalized MASS
    distance profile of the query against that row's buffer is computed, and the
    row reaching the smallest distance wins.

    Parameters
    ----------
    noninteg_dev : pandas.DataFrame
        Attribute rows of the unknown device (six descriptive columns, then samples).
    integ_devs : pandas.DataFrame
        Attribute rows of the known devices, with the same column layout.
    closest_classes : list of str
        Device class names to restrict the search to. It is not modified.
    i : int
        Positional index of the unknown attribute row to evaluate.
    score : int, default 1
        Points given to the winning device.
    query_len : int, default 20
        Number of leading sample columns used as the query pattern.

    Returns
    -------
    dict
        ``{'<dev>/<uuid>': score}`` for the winner, or an empty dict when the query
        is empty or no candidate buffer is at least as long as the query.
    """
    unknown_row = noninteg_dev.iloc[i]

    # Build a new list: appending to the caller's list would grow it on every call
    allowed_classes = list(closest_classes) + [unknown_row.dev]
    candidates = integ_devs[integ_devs.dev.isin(allowed_classes)]
    # Sample columns start after the six descriptive columns
    val_cols = candidates.columns[6:]

    # Drop missing samples from the query as well, as is done for the inspected buffers
    query_series = unknown_row[val_cols[:query_len]].dropna().astype(float).to_numpy()
    if query_series.size == 0:
        return {}

    # Compute device with closest time series pattern
    best_dist = np.inf  # np.Inf was removed in NumPy 2.0
    best_candidate = None
    for _, candidate_row in candidates.iterrows():
        inspected_series = candidate_row[val_cols].dropna().astype(float).to_numpy()
        if inspected_series.size < query_series.size:
            continue
        # MASS Distance Profile
        closest_match = np.min(mass(query_series, inspected_series, normalize=False))
        if closest_match < best_dist:
            best_dist = closest_match
            best_candidate = candidate_row.dev + '/' + candidate_row.uuid

    # No usable candidate: cast no vote instead of failing on an unbound name
    if best_candidate is None:
        return {}

    # The winner is the one with lower distance
    return {best_candidate: score}
    

In [74]:
# Within the closest classes, each attribute row of the unknown device votes
# for the device that best matches its time series pattern
votes = Parallel(n_jobs=12)(
    delayed(get_closest_devs)(noninteg_dev, integ_devs, closest_classes, i)
    for i in range(len(noninteg_dev))
)
voting_result_df = calc_voting_result_df(votes)

In [75]:
# Show the vote tally of the buffered-values comparison (highest score first)
voting_result_df

Unnamed: 0,candidate,score
0,AirQuality/indoors_airquality,4


## USE SIMILARITY ANALYSIS TO SOLVE CASES. 
### Case 1. A KNOWN DEVICE DISAPPEARS AND A NEW ONE WITH SIMILAR CHARACTERISTICS APPEARS
### Case 2. A COMPLEMENTARY DEVICE APPEARS IN A TASK

Explain what needs to be done to be able to tackle CASE 3. What other steps should we take?
Additionally, explain the weaknesses of our similarity comparison and how these could be tackled.