# Experiments with Bluetooth Low Energy Beacon Signals

## Imports

In [1]:
# Enable inline Matplotlib
%matplotlib inline

## Imports ##
# JSON
import json

# NumPy
import numpy as np

# Pandas
import pandas as pd
pd.plotting.register_matplotlib_converters()
pd.set_option('display.min_rows', 250)
pd.set_option('display.max_rows', 500)

# Matplotlib
import matplotlib.pyplot as plt

# SciPy
from scipy import stats

# seaborn
import seaborn as sns

# scikit-learn
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

## Constants and Helper Functions

In [2]:
# How long (in milliseconds) a captured beacon keeps being considered present.
# This effect can be simulated by a running average window over the collected data.
beacons_inactivity_timer = 5 * 1000
# How often (in milliseconds) the device reports about the surrounding beacons.
# This effect can be simulated by resampling the collected data.
beacons_refresh_interval = 1 * 1000

def log_loader(file_path, target_beacons, distance):
    """Load one data collection run from a JSON file and tag it with its objective.

    Parameters
    ----------
    file_path : str or path-like
        Path to the JSON file containing the data collection run.
    target_beacons : list
        The "target beacons" we were interested in during the data collection
        phase. Stored as-is under the ``'target_beacons'`` key.
    distance : number
        Distance between those target beacons and the data collection point.
        Stored as-is under the ``'distance'`` key.

    Returns
    -------
    dict
        The parsed JSON content extended with the two keys above. The top-level
        JSON value is expected to be an object, otherwise adding the keys fails.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, encoding='utf-8') as json_file:
        log = json.load(json_file)
    # The file is no longer needed once parsed, so annotate outside the ``with``.
    log['target_beacons'] = target_beacons
    log['distance'] = distance
    return log

def _is_target_reading(reading, beacon):
    """Return True when ``reading`` is an iBeacon frame whose uuid/major/minor match ``beacon``."""
    if reading['type'] != 'iBeacon':
        return False
    values = reading['values']
    return (values[0] == beacon['uuid']
            and values[1] == beacon['major']
            and values[2] == beacon['minor'])


def convert_logs_to_dataframe(logs):
    """Unroll a collection of loaded logs into a single DataFrame.

    Parameters
    ----------
    logs : iterable of dict
        Logs as returned by ``log_loader``. Each log is read through its
        ``'name'``, ``'timestamp'``, ``'sessions'``, ``'target_beacons'`` and
        ``'distance'`` keys.

    Returns
    -------
    pandas.DataFrame
        One row per reading that matches one of the log's target beacons,
        restricted to the columns ``timestamp``, ``values``, ``distance``,
        ``rssi`` and ``avgRssi`` and carrying a fresh 0..n-1 index. Within each
        log only the first row of any repeated reading timestamp is kept.
        An empty frame with those columns is returned when ``logs`` is empty.
    """
    # Column names of the per-log intermediary table (same order as the rows built below).
    table_columns = ['filename',
                     'creationTimestamp',
                     'entry',
                     'id',
                     'type',
                     'values',
                     'uuid',
                     'major',
                     'minor',
                     'txPower',
                     'rssi',
                     'timestamp',
                     'avgRssi',
                     'distance']
    # Columns that are relevant for the rest of the analysis.
    relevant_columns = ['timestamp', 'values', 'distance', 'rssi', 'avgRssi']

    # List of per log intermediary DataFrames.
    dfs = []
    for log in logs:
        # The creation timestamp is the same for every row of a log: parse it only once.
        created = pd.to_datetime(log['timestamp'], unit='ms')
        # List of lists that represents a table that will be converted into a DataFrame.
        table = []
        # Rows are gathered beacon by beacon, then session by session, then entry by entry.
        for beacon in log['target_beacons']:
            for entries in log['sessions'].values():
                for entry in entries:
                    reading = entry['reading']
                    # Skip everything that was not sent by the current target beacon.
                    if not _is_target_reading(reading, beacon):
                        continue
                    values = reading['values']
                    table.append([log['name'],
                                  created,
                                  entry['id'],
                                  reading['id'],
                                  reading['type'],
                                  str(values),
                                  values[0],
                                  values[1],
                                  values[2],
                                  reading['txPower'],
                                  reading['rssi'],
                                  pd.to_datetime(reading['timestamp'], unit='ms'),
                                  reading['avgRssi'],
                                  log['distance']])

        # Convert the plain "list of lists" into a DataFrame by specifying the column names.
        df = pd.DataFrame(table, columns=table_columns)
        # If for some reason there are duplicate timestamps on a log file drop them.
        df.drop_duplicates(subset=['timestamp'], inplace=True)
        dfs.append(df)

    # pd.concat refuses an empty list, so answer "no logs" with an empty, correctly labelled frame.
    if not dfs:
        return pd.DataFrame(columns=relevant_columns)
    # Concatenate the individual DataFrames, reset the index and select only the relevant columns.
    return pd.concat(dfs, ignore_index=True)[relevant_columns]

def per_group(d, window_ms=None):
    """Add per-sample weights and a rolling mean RSSI to one device/distance group.

    Meant to be applied to each device/distance combination, assuming that such
    a combination corresponds to one and only one of the loaded log files.

    Parameters
    ----------
    d : pandas.DataFrame
        Slice with at least a ``timestamp`` and an ``rssi`` column. ``timestamp``
        must be sorted, as required by the time-based rolling window.
    window_ms : number, optional
        Length of the rolling window in milliseconds. Defaults to the
        ``beacons_inactivity_timer`` constant, read when the function is called.

    Returns
    -------
    pandas.DataFrame
        A copy of ``d`` with two extra columns, ``weight`` and
        ``rolling_mean_rssi``. The frame passed in is left untouched.
    """
    if window_ms is None:
        window_ms = beacons_inactivity_timer
    # Work on a copy: the slices handed out by groupby must not be modified in place.
    d = d.copy()

    # Use the inverse of the absolute z-score as the weight for each of the samples.
    rssi = d['rssi'].to_numpy(dtype=float)
    with np.errstate(divide='ignore', invalid='ignore'):
        weight = 1.0 / np.abs(stats.zscore(rssi))
    finite = np.isfinite(weight)
    if finite.any():
        # A z-score of 0 turns into an infinite weight. Replace it by a number that is
        # slightly larger (1% to be exact) than the largest finite weight of the group.
        weight[np.isinf(weight)] = weight[finite].max() * 1.01
    else:
        # No spread at all (constant RSSI or a single sample): the z-score is undefined,
        # so every sample gets the same weight instead of NaN.
        weight[:] = 1.0
    d['weight'] = weight

    # Calculate a running average over the RSSI values to filter out undesired signal
    # strength fluctuation. In practice, this type of filtering needs to reach a trade-off
    # between stopping abrupt signal changes and responsiveness to user movement.
    window = pd.Timedelta(window_ms, unit='ms')
    d['rolling_mean_rssi'] = d[['timestamp', 'rssi']].rolling(window, on='timestamp')['rssi'].mean()
    # Return the DataFrame slice to be merged with all the others.
    return d

def resample_group(d, sample_interval=None):
    """Resample one device/distance group onto a regular time grid.

    Parameters
    ----------
    d : pandas.DataFrame
        Slice with a ``timestamp`` column holding unique values (duplicates
        make ``set_index`` raise because of ``verify_integrity=True``).
    sample_interval : number, optional
        Spacing of the grid in milliseconds. Defaults to the
        ``beacons_refresh_interval`` constant, read when the function is called.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the resampled timestamps in which every grid point
        carries the most recent earlier row (forward fill), plus a
        ``timestamp_diff`` column with the gap to the previous grid point.
    """
    if sample_interval is None:
        sample_interval = beacons_refresh_interval
    step = pd.Timedelta(sample_interval, unit='ms')
    indexed = d.set_index('timestamp', verify_integrity=True)
    # ``ffill`` is the supported spelling of the deprecated ``pad``.
    r = indexed.resample(step, label='right', closed='right').ffill()
    # Difference between consecutive index entries (NaT for the first row).
    r['timestamp_diff'] = r.index.to_series().diff()
    return r

## Load Data

In [None]:
# Load Data
# Generate the list of distances at which we collected samples (i.e., 0 to 10 meters in 0.5 meter intervals).
distances = np.linspace(0,10,21)
# A dictionary that describes our data collection task.
collections = {
    # A laptop
    'laptop': {
        # collected samples at these distances
        'distances': distances,
        # from the following targets:
        'targets': [{ 'uuid': '113069EC-6E64-4BD3-6810-DE01B36E8A3E', 'major': 1, 'minor': 102 }]
    },
    # A smartphone
    'smartphone':  {
        # collected samples at these distances
        'distances': distances,
        # from the following targets:
        'targets': [{ 'uuid': '113069EC-6E64-4BD3-6810-DE01B36E8A3E', 'major': 1, 'minor': 101 }]
    }
}

# Initialize a dictionary that will store the DataFrames for each of the devices, indexed by the device name.
tables = {}
# Iterate over the data collection task defined above.
for name, device in collections.items():
    # Load one log per distance; the files follow the pattern data/beacons/<device>/beacons-<distance>.json
    logs = [log_loader('data/beacons/'+name+'/beacons-'+str(d)+'.json', device['targets'], d)
            for d in device['distances']]
    # Convert the list of logs to a DataFrame and store it in the tables dictionary indexed by the device name.
    tables[name] = convert_logs_to_dataframe(logs)
    # Add a column to the DataFrame which identifies each of its lines as belonging to a certain device.
    tables[name]['device'] = name

# Concatenate the DataFrames for each device into a single DataFrame (recreate the index while at it).
data = pd.concat(list(tables.values()), ignore_index=True)
# Run "per_group" on every device/distance slice and stitch the results back together.
# The groups are iterated explicitly instead of going through groupby(...).apply(...) because per_group needs
# to hand back the grouping columns ('device' and 'distance'), and passing those columns to the applied
# function is deprecated in recent pandas releases. Each slice is copied so per_group never works on a view.
# NOTE: This is needed to perform a few extra computations for each of the logging sessions. I could have probably done this
# while loading the data but I decided to keep things separate. However, this may need to be changed if in the future
# grouping turns out not to be enough to slice the data on a "per log" basis.
data = pd.concat([per_group(group.copy()) for _, group in data.groupby(['device','distance'])],
                 ignore_index=True)

# Save the DataFrame to a CSV file.
data.to_csv('out/beacons.csv')
# Save the DataFrame to an Excel file
data.to_excel('out/beacons.xlsx')
# Display the DataFrame
data

## Linear Regression

In [None]:
# Every sample takes part in the regression (no outlier filtering is applied).
data_plot = data
data_plot_laptop = data_plot[data_plot.device == 'laptop']
data_plot_smartphone = data_plot[data_plot.device == 'smartphone']

# Feature: rolling mean RSSI. Target: distance. Both as column vectors of shape (n, 1).
X = data_plot['rolling_mean_rssi'].values.reshape(-1,1)
X_weight = data_plot['weight']
y = data_plot['distance'].values.reshape(-1,1)

X_laptop = data_plot_laptop['rolling_mean_rssi'].values.reshape(-1,1)
y_laptop = data_plot_laptop['distance'].values.reshape(-1,1)

X_smartphone = data_plot_smartphone['rolling_mean_rssi'].values.reshape(-1,1)
y_smartphone = data_plot_smartphone['distance'].values.reshape(-1,1)

# Mean raw RSSI per distance for each device.
# The columns are selected with a list: selecting them with a bare tuple is not supported by current pandas.
data_plot_smartphone_rssi_mean = data_plot_smartphone.groupby(['device','distance'])[['distance','rssi']].mean()
data_plot_laptop_rssi_mean = data_plot_laptop.groupby(['device','distance'])[['distance','rssi']].mean()

linear_regression_standard_scaler = StandardScaler()
linear_regression = LinearRegression()

weighted_linear_regression_standard_scaler = StandardScaler()
weighted_linear_regression = LinearRegression()

linear_regression_pipeline = Pipeline([
    ('scaler', linear_regression_standard_scaler),
    ('regression', linear_regression)
])
linear_regression_pipeline.fit(X, y)

weighted_linear_regression_pipeline = Pipeline([
    ('scaler', weighted_linear_regression_standard_scaler),
    ('regression', weighted_linear_regression)
])
# The sample weights are routed to the 'regression' step only.
weighted_linear_regression_pipeline.fit(X, y, regression__sample_weight = X_weight)

linear_regression_predictions = linear_regression_pipeline.predict(X)
weighted_linear_regression_predictions = weighted_linear_regression_pipeline.predict(X)

plt.figure(figsize=(16,9))

# Distance goes on the horizontal axis and RSSI on the vertical one.
plt.scatter(y_smartphone, X_smartphone, c='#ff00003f', label='Smartphone')
plt.scatter(y_laptop, X_laptop,c='#00ff003f', label='Laptop')
plt.scatter(data_plot_smartphone_rssi_mean.distance,
            data_plot_smartphone_rssi_mean.rssi,
            c='#ffff00', label='Smartphone RSSI Mean')
plt.scatter(data_plot_laptop_rssi_mean.distance,
            data_plot_laptop_rssi_mean.rssi,
            c='#0000ff', label='Laptop RSSI Mean')

plt.plot(linear_regression_predictions, X, color='red')
plt.plot(weighted_linear_regression_predictions, X, color='blue')

# Use the arrays' own reductions so the axis limits and tick ranges are plain scalars
# (the builtin min/max over a 2-D array hand back one-element arrays).
plt.xlim(y.min()-1, y.max()+1)
plt.ylim(X.min()-1, X.max()+1)

plt.xticks(np.arange(y.min(), y.max()+1, 1.0))
plt.yticks(np.arange(X.min(), X.max()+1, 2.0))

plt.xlabel('Distance (m)')
plt.ylabel('RSSI (dB)')

plt.grid(True)
plt.legend(loc='upper right');

plt.show()

# Score through the pipelines: the regressors were fitted on the scaler's output, so they have to be
# evaluated on X transformed in the same way rather than on the raw values.
linear_regression_r2 = linear_regression_pipeline.score(X, y)
weighted_linear_regression_r2 = weighted_linear_regression_pipeline.score(X, y, sample_weight=X_weight)
# R2 Score
print("Linear Regression R2 Score", linear_regression_r2)
print("Weighted Linear Regression R2 Score", weighted_linear_regression_r2)

# Equations
# NOTE: the coefficients belong to the 'regression' step, so "x" is the scaler's output, not the raw RSSI.
print('Linear Regreation: y =',str(linear_regression.coef_[0][0])+'x +',linear_regression.intercept_[0])
print('Weighted Linear Regreation: y =',str(weighted_linear_regression.coef_[0][0])+'x +',weighted_linear_regression.intercept_[0])

### Preparing Data for Cross Validation

In [None]:
random_state = 656
# The distances, turned into strings, act as the class labels the folds are stratified on.
y_labels = data['distance'].astype(str).values.reshape(-1,1)
# Both models are scored with the mean absolute error.
scorer = make_scorer(mean_absolute_error)
# Settings shared by the splitters of both evaluations, so that they produce the same folds.
kfold_options = dict(n_splits=3, shuffle=True, random_state=random_state)

# split() hands back a one-shot generator, hence a new one is created for each evaluation.
crossval = StratifiedKFold(**kfold_options).split(X, y_labels)
plain_fold_scores = cross_val_score(linear_regression_pipeline, X, y,
                                    cv=crossval, scoring=scorer)
linear_regression_scores = pd.DataFrame({'scores': plain_fold_scores})
print('Linear Regression')
print(linear_regression_scores)
print(linear_regression_scores.describe())

crossval = StratifiedKFold(**kfold_options).split(X, y_labels)
# The sample weights are forwarded to the 'regression' step of the pipeline on every fit.
weighted_fit_params = { 'regression__sample_weight': X_weight }
weighted_fold_scores = cross_val_score(weighted_linear_regression_pipeline, X, y,
                                       cv=crossval, scoring=scorer,
                                       fit_params = weighted_fit_params)
weighted_linear_regression_scores = pd.DataFrame({'scores': weighted_fold_scores})
print('Weighted Linear Regression')
print(weighted_linear_regression_scores)
print(weighted_linear_regression_scores.describe())

## Raw Data Visualization

### "General" Boxplot

In [None]:
# Using Seaborn Boxplot to get an overview of the whole distance vs. rssi correlation.
# NOTE: I could have possibly made this with plain Matplotlib but Seaborn makes it prettier and easier.
fig = plt.figure(figsize=(16,9))
# x and y are passed by keyword: current seaborn releases no longer accept them positionally.
sns.boxplot(x=data['distance'], y=data['rssi'])
plt.show()

### RSSI over time per distance and device

In [None]:
# Overall RSSI range, so that every plot shares the same tick marks on the vertical axis.
min_rssi = data['rssi'].min()
max_rssi = data['rssi'].max()
rssi_ticks = np.arange(min_rssi, max_rssi, 2.0)

# One figure per distance/device combination.
for k, d in data.groupby(['distance','device']):
    distance, device = k
    # Resample the group to one row per second for the rolling mean curve.
    dr = resample_group(d, 1000)
    plt.figure(figsize=(8, 4))
    plt.title('Device: {} Distance: {} m'.format(device, str(distance)))
    plt.xlabel('Time')
    plt.ylabel('RSSI (dB)')
    # Raw samples in blue, resampled rolling mean in red.
    plt.plot(d['timestamp'], d['rssi'], label='RSSI', color='blue')
    plt.plot(dr.index, dr['rolling_mean_rssi'], label='Rolling Mean RSSI', color='red')
    plt.yticks(rssi_ticks)
    plt.legend(loc='lower right');
    plt.grid()
    plt.show()

### Resample Data

In [None]:
# Just resampling the data for future use (still nothing to do with it right now).
# The groups are iterated explicitly instead of going through groupby(...).apply(...): the resampled frames
# have to keep the 'device' and 'distance' columns, and passing the grouping columns to the applied function
# is deprecated in recent pandas releases.
# NOTE(review): ignore_index=True discards the resampled timestamps (they are the index of each frame), just
# like the reset_index(drop=True) used before. Confirm whether they should be kept as a column instead.
data_resample = pd.concat([resample_group(group) for _, group in data.groupby(['device','distance'])],
                          ignore_index=True)
# Display the result
data_resample