# CruncherLab #

## Import Modules ##

In [162]:
import getopt
import os
import sys

from sklearn import preprocessing

from yanux.cruncher.model.loader import JsonLoader
from yanux.cruncher.model.wifi import WifiLogs
from yanux.cruncher.ml.experiments import *

## Initialize Input & Output Data Directories and other parameters ##

In [163]:
input_data_directory = "data"
output_data_directory = "out"

print("Input Data Directory is:", input_data_directory)
print("Output Data Directory is", output_data_directory)

Input Data Directory is: data
Output Data Directory is out


### Create the output directory if it doesn't exist ###

In [164]:
if not os.path.exists(output_data_directory):
    os.makedirs(output_data_directory)

## Load Data from the Input Data Directory ##

In [165]:
json_loader = JsonLoader(input_data_directory)
wifi_logs = WifiLogs(json_loader.json_data)

Store the data into a Pandas Dataframe, in which each Wi-Fi result reading is represented by a single line

In [166]:
wifi_results_columns = ["filename", "x", "y", "floor", "orientation", "sample_id", "mac_address",
                        "timestamp", "signal_strength"]

wifi_results = pd.DataFrame(wifi_logs.wifi_results(), columns=wifi_results_columns)
wifi_results.to_csv(output_data_directory + "/wifi_results.csv")

Identify the unique MAC Addresses present in the recorded data. Each one represents a single Wi-Fi Access Point.

In [167]:
mac_addresses = wifi_results.mac_address.unique()

Similarly, store the data into a Pandas Dataframe in which each line represents a single sampling cycle with *n* different readings for each of the Access Points within range. Those readings are stored as columns along each sample.

In [168]:
wifi_samples_columns = ["filename", "x", "y", "floor", "orientation", "sample_id", "timestamp"]
wifi_samples_columns.extend(mac_addresses)

wifi_samples = pd.DataFrame(wifi_logs.wifi_samples(), columns=wifi_samples_columns)
wifi_samples = wifi_samples.sort_values(["filename", "x", "y", "floor", "sample_id"]).reset_index(drop=True)
wifi_samples.to_csv(output_data_directory + "/wifi_samples.csv")

## Analyzing the Data Set ##

# *** TODO *** #

## Generate Train and Test Scenario ##

Initialize the train and test scenario generation parameters

In [169]:
raw = True
groupby_mean = False
groupby_max = False
groupby_min = False
data_partials = [0.5, 0.15]
test_data_partials = [0.5, 0.15]
filename_prefixes = ["point", "altPoint"]
subset_locations_values = [0.24]

In [170]:
print("Generating Training and Test Data...")
data_scenarios = {}
test_data_scenarios = {}
prepare_full_data_scenarios(wifi_samples, data_scenarios,
                            raw=raw,
                            groupby_mean=groupby_mean,
                            groupby_max=groupby_max,
                            groupby_min=groupby_min)
prepare_full_data_scenarios(wifi_samples, test_data_scenarios,
                            raw=raw,
                            groupby_mean=groupby_mean,
                            groupby_max=groupby_max,
                            groupby_min=groupby_min)

# prepare_partial_data_scenarios(wifi_samples, data_scenarios,
#                                slice_at_the_end=False,
#                                raw=raw,
#                                groupby_mean=groupby_mean,
#                                groupby_max=groupby_max,
#                                groupby_min=groupby_min,
#                                partials=data_partials)
# prepare_partial_data_scenarios(wifi_samples, test_data_scenarios,
#                                slice_at_the_end=True,
#                                raw=raw,
#                                groupby_mean=groupby_mean,
#                                groupby_max=groupby_max,
#                                groupby_min=groupby_min,
#                                partials=test_data_partials)

# for filename_prefix in filename_prefixes:
#     prepare_filename_startswith_data_scenarios(wifi_samples, data_scenarios,
#                                                raw=raw,
#                                                groupby_mean=groupby_mean,
#                                                groupby_max=groupby_max,
#                                                groupby_min=groupby_min,
#                                                filename_startswith=filename_prefix)
# for filename_prefix in filename_prefixes:
#     prepare_filename_startswith_data_scenarios(wifi_samples, test_data_scenarios,
#                                                raw=raw,
#                                                groupby_mean=groupby_mean,
#                                                groupby_max=groupby_max,
#                                                groupby_min=groupby_min,
#                                                filename_startswith=filename_prefix)

# for subset_locations in subset_locations_values:
#     prepare_full_data_scenarios(subset_wifi_samples_locations(wifi_samples, subset_locations), data_scenarios,
#                                 raw=raw,
#                                 groupby_mean=groupby_mean,
#                                 groupby_max=groupby_max,
#                                 groupby_min=groupby_min,
#                                 scenarios_suffix="subset_locations=" + str(subset_locations))

# path_direction_aggregated_data_scenarios(wifi_samples, data_scenarios,
#                                          groupby_mean=groupby_mean,
#                                          groupby_max=groupby_max,
#                                          groupby_min=groupby_min)

save_scenarios(data_scenarios, output_directory=output_data_directory, prefix="train_")
print("# Data Scenarios: " + str(len(data_scenarios)))
save_scenarios(test_data_scenarios, output_directory=output_data_directory, prefix="test_")
print("# Test Scenarios: " + str(len(test_data_scenarios)))

Generating Training and Test Data...
# Data Scenarios: 1
# Test Scenarios: 1


## Playground ##

### # Neighbors ###
Test how the *k* value influences performance metrics

In [171]:
n_neighbors=range(1,11,2)
weights="uniform"
metric="euclidean"
nan_filler=-100

curr_data = data_scenarios["full_data"].fillna(nan_filler)
curr_test_data = test_data_scenarios["full_data"].fillna(nan_filler)

# Just a metrics accumulator
metrics = []
for k in n_neighbors:
    curr_metrics = experiment_metrics(knn_experiment(curr_data,
                                                     mac_addresses,
                                                     ["x", "y"],
                                                     algorithm="brute",
                                                     n_neighbors=k,
                                                     weights=weights,
                                                     metric=metric,
                                                     test_data=curr_test_data))
    curr_metrics["k"] = "k="+str(k)
    metrics.append(curr_metrics)

cols = ["k"] + list(curr_metrics.keys())[:-1]
metrics_table = pd.DataFrame(metrics, columns=cols)
metrics_table

Unnamed: 0,k,mean_absolute_error,std_dev_distance_error,mean_squared_error,percentile_25,percentile_50,percentile_75,percentile_90,percentile_95,min,max
0,k=1,2.67934,1.347943,8.994,2.0,2.0,4.0,4.0,6.0,1.581139,8.0
1,k=3,2.400288,1.318256,7.497444,2.0,2.0,3.333333,4.0,4.666667,0.0,9.333333
2,k=5,2.362713,1.322088,7.32858,1.581139,2.0,3.2,4.0,4.8,0.0,9.2
3,k=7,2.345407,1.308725,7.21198,1.428571,2.0,3.142857,4.0,4.571429,0.0,9.428571
4,k=9,2.325253,1.313599,7.130617,1.555556,2.0,3.111111,4.0,4.666667,0.0,9.555556


### Weights ###
Check whether the neighbors should have the same (*uniform*) or a weighted (*distance*-based) influence in the regression result.

In [172]:
n_neighbors=range(2,6,1)
weights=["uniform", "distance"]
metric="euclidean"
nan_filler=-100

curr_data = data_scenarios["full_data"].fillna(nan_filler)
curr_test_data = test_data_scenarios["full_data"].fillna(nan_filler)

# Just a metrics accumulator
metrics = []
for k in n_neighbors:
    for w in weights:
        curr_metrics = experiment_metrics(knn_experiment(curr_data,
                                                         mac_addresses,
                                                         ["x", "y"],
                                                         algorithm="brute",
                                                         n_neighbors=k,
                                                         weights=w,
                                                         metric=metric,
                                                         test_data=curr_test_data))
        curr_metrics["k"] = k
        curr_metrics["weights"] = w
        metrics.append(curr_metrics)

cols = ["k","weights"] + list(curr_metrics.keys())[:-1]
metrics_table = pd.DataFrame(metrics, columns=cols)
metrics_table

Unnamed: 0,k,weights,mean_absolute_error,std_dev_distance_error,mean_squared_error,percentile_25,percentile_50,percentile_75,percentile_90,percentile_95,min,max,k.1
0,2,uniform,2.428117,1.382673,7.805625,2.0,2.0,3.0,4.0,5.0,0.0,9.0,2
1,2,distance,2.433114,1.374071,7.806227,2.0,2.0,3.014315,4.0,5.019448,0.001178,8.99913,2
2,3,uniform,2.400288,1.318256,7.497444,2.0,2.0,3.333333,4.0,4.666667,0.0,9.333333,3
3,3,distance,2.399842,1.314283,7.484855,2.0,2.0,3.219515,4.0,4.731515,0.004412,9.326577,3
4,4,uniform,2.383901,1.326683,7.441313,1.581139,2.0,3.0,4.0,5.0,0.0,9.5,4
5,4,distance,2.384371,1.317142,7.418354,1.581139,2.0,3.015086,4.0,4.995867,0.012303,9.488821,4
6,5,uniform,2.362713,1.322088,7.32858,1.581139,2.0,3.2,4.0,4.8,0.0,9.2,5
7,5,distance,2.361716,1.318441,7.314252,1.581139,2.0,3.186078,4.0,4.816928,0.001617,9.204335,5


### Metric ###
Just test a few different distance metrics to assess if there is a better alternative than the plain old *euclidean* distance. The tested metrics include:
- Euclidean Distance
    - sqrt(sum((x - y)^2))
- Manhattan Distance
    - sum(|x - y|) 
- Chebyshev Distance
    - sum(max(|x - y|))
- Hamming Distance
    - N_unequal(x, y) / N_tot
- Canberra Distance
    - sum(|x - y| / (|x| + |y|))
- Braycurtis Similarity
    - sum(|x - y|) / (sum(|x|) + sum(|y|))
- S Euclidean Distance
    - sqrt(sum((x - y)^2 / V))
- Mahalanobis Distance
    - sqrt((x - y)' V^-1 (x - y))

The possible arguments are the following:
- p = The order of the norm of the difference
- V = array_like symmetric positive-definite covariance matrix.
- w = (N,) array_like weight vector.

In [173]:
n_neighbors=3
weights="uniform"
metric=["euclidean","manhattan", "chebyshev",
        "hamming", "canberra", "braycurtis",
        "seuclidean", "mahalanobis"]
nan_filler=-100

curr_data = data_scenarios["full_data"].fillna(nan_filler)
curr_test_data = test_data_scenarios["full_data"].fillna(nan_filler)

# Just a metrics accumulator
metrics = []
for m in metric:
    if metric in ["mahalanobis", "seuclidean"]:
        metric_params = {'V': np.cov(curr_data[mac_addresses])}
    else:
        metric_params = None
    curr_metrics = experiment_metrics(knn_experiment(curr_data,
                                                     mac_addresses,
                                                     ["x", "y"],
                                                     algorithm="brute",
                                                     n_neighbors=n_neighbors,
                                                     weights=weights,
                                                     metric=m,
                                                     metric_params=metric_params,
                                                     test_data=curr_test_data))
    curr_metrics["metric"] = m
    metrics.append(curr_metrics)

cols = ["metric"] + list(curr_metrics.keys())[:-1]
metrics_table = pd.DataFrame(metrics, columns=cols)
metrics_table

Unnamed: 0,metric,mean_absolute_error,std_dev_distance_error,mean_squared_error,percentile_25,percentile_50,percentile_75,percentile_90,percentile_95,min,max
0,euclidean,2.400288,1.318256,7.497444,2.0,2.0,3.333333,4.0,4.666667,0.0,9.333333
1,manhattan,2.343701,1.345439,7.301333,1.581139,2.0,3.333333,4.0,4.666667,0.0,8.666667
2,chebyshev,2.819778,1.867725,11.436056,2.0,2.0,3.613029,5.333333,6.0,0.0,12.0
3,hamming,3.000341,2.160378,13.664611,1.581139,2.666667,4.0,6.0,7.333333,0.0,17.333333
4,canberra,2.403883,1.429142,7.819056,2.0,2.0,3.333333,4.0,5.333333,0.0,10.0
5,braycurtis,2.349035,1.352972,7.346667,1.581139,2.0,3.333333,4.0,4.666667,0.0,8.666667
6,seuclidean,2.398519,1.284537,7.401278,2.0,2.0,3.333333,4.0,4.666667,0.0,8.666667
7,mahalanobis,3.770477,3.839462,28.943222,2.0,2.666667,4.666667,6.666667,10.021749,0.0,31.941787


### NaN filler values ###

Test which is the signal strength value that should be considered for Access Points that are currently out of range. This is needed as part of the process of computing the distance/similarity between different fingerprints.

In [174]:
n_neighbors=3
weights="uniform"
metric="euclidean"
nan_filler = [-1000000, -100, 0, 100, 1000000,
              data_scenarios["full_data"][mac_addresses].min().min()-1] 

# Just a metrics accumulator
metrics = []
for nf in nan_filler:
    curr_data = data_scenarios["full_data"].fillna(nf)
    curr_test_data = test_data_scenarios["full_data"].fillna(nf)
    curr_metrics = experiment_metrics(knn_experiment(curr_data,
                                                     mac_addresses,
                                                     ["x", "y"],
                                                     algorithm="brute",
                                                     n_neighbors=n_neighbors,
                                                     weights=weights,
                                                     metric=metric,
                                                     test_data=curr_test_data))
    curr_metrics["nan_filler"] = nf
    metrics.append(curr_metrics)

cols = ["nan_filler"] + list(curr_metrics.keys())[:-1]
metrics_table = pd.DataFrame(metrics, columns=cols)
metrics_table

Unnamed: 0,nan_filler,mean_absolute_error,std_dev_distance_error,mean_squared_error,percentile_25,percentile_50,percentile_75,percentile_90,percentile_95,min,max
0,-1000000.0,2.666306,1.488223,9.321778,2.0,2.0,3.887301,4.666667,6.0,0.0,9.333333
1,-100.0,2.400288,1.318256,7.497444,2.0,2.0,3.333333,4.0,4.666667,0.0,9.333333
2,0.0,2.673226,1.552882,9.555167,2.0,2.0,3.333333,4.666667,6.0,0.0,10.413666
3,100.0,2.692669,1.590058,9.776222,2.0,2.0,3.333333,4.666667,6.0,0.0,11.794537
4,1000000.0,2.733955,1.640245,10.162222,2.0,2.0,3.399346,4.666667,6.0,0.0,11.794537
5,-93.0,2.32797,1.289616,7.080889,2.0,2.0,2.666667,4.0,4.666667,0.0,9.333333


### Units ###
- dBm
- mW

In [175]:
n_neighbors=3
weights="uniform"
metric="euclidean"
nan_filler=-100

# Just a metrics accumulator
metrics = []

# Use the directly measured dBm values
curr_data = data_scenarios["full_data"].fillna(nan_filler)
curr_test_data = test_data_scenarios["full_data"].fillna(nan_filler)
curr_metrics = experiment_metrics(knn_experiment(curr_data,
                                                 mac_addresses,
                                                 ["x", "y"],
                                                 algorithm="brute",
                                                 n_neighbors=n_neighbors,
                                                 weights=weights,
                                                 metric=metric,
                                                 test_data=curr_test_data))
curr_metrics["units"] = "dBm"
metrics.append(curr_metrics)

# Convert to mW
curr_data[mac_addresses] = convert_to_units(curr_data[mac_addresses], from_units="dBm", to_units="mW")
curr_test_data[mac_addresses] = convert_to_units(curr_test_data[mac_addresses], from_units="dBm", to_units="mW")
curr_metrics = experiment_metrics(knn_experiment(curr_data,
                                                 mac_addresses,
                                                 ["x", "y"],
                                                 algorithm="brute",
                                                 n_neighbors=n_neighbors,
                                                 weights=weights,
                                                 metric=metric,
                                                 test_data=curr_test_data))
curr_metrics["units"] = "mW"
metrics.append(curr_metrics)

    
cols = ["units"] + list(curr_metrics.keys())[:-1]
metrics_table = pd.DataFrame(metrics, columns=cols)
metrics_table

Unnamed: 0,units,mean_absolute_error,std_dev_distance_error,mean_squared_error,percentile_25,percentile_50,percentile_75,percentile_90,percentile_95,min,max
0,dBm,2.400288,1.318256,7.497444,2.0,2.0,3.333333,4.0,4.666667,0.0,9.333333
1,mW,4.311673,3.120495,28.318278,2.0,3.333333,6.0,9.333333,10.0,0.0,22.769375


### Scaling ###

In [176]:
n_neighbors=3
weights="uniform"
metric="euclidean"
nan_filler = -100

scaler_values = {"None": None,
                 "MinMaxScaler": preprocessing.MinMaxScaler(),
                 "StandardScaler": preprocessing.StandardScaler(),
                 "RobustScaler": preprocessing.RobustScaler(),
                 "NormalizerEuclidean": preprocessing.Normalizer(norm="l2"),
                 "NormalizerManhattan": preprocessing.Normalizer(norm="l1")}



# Just a metrics accumulator
metrics = []



for scaler_name, scaler in scaler_values.items():
    curr_data = data_scenarios["full_data"].fillna(nan_filler)
    curr_test_data = test_data_scenarios["full_data"].fillna(nan_filler)
    if scaler is not None:
        scaler.fit(curr_data[mac_addresses])
        curr_data[mac_addresses] = pd.DataFrame(scaler.transform(curr_data[mac_addresses]), columns=mac_addresses)
        curr_test_data[mac_addresses] = pd.DataFrame(scaler.transform(curr_test_data[mac_addresses]), columns=mac_addresses)
    
    curr_metrics = experiment_metrics(knn_experiment(curr_data,
                                                     mac_addresses,
                                                     ["x", "y"],
                                                     algorithm="brute",
                                                     n_neighbors=n_neighbors,
                                                     weights=weights,
                                                     metric=metric,
                                                     metric_params=metric_params,
                                                     test_data=curr_test_data))
    curr_metrics["scaler"] = scaler_name
    metrics.append(curr_metrics)

cols = ["scaler"] + list(curr_metrics.keys())[:-1]
metrics_table = pd.DataFrame(metrics, columns=cols)
metrics_table

Unnamed: 0,scaler,mean_absolute_error,std_dev_distance_error,mean_squared_error,percentile_25,percentile_50,percentile_75,percentile_90,percentile_95,min,max
0,MinMaxScaler,2.347341,1.292369,7.178556,2.0,2.0,3.333333,4.0,4.666667,0.0,10.0
1,RobustScaler,2.458955,1.38211,7.954778,2.0,2.0,3.333333,4.0,5.333333,0.0,8.02773
2,StandardScaler,2.398519,1.284537,7.401278,2.0,2.0,3.333333,4.0,4.666667,0.0,8.666667
3,,2.400288,1.318256,7.497444,2.0,2.0,3.333333,4.0,4.666667,0.0,9.333333
4,NormalizerEuclidean,2.398066,1.360154,7.598889,2.0,2.0,3.333333,4.0,4.666667,0.0,10.0
5,NormalizerManhattan,2.402366,1.352824,7.599667,2.0,2.0,3.333333,4.0,4.666667,0.0,10.0


### Different Data Scenarios ###

## Parameter Sweeping ##

Initialize some variables with the values of each parameter that is going to be swept.

In [177]:
k_neighbors_values = range(1,2)
weights_values = ["uniform", "distance"]
metric_values = ["euclidean", "manhattan", "chebyshev", "canberra", "braycurtis"]
nan_filler_values = [-100.0, -100000.0]
units_values = ["dBm", "mW"]
scaler_values = {"None": None,
                 "MinMaxScaler": preprocessing.MinMaxScaler(),
                 "StandardScaler": preprocessing.StandardScaler(),
                 "RobustScaler": preprocessing.RobustScaler(),
                 "NormalizerEuclidean": preprocessing.Normalizer(norm="l2"),
                 "NormalizerManhattan": preprocessing.Normalizer(norm="l1")}

Do the actual parameter sweeping and keep track of the metrics for each parameter combination.

In [178]:
scenarios = []
scenario_keys = None
for k_neighbors in k_neighbors_values:
    for weights in weights_values:
        for metric in metric_values:
            for nan_filler in nan_filler_values:
                for units in units_values:
                    for scaler_name, scaler in scaler_values.items():
                        for data_scenario, data in data_scenarios.items():
                            for test_data_scenario, test_data in test_data_scenarios.items():
                                if k_neighbors < len(data):
#                                     print("train_data =", data_scenario)
#                                     print("test_data =", test_data_scenario)
#                                     print("train_data_size =", len(data))
#                                     print("test_data_size =", len(test_data))
#                                     print("algorithm =", "KNeighborsRegressor")
#                                     print("n_neighbors =", k_neighbors)
#                                     print("weights =", weights)
#                                     print("metric =", metric)
#                                     print("nan_filler =", nan_filler)
#                                     print("units =", units)
#                                     print("scaler =", scaler_name)
#                                     print("----------------------------------------------------------------")
                                    print(".", end='')
                                    scenario = collections.OrderedDict([("train_data", data_scenario),
                                                                        ("test_data", test_data_scenario),
                                                                        ("train_data_size", len(data)),
                                                                        ("test_data_size", len(test_data)),
                                                                        ("algorithm", "KNeighborsRegressor"),
                                                                        ("n_neighbors", k_neighbors),
                                                                        ("weights", weights),
                                                                        ("metric", metric),
                                                                        ("nan_filler", nan_filler),
                                                                        ("units", units),
                                                                        ("scaler", scaler_name)])
                                    curr_data = data.fillna(nan_filler)
                                    curr_test_data = test_data.fillna(nan_filler)
                                    curr_data[mac_addresses] = convert_to_units(curr_data[mac_addresses],
                                                                                from_units="dBm",
                                                                                to_units=units)
                                    curr_test_data[mac_addresses] = convert_to_units(curr_test_data[mac_addresses],
                                                                                     from_units="dBm",
                                                                                     to_units=units)
                                    if scaler is not None:
                                        scaler.fit(curr_data[mac_addresses])
                                        curr_data[mac_addresses] = pd.DataFrame(scaler.transform(curr_data[mac_addresses]),
                                                                                columns=mac_addresses)
                                        curr_test_data[mac_addresses] = pd.DataFrame(scaler.transform(curr_test_data[mac_addresses]),
                                                                                     columns=mac_addresses)
                                    scenario.update(experiment_metrics(knn_experiment(curr_data,
                                                                                      mac_addresses,
                                                                                      ["x", "y"],
                                                                                      algorithm="brute",
                                                                                      n_neighbors=k_neighbors,
                                                                                      weights=weights,
                                                                                      metric=metric,
                                                                                      test_data=curr_test_data)))
                                    scenario_keys = scenario.keys()
                                    scenarios.append(scenario)

print("\n"+str(len(scenarios))+" scenarios have been simulated.")

................................................................................................................................................................................................................................................
240 scenarios have been simulated.


Save the metrics to disk for further analysis.

In [179]:
metrics = pd.DataFrame(scenarios, columns=scenario_keys)
metrics.to_csv(output_data_directory + "/metrics.csv")