# Import

In [0]:
#!/usr/bin/env python3

import json
from collections import defaultdict
import statistics
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import argparse
from google.colab import files
from matplotlib.pyplot import figure
from google.colab import drive
import pandas as pd
import math
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Functions

In [0]:
def read_csv(filename):
  return pd.read_csv(filename, error_bad_lines=False, header=None, names = list(range(0,76)))

In [0]:
def get_all_APs(combined_data):
  # APdf contains AP addresses
  APdf = combined_data.loc[:, ::2]
  APdf.drop(APdf.columns[[0]], axis=1, inplace=True)

  # Get unique AP addresses and store them in APlist
  APlist = list()
  for (columnName, columnData) in APdf.iteritems():
    APlist = APlist + list(APdf[columnName].unique())
  APlist = set(APset)
  APlist = [x for x in APlist if str(x) != 'nan']
  print('Total number of APs in the given dataset: ' + str(len(APlist)))
  return APlist

In [0]:
def clean_data(df_toClean, column_names): 
  df_cleaned = pd.DataFrame()
  for i in range(df_toClean.shape[0]):
    newDfRow = []
    for AP in column_names:
      flag = 0
      for col in df_toClean.iloc[i]:
        # read the next comment
        if flag == 1:
          newDfRow.append(col) 
          break

        # if the AP is found in the current row, set flag, so that the signal strength of the AP can be added to df_cleaned
        if AP == col: 
          flag = 1

      # if the column is not present in the current row, impute -120 dBm
      if flag == 0: 
        newDfRow.append(-120)

    # Populate cleaned dataframe
    if i == 0:
      df_cleaned = pd.DataFrame(np.array([newDfRow]), columns=column_names)
    else:
      newDfTemp = pd.DataFrame(np.array([newDfRow]), columns=column_names)
      df_cleaned = df_cleaned.append(newDfTemp)
  return df_cleaned

In [0]:
def knn_est_lat_long(k_val):

  # Store the original and estimated co-ordinates in positionDf dataframe
  positionDf = pd.DataFrame()
  positionDf_col = ['lat','long','est_lat','est_long']

  for i in range(test_cleaned.shape[0]):

    # Find Euclidean Distance
    diffDf = train_cleaned.subtract(test_cleaned.iloc[i], axis=1)
    diffDf = np.square(diffDf)
    diffDf['sum'] = diffDf.sum(axis=1)
    diffDf.drop(diffDf.columns.difference(['sum']), 1, inplace=True)
    diffDf = diffDf.reset_index(drop = True) 

    # Filter the closest K-Nearest Neighbours
    neighDf = diffDf.nsmallest(k_val, 'sum', keep='all')

    # Estimate latitude & longitude values
    # Sum of kNN lat & long
    sum_lat = 0
    sum_long = 0
    for row in neighDf.index:
      sum_lat += train.iloc[row][0]
      sum_long += train.iloc[row][1]
    # Average
    est_lat = sum_lat/neighDf.shape[0]
    est_long = sum_long/neighDf.shape[0]

    # Update positionDf with the estimated co-ordinate values
    if i == 0:
      positionDf = pd.DataFrame(np.array([[test.iloc[i][0],test.iloc[i][1],est_lat,est_long]]), columns=positionDf_col)
    else:
      newDfTemp = pd.DataFrame(np.array([[test.iloc[i][0],test.iloc[i][1],est_lat,est_long]]), columns=positionDf_col)
      positionDf = positionDf.append(newDfTemp)
  print(positionDf)
  return positionDf

In [0]:
def calc_localization_error(positionDf):

  # Calculate distance between in original & estimated position co-ordinates in miles and km
  positionDf["diff_lat"] = positionDf["lat"] - positionDf["est_lat"]
  positionDf["diff_long"] = positionDf["long"] - positionDf["est_long"]
  a = np.square(np.sin(positionDf["diff_lat"]/2)) + np.cos(positionDf["lat"]) * np.cos(positionDf["est_lat"]) 
                                                      * np.square(np.sin(positionDf["diff_long"]/2))
  c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
  
  # Calculate distance in miles
  positionDf["distanceMiles"] = 3961 * c
  
  # Calculate distance in km
  positionDf["distanceKM"] = 6373 * c
  print(positionDf)
  return positionDf

In [0]:
def print_result(positionDf, knn_value):

  # Print result in miles
  print("When k = " + str(knn_value) + " Nearest Neighbours are considered:")
  print(" Median error: " + str(positionDf.distanceMiles.median()))
  print(" 67 percentile error: " + str(positionDf.distanceMiles.quantile(0.67)))
  print(" 90 percentile error: " + str(positionDf.distanceMiles.quantile(0.9)))
  print()

In [0]:
def print_result_km(positionDf, knn_value):
  
  # Print result in km
  print("When k = " + str(knn_value) + " Nearest Neighbours are considered:")
  print(" Median error: " + str(positionDf.distanceKM.median()))
  print(" 67 percentile error: " + str(positionDf.distanceKM.quantile(0.67)))
  print(" 90 percentile error: " + str(positionDf.distanceKM.quantile(0.9)))
  print()

# Part 1: Initial Data Processing

## Import csv

In [0]:
root_dir = '/content/drive/My Drive/WMN2/'
train = read_csv(root_dir + 'Training_data.csv')
test = read_csv(root_dir + 'Test_data.csv')
combined_data = train.append(test)

# print dataframe shapes
print(train.shape)
print(test.shape)
print(combined_data.shape)

(1194, 76)
(138, 76)
(1332, 76)


## Get the list of APs

In [0]:
APlist = get_all_APs(combined_data)

Total number of APs in the given dataset: 837


## Impute missing values

In [0]:
train_cleaned = clean_data(train, APlist)
test_cleaned = clean_data(test, APlist)

# print dataframe shapes
print(train_cleaned.shape)
print(test_cleaned.shape)

(1194, 837)
(138, 837)


# Part 2: Localization

## Find kNN & estimate latitude, longitude co-ordinates

In [0]:
positionDf_knn3 = knn_est_lat_long(3)
positionDf_knn4 = knn_est_lat_long(4)
positionDf_knn5 = knn_est_lat_long(5)

          lat       long    est_lat   est_long
0   40.906567 -73.108602  40.906571 -73.108614
0   40.906597 -73.108673  40.906605 -73.108687
0   40.906615 -73.108704  40.906625 -73.108720
0   40.906627 -73.108725  40.906630 -73.108728
0   40.906644 -73.108745  40.906641 -73.108741
..        ...        ...        ...        ...
0   40.906850 -73.108272  40.906854 -73.108281
0   40.906842 -73.108270  40.906842 -73.108271
0   40.906842 -73.108272  40.906842 -73.108271
0   40.906848 -73.108266  40.906847 -73.108266
0   40.906638 -73.108431  40.906650 -73.108426

[138 rows x 4 columns]
          lat       long    est_lat   est_long
0   40.906567 -73.108602  40.906574 -73.108625
0   40.906597 -73.108673  40.906606 -73.108689
0   40.906615 -73.108704  40.906625 -73.108720
0   40.906627 -73.108725  40.906625 -73.108720
0   40.906644 -73.108745  40.906650 -73.108750
..        ...        ...        ...        ...
0   40.906850 -73.108272  40.906850 -73.108271
0   40.906842 -73.108270  40.906842 

## Compute Localization Error

In [0]:
positionDf_knn3 = calc_localization_error(positionDf_knn3)
positionDf_knn4 = calc_localization_error(positionDf_knn4)
positionDf_knn5 = calc_localization_error(positionDf_knn5)

          lat       long    est_lat  ...     diff_long  distanceMiles  distanceKM
0   40.906567 -73.108602  40.906571  ...  1.289000e-05       0.053664    0.086342
0   40.906597 -73.108673  40.906605  ...  1.379667e-05       0.063255    0.101774
0   40.906615 -73.108704  40.906625  ...  1.579500e-05       0.073497    0.118252
0   40.906627 -73.108725  40.906630  ...  3.010000e-06       0.017432    0.028047
0   40.906644 -73.108745  40.906641  ... -3.743333e-06       0.017737    0.028538
..        ...        ...        ...  ...           ...            ...         ...
0   40.906850 -73.108272  40.906854  ...  8.443333e-06       0.035888    0.057741
0   40.906842 -73.108270  40.906842  ...  1.100000e-06       0.004518    0.007269
0   40.906842 -73.108272  40.906842  ... -5.500000e-07       0.002995    0.004818
0   40.906848 -73.108266  40.906847  ...  1.833333e-07       0.002713    0.004365
0   40.906638 -73.108431  40.906650  ... -5.786667e-06       0.051065    0.082161

[138 rows x 8 c

# Result

## Result in miles

In [0]:
print_result_miles(positionDf_knn3, 3)
print_result_miles(positionDf_knn4, 4)
print_result_miles(positionDf_knn5, 5)

When k = 3 Nearest Neighbours are considered:
 Median error: 0.05412130633831588
 67 percentile error: 0.07257826825888594
 90 percentile error: 0.1349382128040697

When k = 4 Nearest Neighbours are considered:
 Median error: 0.06651225517358414
 67 percentile error: 0.08843083221850023
 90 percentile error: 0.15952889777751433

When k = 5 Nearest Neighbours are considered:
 Median error: 0.07828930635986234
 67 percentile error: 0.10608027759309598
 90 percentile error: 0.17633717991532963



## Result in kilometers

In [0]:
print_result_km(positionDf_knn3, 3)
print_result_km(positionDf_knn4, 4)
print_result_km(positionDf_knn5, 5)

When k = 3 Nearest Neighbours are considered:
 Median error: 0.08707777967535651
 67 percentile error: 0.11677387114715478
 90 percentile error: 0.21710710179256154

When k = 4 Nearest Neighbours are considered:
 Median error: 0.10701403742015948
 67 percentile error: 0.14227965001981874
 90 percentile error: 0.2566719680727339

When k = 5 Nearest Neighbours are considered:
 Median error: 0.1259625724391322
 67 percentile error: 0.17067649813198704
 90 percentile error: 0.28371543741489413

