In [1]:
# Python >= 3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn >= 0.20 is required.
# The (major, minor) numbers are compared as integers: comparing the raw
# version strings is lexicographic, so e.g. "0.3" >= "0.20" would pass.
import re
import sklearn
assert tuple(int(n) for n in re.findall(r"\d+", sklearn.__version__)[:2]) >= (0, 20)

# common imports
import pandas as pd
import numpy as np 
import os 

# to plot pretty figures
%matplotlib inline 
import matplotlib as mpl 
import matplotlib.pyplot as plt 
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# where to save the figures: an "images" folder under the project root,
# created up front (exist_ok=True makes re-running this cell harmless)
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    The file is named ``<fig_id>.<fig_extension>``. When ``tight_layout`` is
    true, ``plt.tight_layout()`` is applied before saving. ``resolution`` is
    passed to ``plt.savefig`` as the dpi.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [2]:
from imutils import paths

# define lists of csv files
train_csvs = []
val_csvs = []
test_csvs = []

# loop over floors' ids
for i in [4, 5, 6]:
    # each floor has its own folder with train, validation, and test sub-folders
    floor_dir = os.path.join("dataset", str(i))
    train_paths = os.path.join(floor_dir, "train")
    val_paths = os.path.join(floor_dir, "val")
    test_paths = os.path.join(floor_dir, "test")

    # add paths to the csv files to the lists; each listing is sorted because
    # its order is otherwise whatever the filesystem returns, and file order
    # decides which file is train_csvs[0] and the row order of anything
    # built by walking these lists
    train_csvs += sorted(paths.list_files(train_paths, validExts="csv"))
    val_csvs += sorted(paths.list_files(val_paths, validExts="csv"))
    test_csvs += sorted(paths.list_files(test_paths, validExts="csv"))

print("# train : {}, val: {}, and test files: {}".format(len(train_csvs), len(val_csvs), len(test_csvs)))

# train : 61, val: 12, and test files: 18


In [3]:
# load the first csv file in the train set as an example and preview its
# first rows (the trailing expression is what the cell displays)
# NOTE(review): the preview lists an "Unnamed: 0" column, which usually means
# the files were saved together with their row index - confirm, since code
# that slices columns by position would see it as column 0
some_csv = pd.read_csv(train_csvs[0])
some_csv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,gz,ax,ay,az,mx,my,mz,x,y,z
0,-37.0,-36.0,-36.0,-32.0,-33.0,-61.0,-52.0,-52.0,-70.0,-77.0,...,0.04215,-0.287304,9.370906,-2.770091,-11.04,-56.399998,18.42,0.016694,7.741971,1.801771
1,-37.0,-36.0,-36.0,-32.0,-33.0,-61.0,-52.0,-52.0,-70.0,-77.0,...,0.012828,0.356736,9.595961,-2.616862,-11.52,-55.62,18.359999,-0.008196,7.74229,1.801771
2,-37.0,-36.0,-36.0,-32.0,-33.0,-61.0,-52.0,-52.0,-70.0,-77.0,...,-0.036652,-0.047884,9.344569,-3.471592,-11.28,-57.239998,19.56,-0.033086,7.742609,1.801771
3,-37.0,-36.0,-36.0,-32.0,-33.0,-61.0,-52.0,-52.0,-70.0,-77.0,...,-0.01405,-0.318429,9.284714,-3.045425,-11.759999,-55.14,18.66,-0.057977,7.742928,1.801771
4,-37.0,-36.0,-36.0,-32.0,-33.0,-61.0,-52.0,-52.0,-70.0,-77.0,...,0.010996,0.371101,9.466674,-3.627216,-11.82,-54.78,18.48,-0.082867,7.743247,1.801771


In [4]:
def euclidean_distance(targets, preds):
    """Return the mean Euclidean distance between paired positions.

    ``targets`` and ``preds`` are walked in lockstep; every pair contributes
    the norm of its difference, and the average of those norms is returned.
    """
    distances = [
        np.linalg.norm(estimate - truth)
        for estimate, truth in zip(preds, targets)
    ]
    return np.mean(distances)

In [5]:
def build_x_y(csv_paths, n_features=220, n_targets=3, missing_value=-100.0):
    """Build the feature matrix and target matrix from a list of CSV files.

    Every file is read with ``pd.read_csv`` and split by column position:
    the first ``n_features`` columns are the features (RSS signals) and the
    last ``n_targets`` columns are the targets (xyz positions). Feature
    readings equal to 0 are replaced by ``missing_value``. Rows are then
    downsampled: a row is kept only when its features differ from those of
    the most recently kept row, and that comparison carries on across file
    boundaries. The very first row encountered is always kept.

    Parameters
    ----------
    csv_paths : iterable of str
        Paths of the CSV files, processed in the given order.
    n_features : int, default 220
        Number of leading columns used as features.
    n_targets : int, default 3
        Number of trailing columns used as targets.
    missing_value : float, default -100.0
        Value substituted for feature readings equal to 0.

    Returns
    -------
    X : numpy.ndarray
        Kept feature rows, one row per kept sample.
    y : numpy.ndarray
        Target rows matching ``X`` row for row.

    Raises
    ------
    ValueError
        If ``csv_paths`` is empty or none of the files contains a row.
    """
    kept_features = []
    kept_targets = []
    last_kept = None  # features of the most recently kept row, across files

    for csv_path in csv_paths:
        csv_data = pd.read_csv(csv_path).to_numpy()

        # NOTE(review): the sample preview of these files shows a leading
        # "Unnamed: 0" column; if that is a saved row index it is column 0 of
        # this slice and pushes the last RSS column out of it - confirm the
        # file layout before relying on the feature columns.
        # np.where builds a new array rather than writing into csv_data,
        # which pandas may hand back as a read-only view.
        X_ind = csv_data[:, :n_features]
        X_ind = np.where(X_ind == 0, missing_value, X_ind)
        y_ind = csv_data[:, -n_targets:]

        # downsample: drop a row whose features repeat the last kept row.
        # Rows are collected in lists and stacked once at the end, so no
        # growing array is copied for every kept row.
        for features, target in zip(X_ind, y_ind):
            if last_kept is not None and not np.linalg.norm(last_kept - features) > 0:
                continue
            kept_features.append(features)
            kept_targets.append(target)
            last_kept = features

    if not kept_features:
        raise ValueError("build_x_y found no rows in the given csv files")

    return np.array(kept_features), np.array(kept_targets)

In [6]:
# build the training matrices from the train csv files and print
# their shapes as a quick sanity check
X_train, y_train = build_x_y(train_csvs)

print(X_train.shape, y_train.shape)

(6049, 220) (6049, 3)


In [7]:
# build the validation matrices from the val csv files and print
# their shapes as a quick sanity check
X_val, y_val = build_x_y(val_csvs)

print(X_val.shape, y_val.shape)

(1976, 220) (1976, 3)


In [8]:
# build the test matrices from the test csv files and print
# their shapes as a quick sanity check
X_test, y_test = build_x_y(test_csvs)

print(X_test.shape, y_test.shape)

(2601, 220) (2601, 3)


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# search grid: bootstrap disabled, 2 values of n_estimators x 2 values of
# max_features = 4 candidate settings
param_grid = [
    {'bootstrap': [False], 'n_estimators': [100, 200], 'max_features': [50, 100]},
  ]

# fixed random_state so the forest is built the same way on every run
forest_reg = RandomForestRegressor(n_jobs=10, random_state=42)

# 5-fold cross-validation over the grid; the score is negated MSE, so a
# higher (closer to zero) score means a better candidate
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=10,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=N

In [10]:
# parameter combination with the best mean cross-validated score
grid_search.best_params_

{'bootstrap': False, 'max_features': 50, 'n_estimators': 200}

In [11]:
# mean_test_score holds negated MSE values (the search was scored with
# 'neg_mean_squared_error'), so flip the sign and take the square root
# to print an RMSE next to each parameter combination
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

1.902076419165772 {'n_estimators': 100, 'bootstrap': False, 'max_features': 50}
1.8907510349370509 {'n_estimators': 200, 'bootstrap': False, 'max_features': 50}
1.9628386381094654 {'n_estimators': 100, 'bootstrap': False, 'max_features': 100}
1.9866590245213485 {'n_estimators': 200, 'bootstrap': False, 'max_features': 100}


In [12]:
from sklearn.metrics import mean_squared_error

# predict validation positions with the best model found by the grid search
forest_preds = grid_search.best_estimator_.predict(X_val)

# MSE and its square root (RMSE) from scikit-learn; ED is the mean
# euclidean distance between predicted and true positions
forest_mse = mean_squared_error(y_val, forest_preds)
forest_rmse = np.sqrt(forest_mse)
forest_euc = euclidean_distance(y_val, forest_preds)

print("Random Forest. RMSE: {:.3f}, MSE: {:.3f}, ED: {:.3f}".format(forest_rmse, forest_mse, forest_euc))

Random Forest. RMSE: 1.197, MSE: 1.433, ED: 1.649


In [13]:
# same evaluation on the test set; note that this overwrites forest_preds,
# forest_mse, forest_rmse and forest_euc computed for the validation set
forest_preds = grid_search.best_estimator_.predict(X_test)

forest_mse = mean_squared_error(y_test, forest_preds)
forest_rmse = np.sqrt(forest_mse)
forest_euc = euclidean_distance(y_test, forest_preds)

print("Random Forest. RMSE: {:.3f}, MSE: {:.3f}, ED: {:.3f}".format(forest_rmse, forest_mse, forest_euc))

Random Forest. RMSE: 1.156, MSE: 1.336, ED: 1.567
