In [1]:
# Python >= 3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn >= 0.20 is required
import re
import sklearn
# Compare the version numerically: as plain strings, "0.9" >= "0.20" is True
# (lexicographic order), which would let a too-old release slip through.
_sklearn_major_minor = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_major_minor >= (0, 20)

# common imports
import pandas as pd
import numpy as np 
import os 

# All paths are resolved relative to this root directory.
PROJECT_ROOT_DIR = "."

# Folder that holds the dataset; the per-split folders are joined onto it.
DATASET_PATH = os.path.join(
    PROJECT_ROOT_DIR,
    "wifi_dataset",
)

In [2]:
# folders with the csv files of each split, all located under DATASET_PATH
train_csv_path, val_csv_path, test_csv_path = (
    os.path.join(DATASET_PATH, split_name)
    for split_name in ("train", "val", "test")
)

In [4]:
# function to construct a feature matrix and
# the target vector for training models
def build_x_y(csv_paths, n_features=220, n_targets=3):
    """Load the numbered CSV files of a folder and split them into X and y.

    The folder is expected to contain header-less files named ``1.csv``,
    ``2.csv``, ..., ``N.csv``. Entries that are not ``.csv`` files are
    ignored when determining ``N``. Files are read in numeric order and
    their rows are stacked vertically.

    Parameters
    ----------
    csv_paths : str
        Path to the folder containing the CSV files.
    n_features : int, optional
        Number of leading columns taken as features (default 220).
    n_targets : int, optional
        Number of trailing columns taken as targets (default 3). Must be
        at least 1.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix, the first ``n_features`` columns of every row.
    y : numpy.ndarray
        Target matrix, the last ``n_targets`` columns of every row.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` files.
    """
    # count only csv files so that stray entries in the folder
    # (notes, hidden files, ...) do not make us look for a missing N.csv
    num_csvs = sum(1 for entry in os.listdir(csv_paths) if entry.endswith(".csv"))
    if num_csvs == 0:
        raise ValueError("no csv files found in {!r}".format(csv_paths))

    feature_chunks = []
    target_chunks = []

    # loop over the csv files (they are numbered starting from 1)
    for file_number in range(1, num_csvs + 1):
        # read the csv file
        file_path = os.path.join(csv_paths, "{}.csv".format(file_number))
        csv_data = pd.read_csv(file_path, header=None).to_numpy()

        # features and targets
        feature_chunks.append(csv_data[:, :n_features])  # RSS signals
        target_chunks.append(csv_data[:, -n_targets:])   # xyz positions

    # concatenate once at the end instead of re-allocating the
    # growing arrays on every iteration
    X = np.concatenate(feature_chunks, axis=0)
    y = np.concatenate(target_chunks, axis=0)

    return X, y

In [6]:
X_train, y_train = build_x_y(train_csv_path)

print(X_train.shape, y_train.shape)

(6049, 220) (6049, 3)


In [7]:
X_val, y_val = build_x_y(val_csv_path)

print(X_val.shape, y_val.shape)

(1976, 220) (1976, 3)


In [5]:
# function to construct a feature matrix
def build_x(csv_paths):
    """Load the numbered CSV files of a folder into one feature matrix.

    The folder is expected to contain header-less files named ``1.csv``,
    ``2.csv``, ..., ``N.csv``. Entries that are not ``.csv`` files are
    ignored when determining ``N``. Files are read in numeric order and
    their rows are stacked vertically; all columns are kept.

    Parameters
    ----------
    csv_paths : str
        Path to the folder containing the CSV files.

    Returns
    -------
    X : numpy.ndarray
        All rows of all files, stacked along axis 0.

    Raises
    ------
    ValueError
        If the folder contains no ``.csv`` files.
    """
    # count only csv files so that stray entries in the folder
    # (notes, hidden files, ...) do not make us look for a missing N.csv
    num_csvs = sum(1 for entry in os.listdir(csv_paths) if entry.endswith(".csv"))
    if num_csvs == 0:
        raise ValueError("no csv files found in {!r}".format(csv_paths))

    # read the csv files in numeric order (they are numbered starting from 1)
    chunks = [
        pd.read_csv(
            os.path.join(csv_paths, "{}.csv".format(file_number)), header=None
        ).to_numpy()
        for file_number in range(1, num_csvs + 1)
    ]

    # concatenate once at the end instead of re-allocating the
    # growing array on every iteration
    return np.concatenate(chunks, axis=0)

In [8]:
X_test = build_x(test_csv_path)

print(X_test.shape)

(2601, 220)


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# hyper-parameter candidates: one grid with bootstrap resampling disabled
param_grid = [
    {
        'bootstrap': [False],
        'n_estimators': [10, 20],
        'max_features': [5, 10],
    },
]

# base model; the fixed random_state keeps runs repeatable
forest_reg = RandomForestRegressor(random_state=42, n_jobs=10)

# 5-fold cross-validated search, scored with the negated MSE
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=10,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs=N

In [10]:
# display the parameter combination selected by the grid search
grid_search.best_params_

{'bootstrap': False, 'max_features': 10, 'n_estimators': 20}

In [3]:
# function to calculate euclidean distance
# between predicted and target positions
def euclidean_distance(targets, preds):
    """Return the mean Euclidean distance between targets and predictions.

    Each entry along the first axis is one sample; the distance of a sample
    is the 2-norm of ``pred - target`` taken over all of its remaining axes.

    Parameters
    ----------
    targets : array-like
        Target positions, one sample per row.
    preds : array-like
        Predicted positions, same shape as ``targets``.

    Returns
    -------
    float
        Mean of the per-sample distances, or ``nan`` when there are no
        samples.

    Raises
    ------
    ValueError
        If ``targets`` and ``preds`` do not have the same shape.
    """
    # accept plain lists as well as arrays; working in float also avoids
    # wrap-around when subtracting unsigned integer inputs
    targets = np.asarray(targets, dtype=float)
    preds = np.asarray(preds, dtype=float)

    # a length mismatch used to be silently truncated by zip()
    if targets.shape != preds.shape:
        raise ValueError(
            "targets and preds must have the same shape, got {} and {}".format(
                targets.shape, preds.shape
            )
        )

    # no samples: the mean error is undefined
    if len(preds) == 0:
        return np.nan

    # one row per sample, then a single vectorized norm over each row
    diffs = (preds - targets).reshape(len(preds), -1)
    errors_all = np.linalg.norm(diffs, axis=1)

    # return the mean error
    return np.mean(errors_all)

In [11]:
cvres = grid_search.cv_results_

# scores are negated MSE values: flip the sign and take the root to get RMSE
candidate_rmse = np.sqrt(-cvres["mean_test_score"])

# one line per candidate: its cross-validated RMSE and its parameters
for rmse, params in zip(candidate_rmse, cvres["params"]):
    print(rmse, params)

2.312796809709961 {'max_features': 5, 'bootstrap': False, 'n_estimators': 10}
2.1364291274500435 {'max_features': 5, 'bootstrap': False, 'n_estimators': 20}
2.2085877087705876 {'max_features': 10, 'bootstrap': False, 'n_estimators': 10}
2.037025936527828 {'max_features': 10, 'bootstrap': False, 'n_estimators': 20}


In [12]:
from sklearn.metrics import mean_squared_error

# predict the validation set with the best model found by the grid search
forest_preds = grid_search.best_estimator_.predict(X_val)

# error metrics on the validation set
forest_mse = mean_squared_error(y_val, forest_preds)
forest_rmse, forest_euc = np.sqrt(forest_mse), euclidean_distance(y_val, forest_preds)

report_template = "Random Forest. RMSE: {:.3f}, MSE: {:.3f}, ED: {:.3f}"
print(report_template.format(forest_rmse, forest_mse, forest_euc))

Random Forest. RMSE: 1.452, MSE: 2.109, ED: 2.000


In [13]:
# the output file is named "<name>_<surname>.csv"
name = "John"
surname = "Snow"

# predict the test set with the best model found by the grid search
forest_preds = grid_search.best_estimator_.predict(X_test)

# write the bare predictions: no header row and no index column.
# to_csv documents booleans for these flags; None only worked by being falsy.
pd.DataFrame(forest_preds).to_csv(
    "{}_{}.csv".format(name, surname), header=False, index=False
)