In [None]:
# Python >= 3.5 is required
import re
import sys
assert sys.version_info >= (3, 5), "Python >= 3.5 is required"

# Scikit-Learn >= 0.20 is required
import sklearn
# Compare the leading (major, minor) numbers numerically. A plain string
# comparison is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
assert tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2]) >= (0, 20), \
    "Scikit-Learn >= 0.20 is required"

# common imports
import pandas as pd
import numpy as np
import os

# project root and the directory that holds the train / val / test splits
PROJECT_ROOT_DIR = "."
DATASET_PATH = os.path.join(PROJECT_ROOT_DIR, "wifi_dataset")

## Part 1. <font color=green>Data Loading</font>

### The dataset is split into 3 parts (train, validation and test) and stored in a directory located at DATASET_PATH
The data is stored in csv format. Each split contains a number of csv files. Each csv file in the TRAIN and VALIDATION splits contains 223 columns: the first 220 hold the signal values received from the 220 access points; the last 3 hold the ground-truth x, y, z coordinates of the location. Note that NONE of the csv files has a header row with column names.

In [None]:
# one directory of csv files per split, all located under DATASET_PATH
path_to_train_csvs, path_to_val_csvs, path_to_test_csvs = (
    os.path.join(DATASET_PATH, split_dir) for split_dir in ("train", "val", "test")
)

In [None]:
# Load a single training file. The csv files carry no header row, so the column
# names are passed explicitly: 0..219 for the access points, then x, y, z.
some_csv = pd.read_csv(
    os.path.join(path_to_train_csvs, "1.csv"),
    names=[*range(220), "x", "y", "z"],
)
some_csv.head()

Each csv file in the TEST split contains only the 220 columns with the signal values received from the access points; there are NO columns with the ground-truth x, y, z coordinates.

In [None]:
# read one of the test csv files, note it does not contain a header with column names;
# header=None makes pandas treat the first row as data and number the columns itself
# NOTE: this rebinds some_csv, so the training sample loaded earlier is no longer reachable
some_csv = pd.read_csv(os.path.join(path_to_test_csvs, "1.csv"), header=None)
some_csv.head()

### Task 1. Write a function called <font color=blue>build_feats</font>, that constructs only a feature matrix.
The function takes only one parameter, a string containing the path to the data (csv files)
and returns a feature matrix of type np.ndarray

The function will be used to build test data.

In [None]:
def build_feats(path_to_csvs):
    """Build a feature matrix from all csv files stored in a directory.

    Every file ending in ".csv" inside ``path_to_csvs`` is read as a
    header-less table and the tables are stacked row-wise. All columns of the
    files are kept, so the function is meant for files that contain features
    only (such as the test split).

    Files are processed in a deterministic order: purely numeric file names
    ("1.csv", "2.csv", "10.csv") in numeric order first, then any other names
    alphabetically. NOTE(review): the row order of the result follows this
    file order - confirm it matches the order expected for the submission.

    Parameters
    ----------
    path_to_csvs : str
        Path to the directory that contains the csv files.

    Returns
    -------
    np.ndarray
        Array of shape (total number of rows, number of columns per file).

    Raises
    ------
    FileNotFoundError
        If the directory contains no csv files.
    ValueError
        If the files do not all have the same number of columns
        (raised by ``np.concatenate``).
    """
    def numeric_first(file_name):
        # sort key: numeric stems by value, everything else afterwards by name
        stem = os.path.splitext(file_name)[0]
        return (0, int(stem), file_name) if stem.isdecimal() else (1, 0, file_name)

    csv_names = sorted(
        (entry for entry in os.listdir(path_to_csvs) if entry.lower().endswith(".csv")),
        key=numeric_first,
    )
    if not csv_names:
        raise FileNotFoundError("no csv files found in {!r}".format(path_to_csvs))

    # concatenate the raw arrays (not DataFrames) so that files with a
    # different number of columns raise instead of being silently NaN-padded
    feats = np.concatenate(
        [pd.read_csv(os.path.join(path_to_csvs, csv_name), header=None).to_numpy()
         for csv_name in csv_names],
        axis=0,
    )
    return feats

In [None]:
feats_test = build_feats(path_to_test_csvs)

# the result has to be a numpy array ...
assert isinstance(feats_test, np.ndarray)

# ... with 2601 rows (test samples) and 220 columns (access points)
assert feats_test.shape == (2601, 220)


### Task 2. Write a function called <font color=blue>build_feats_targets</font>, that constructs a feature matrix and the corresponding target vector. 
The function takes only one parameter, a string containing the path to the data (csv files),
and returns a tuple containing the feature matrix and the target vector, both of type numpy.ndarray. Remember, the csv files contain both features and target values. The function should read the data from each csv file and concatenate the corresponding portions.

The function will be used to build train and validation data.


In [None]:
def build_feats_targets(path_to_csvs):
    """Build a feature matrix and the matching target matrix from a directory of csv files.

    Every file ending in ".csv" inside ``path_to_csvs`` is read as a
    header-less table and the tables are stacked row-wise. The last 3 columns
    of the stacked table (the x, y, z ground truth) become the targets; all
    preceding columns become the features.

    Files are processed in a deterministic order: purely numeric file names
    ("1.csv", "2.csv", "10.csv") in numeric order first, then any other names
    alphabetically.

    Parameters
    ----------
    path_to_csvs : str
        Path to the directory that contains the csv files.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``feats`` of shape (n_rows, n_columns - 3) and ``targets`` of shape
        (n_rows, 3); row i of both arrays comes from the same csv row.

    Raises
    ------
    FileNotFoundError
        If the directory contains no csv files.
    ValueError
        If the files do not all have the same number of columns
        (raised by ``np.concatenate``) or have no feature columns at all.
    """
    n_targets = 3  # x, y, z are stored in the last three columns

    def numeric_first(file_name):
        # sort key: numeric stems by value, everything else afterwards by name
        stem = os.path.splitext(file_name)[0]
        return (0, int(stem), file_name) if stem.isdecimal() else (1, 0, file_name)

    csv_names = sorted(
        (entry for entry in os.listdir(path_to_csvs) if entry.lower().endswith(".csv")),
        key=numeric_first,
    )
    if not csv_names:
        raise FileNotFoundError("no csv files found in {!r}".format(path_to_csvs))

    # concatenate the raw arrays (not DataFrames) so that files with a
    # different number of columns raise instead of being silently NaN-padded
    data = np.concatenate(
        [pd.read_csv(os.path.join(path_to_csvs, csv_name), header=None).to_numpy()
         for csv_name in csv_names],
        axis=0,
    )
    if data.shape[1] <= n_targets:
        raise ValueError(
            "expected more than {} columns per csv file, got {}".format(n_targets, data.shape[1])
        )

    feats = data[:, :-n_targets]
    targets = data[:, -n_targets:]
    return feats, targets

In [None]:
train_data = build_feats_targets(path_to_train_csvs)

# the function must hand back a (features, targets) tuple
assert isinstance(train_data, tuple)

feats_train, targets_train = train_data

# both members of the tuple must be numpy arrays
assert isinstance(feats_train, np.ndarray)
assert isinstance(targets_train, np.ndarray)

# 6049 training samples: 220 features and 3 target coordinates each
assert feats_train.shape == (6049, 220)
assert targets_train.shape == (6049, 3)


In [None]:
val_data = build_feats_targets(path_to_val_csvs)

# the function must hand back a (features, targets) tuple
assert isinstance(val_data, tuple)

feats_val, targets_val = val_data

# both members of the tuple must be numpy arrays
assert isinstance(feats_val, np.ndarray)
assert isinstance(targets_val, np.ndarray)

# 1976 validation samples: 220 features and 3 target coordinates each
assert feats_val.shape == (1976, 220)
assert targets_val.shape == (1976, 3)

## Part 2. <font color=green>Predicting a user's coordinates using random forest regression</font>


Random forest is a supervised learning algorithm which can be utilized for both classification and regression problems. It combines multiple decision trees that train on various subsets of given data. The trees are run independently from each other, and in parallel. The output of the random forest is the mode of the class predictions or the mean of the value predictions, for classification and regression tasks respectively.

Today we will use a random forest regressor for indoor localization by predicting the x,y,z coordinate values of a user based on the 220 signal values available at that point.


See the documentation on the list of parameters to tune https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# hyper-parameter combinations to evaluate; add more parameters or values to widen the search
param_grid = [
    dict(bootstrap=[False], n_estimators=[10, 20], max_features=[5, 10]),
]

# keep random_state fixed at 42 so that the results are reproducible
forest_reg = RandomForestRegressor(random_state=42, n_jobs=10)

# exhaustive 5-fold cross-validated search over param_grid, scored by the negated MSE
# (the exhaustive search is the recommended approach, but other strategies are welcome)
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
)

grid_search.fit(feats_train, targets_train)

In [None]:
# you can check out the result details in a dataframe format
cvres = grid_search.cv_results_
# widen the columns so that the "params" dictionaries are not truncated
pd.set_option("display.max_colwidth", 80)
df_cvres = pd.DataFrame(cvres)
# show each parameter combination with the mean and std of its cross-validated score
df_cvres[["params", "mean_test_score", "std_test_score"]]


### <font color=red>NOTE !</font> 
Scikit-Learn’s cross-validation features expect a utility function (greater is better) rather than a cost function (lower is better), so the scoring function is actually the opposite of the MSE (i.e., a negative value), which is why the code below negates the scores before calculating the square root.

    
Adapted from Géron, A. (2019). Hands-on machine learning with Scikit-Learn, Keras and TensorFlow: concepts, tools, and techniques to build intelligent systems (2nd ed.). O’Reilly.

In [None]:
# mean_test_score holds the negated MSE, so negate it back and take the
# square root: the printed value is the RMSE of each parameter combination
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("RMSE={} for model parameters {}".format(np.sqrt(-mean_score), params))

In [None]:
# or simply retrieve the parameters that gave the best score,
# i.e. the combination from param_grid that the fitted grid search ranked first
grid_search.best_params_

### Task 3. Write a function called <font color=blue>euclidean_distance</font>
The function takes two parameters, a vector of targets (or ground truth values) and a vector of predictions,
and returns the Euclidean distance between the two vectors.

In [None]:
def euclidean_distance(targets, preds):
    """Return the Euclidean distance between targets and predictions.

    For two 1-D vectors this is ``sqrt(sum((targets - preds) ** 2))``.
    For 2-D inputs of shape (n_samples, n_coordinates) the distance is computed
    per row and the mean over all rows is returned, so the result is always a
    single number. NOTE(review): the task statement only defines the
    single-vector case; averaging over samples is a choice made here - confirm
    it matches the metric used to rank the submissions.

    Parameters
    ----------
    targets : array-like
        Ground-truth values.
    preds : array-like
        Predicted values, same shape as ``targets``.

    Returns
    -------
    float
        The (mean) Euclidean distance.

    Raises
    ------
    ValueError
        If ``targets`` and ``preds`` have different shapes; raising avoids a
        silently broadcast, meaningless result.
    """
    targets = np.atleast_1d(np.asarray(targets, dtype=float))
    preds = np.atleast_1d(np.asarray(preds, dtype=float))
    if targets.shape != preds.shape:
        raise ValueError(
            "targets and preds must have the same shape, got {} and {}".format(
                targets.shape, preds.shape
            )
        )

    # the norm along the last axis yields one distance per sample (or a scalar for 1-D input)
    per_sample = np.linalg.norm(targets - preds, axis=-1)
    dist = float(np.mean(per_sample))
    return dist

### Task 4. Choose your best estimator and email it to us to win the competition.

The target values for the test set are withheld for the competition purposes. Instead, you are given the validation set to verify your best estimator.

In [None]:
from sklearn.metrics import mean_squared_error

# predict the validation targets with the best estimator found by the grid search
forest_preds = grid_search.best_estimator_.predict(feats_val)

# validation error as MSE, its square root (RMSE) and the euclidean_distance metric
forest_mse = mean_squared_error(targets_val, forest_preds)
forest_rmse = np.sqrt(forest_mse)
forest_euc = euclidean_distance(targets_val, forest_preds)

print("Random Forest. RMSE: {:.3f}, MSE: {:.3f}, ED: {:.3f}".format(forest_rmse, forest_mse, forest_euc))

Email the predictions of your best estimator on the test features by <font color=red>11:30 AM UTC+6 June 25, 2021</font> to <font color=blue>issai@nu.edu.kz</font>

In [None]:
name = "John" # change to your first name
surname = "Snow" # change to your lastname
# predictions of the best estimator on the test features
# NOTE: this rebinds forest_preds, which previously held the validation predictions
forest_preds = grid_search.best_estimator_.predict(feats_test)

# write the predictions as <name>_<surname>.csv without a header row or an index column
# (header/index are boolean options, so pass False explicitly rather than None)
# email your csv file to issai@nu.edu.kz
pd.DataFrame(forest_preds).to_csv("{}_{}.csv".format(name, surname), header=False, index=False)