## Setup

In [43]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import pandas as pd
import numpy as np
import os

# To make this notebook's output stable across runs
np.random.seed(42)

# To plot the figures
import matplotlib as mlp 
import matplotlib.pyplot as plt
mlp.rc('axes', labelsize=14)
mlp.rc('xtick', labelsize=12)
mlp.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

# Function to save the figures
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print('Saving figure', fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## Get the  data

In [44]:
DATASETS_PATH = os.path.join(PROJECT_ROOT_DIR, 'datasets')
os.makedirs(DATASETS_PATH, exist_ok=True)

DATASET_NAME = 'dados 2-4.txt'

# columns names for the dataset
columns=['Distance','RSSI']

def load_data(data_path = DATASETS_PATH, dataset_name= DATASET_NAME, columns_names=columns):
    txt_path = os.path.join(data_path, dataset_name)
    return pd.read_table(txt_path, names=columns)

# read the dataset
data_2_4_Ghz = load_data(data_path=DATASETS_PATH, dataset_name=DATASET_NAME, columns_names=columns)
data_2_4_Ghz.head()


Unnamed: 0,Distance,RSSI
0,1,-21.0
1,2,-30.0
2,3,-29.0
3,4,-33.0
4,5,-32.0


## Exploratory Data Analisys

In [45]:
import sweetviz as sv

# Analyzing the data
report = sv.analyze(data_2_4_Ghz)

# Generating the report
report.show_html('EDA/data_2_4_Ghz.html')

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:02 -> (00:00 left)


Report EDA/data_2_4_Ghz.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Prepare the data for Machine Learning algorithms

In [46]:

from sklearn.model_selection import train_test_split

# Train test split
train_set, test_set = train_test_split(data_2_4_Ghz, test_size=0.2, random_state=42)

features = train_set.drop('RSSI', axis=1)
labels = train_set['RSSI'].copy()

### Tranformation Pipelines

In [47]:
# Create a pipeline to preprocess the data
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

numerical_pipeline = Pipeline([
        ("standardize", StandardScaler()),
])

In [48]:
from sklearn import set_config

# Show the pipeline in diagram
set_config(display='diagram')

numerical_pipeline

In [49]:
from sklearn.compose import ColumnTransformer

# Aplication of the pipeline
preprocessing = ColumnTransformer([
    ("num", numerical_pipeline,features.columns)
])

features_prepared = preprocessing.fit_transform(features)

# Create a dataframe to visualize the data after the preprocessing (optional)
df_features_prepared = pd.DataFrame(features_prepared, columns= preprocessing.get_feature_names_out(), index=features.index)
df_features_prepared.head(2)

Unnamed: 0,num__Distance
28,1.559572
24,1.11398


## Select and Train a Model

In [64]:
from sklearn.linear_model import LinearRegression

# Logistic Regression model
lin_reg = Pipeline([
    ("preprocessing", preprocessing), 
    ("linear_regression", LinearRegression())
])
lin_reg.fit(features, labels)

In [65]:
from sklearn.tree import DecisionTreeRegressor

# Decision Tree model
tree_reg = Pipeline([
    ("preprocessing", preprocessing), 
    ("tree_regression", DecisionTreeRegressor(random_state=42))
])
tree_reg.fit(features, labels)

In [66]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = Pipeline([
    ("preprocessing", preprocessing), 
    ("random_forest", RandomForestRegressor(random_state=42))
])
forest_reg.fit(features, labels)

### Evaluation using Cross-Validation

In [67]:
from sklearn. model_selection import cross_val_score

# Cross validation of Tree model
tree_rmses = -cross_val_score(tree_reg, features, labels, scoring="neg_root_mean_squared_error", cv =10)
pd.Series(tree_rmses).describe()

count    10.000000
mean      3.837828
std       1.600166
min       1.665893
25%       2.946939
50%       3.621595
75%       4.983466
max       6.158734
dtype: float64

In [68]:
# Cross validation of the linear model
lin_rmses = -cross_val_score(lin_reg, features, labels, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(lin_rmses).describe()

count    10.000000
mean      4.592811
std       1.882128
min       2.284751
25%       3.077085
50%       4.454559
75%       5.689237
max       8.405435
dtype: float64

In [69]:
# Cross validation of the forest model
forest_rmses = -cross_val_score(forest_reg, features, labels, scoring="neg_mean_squared_error", cv=10)
pd.Series(forest_rmses).describe()

count    10.000000
mean     11.123991
std       9.465100
min       1.823693
25%       4.706273
50%       7.965984
75%      14.111014
max      31.314950
dtype: float64

## Fine-Tune the Model

### Randomized Search

In [71]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {'tree_regression__max_features': randint(low=2, high=30), 
                  'tree_regression__min_samples_split': randint(low=2, high=10), 
                  'tree_regression__min_samples_leaf': randint(low=1, high=10),
                  'tree_regression__max_leaf_nodes': randint(low=10, high=100)}

rnd_search = RandomizedSearchCV(tree_reg, param_distributions=param_distribs, n_iter=100, cv=10, scoring='neg_root_mean_squared_error', random_state=42)

rnd_search.fit(features, labels)

### Analyze the Best Models

In [72]:
# Get the best model according to the random search
tree_reg_best = rnd_search.best_estimator_
tree_reg_best["tree_regression"]

## Evaluate the Model on the Test Set

In [75]:
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
# The test set
X_test = test_set.drop('RSSI', axis=1)
y_test = test_set['RSSI'].copy()

tree_reg_prediction = tree_reg_best.predict(X_test)

tree_reg_rmse = root_mean_squared_error(y_test, tree_reg_prediction)
tree_reg_mae = mean_absolute_error(y_test, tree_reg_prediction)

print(tree_reg_rmse)
print(tree_reg_mae)   

1.4393455045545807
1.3118888888888887


In [76]:
from scipy import stats

# Confidence interval (95%)

confidence = 0.95
squared_errors = (tree_reg_prediction - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))

array([0.54704307, 1.96065674])