**Building a new dataframe for tests**
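- It takes the `filtered_data.csv` file and transforms it into a reduced dataset for tests.
- The data is grouped by LABEL and DEVICE simultaneously: each WAP column becomes the group mean of its readings, ignoring readings equal to 100 (rounded to 2 decimals; 100 is kept only when no other reading exists in the group), while X and Y become the plain group mean.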

In [None]:
import os
import pandas as pd
from pathlib import Path

# The project root is the parent of the current working directory
# (per the original note: one level up from the notebooks directory).
project_root = Path(os.path.abspath('')).parent

# Location of the filtered dataset inside the project's data directory
data_tests_path = project_root.joinpath('data', 'filtered_data.csv')

# Load the filtered dataset into a DataFrame
df_filtered = pd.read_csv(data_tests_path)

# Collect the names of every column whose name starts with 'WAP'.
# These columns are aggregated further down, per (LABEL, DEVICE) group,
# with a mean that ignores values equal to 100.
wap_columns = [name for name in df_filtered.columns if name.startswith('WAP')]

# Define a custom aggregation function
def custom_mean(series):
    """Return the mean of *series* while ignoring entries equal to 100.

    Args:
        series: A pandas Series of values for one column of one group.

    Returns:
        100 when no entry differs from 100 (including an empty series);
        otherwise the mean of the remaining entries rounded to 2 decimals.
    """
    # Keep only the entries that are not equal to 100
    kept = series[series != 100]

    # Nothing left to average: fall back to 100
    if kept.empty:
        return 100

    return round(kept.mean(), 2)

# Aggregation plan: every WAP column goes through custom_mean,
# and the X / Y coordinates use the plain mean.
agg_dict = {col: custom_mean for col in wap_columns}
agg_dict.update({'X': 'mean', 'Y': 'mean'})

# One output row per (LABEL, DEVICE) pair; reset_index turns the group keys
# back into regular columns.
df_tests = (
    df_filtered
    .groupby(['LABEL', 'DEVICE'])
    .agg(agg_dict)
    .reset_index()
)

# Quick summary of the resulting DataFrame
print(f"Test set shape: {df_tests.shape}")
print(f"Number of unique locations: {df_tests['LABEL'].nunique()}")
print(f"Number of unique devices: {df_tests['DEVICE'].nunique()}")

In [2]:
# Write the reduced test set to data/test_data.csv under the project root.
# index=False leaves the DataFrame index out of the file.
test_set_path = project_root.joinpath('data', 'test_data.csv')
df_tests.to_csv(path_or_buf=test_set_path, index=False)