In [1]:
import os
import sys
# Put the parent of the current working directory on the import path (once) and
# make it the working directory, so project packages and relative paths resolve.
# NOTE(review): both steps are relative to the *current* directory, so running
# this cell a second time climbs one more level up.
root_path = os.path.abspath(os.pardir)
if root_path not in sys.path:
    sys.path.insert(0, root_path)
os.chdir(os.pardir)


In [2]:
from data.loader import load_complete_dataset
from data.preprocessing import add_module_columns

# Load the full dataset through the project loader.
dataset = load_complete_dataset()
# The value of this last expression is what the cell displays (the frame with the
# *_module columns shown in the output below).
# NOTE(review): the result is not assigned; whether `dataset` itself gains the
# module columns depends on add_module_columns mutating in place — confirm.
add_module_columns(dataset)

Unnamed: 0,device_id,accelerometer_x,accelerometer_y,accelerometer_z,gyroscope_x,gyroscope_y,gyroscope_z,magnetometer_x,magnetometer_y,magnetometer_z,timestamp,actitivy_label,participant_id,magnetometer_module,gyroscope_module,accelerometer_module
0,1.0,-2.82430,9.1773,2.2388,-1.43340,-0.003309,-0.41384,-0.85484,0.50402,0.674830,1735.1,1.0,14,1.200076,1.491948,9.859601
1,1.0,-2.80020,9.1655,2.2507,-1.04340,0.230960,-0.12186,-0.85887,0.52610,0.674830,1754.6,1.0,14,1.212367,1.075582,9.844448
2,1.0,-2.78810,9.1296,2.2634,-1.12080,0.215210,-0.10277,-0.85081,0.52811,0.683740,1774.2,1.0,14,1.212550,1.145893,9.810509
3,1.0,-2.82400,9.1173,2.2522,-1.15170,0.143390,0.32778,-0.84879,0.50402,0.688200,1793.7,1.0,14,1.203370,1.205991,9.806760
4,1.0,-2.82400,9.1414,2.2517,-1.36180,-0.113230,0.65311,-0.86290,0.51807,0.683740,1813.2,1.0,14,1.216755,1.514554,9.829055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53114,5.0,-0.48220,9.4431,-2.1914,0.70696,-0.734460,1.05760,-0.13535,0.71257,0.069565,1038000.0,1.0,0,0.728639,1.468925,9.706023
53115,5.0,-0.47013,9.4553,-2.1919,0.23155,-0.340820,0.98911,-0.12525,0.67265,0.071739,1038000.0,1.0,0,0.687962,1.071500,9.717415
53116,5.0,-0.49463,9.4428,-2.2155,0.53524,-0.291600,1.00350,-0.11919,0.69461,0.071739,1038000.0,1.0,0,0.708404,1.174106,9.711826
53117,5.0,-0.47013,9.4553,-2.1919,0.23140,-0.326390,0.89815,-0.11313,0.72056,0.084783,1038100.0,1.0,0,0.734298,0.983234,9.717415


In [5]:
!pip install tsfresh




[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import pandas as pd
import numpy as np

def detect_gaps(timestamps, gap_threshold):
    """
    Find every pair of consecutive samples that are further apart than allowed.

    timestamps: sorted numpy array of floats/ints
    gap_threshold: largest spacing between consecutive samples that is still
                   accepted (a gap is a spacing strictly greater than this)
    Returns: list of (gap_start, gap_end) tuples, i.e. the timestamps of the
             sample before and the sample after each gap
    """
    spacing = np.diff(timestamps)

    # Position i in `spacing` describes the pair (timestamps[i], timestamps[i + 1]).
    too_wide = np.nonzero(spacing > gap_threshold)[0]

    return [(timestamps[i], timestamps[i + 1]) for i in too_wide]

def intersects_gap(win_start, win_end, gap_list):
    """
    Tell whether the window [win_start, win_end) overlaps at least one gap.

    gap_list: iterable of (gap_start, gap_end) pairs
    Returns: True as soon as one gap starts before the window ends and ends
             after the window starts; False otherwise (including an empty list)
    """
    return any(
        gap_start < win_end and gap_end > win_start
        for gap_start, gap_end in gap_list
    )

def create_sliding_windows(df, window_ms, step_ms, gap_threshold_ms=100):
    """
    Cut each participant's recording into fixed-length time windows that are
    valid on all of that participant's devices at the same time.

    A window [win_start, win_start + window_ms) is kept only when, for every
    device of the participant, it
      * does not overlap a sampling gap (see detect_gaps / intersects_gap),
      * contains at least one sample, and
      * contains a single activity label.
    Windows are only generated inside the time span covered by all devices
    (latest first sample to earliest last sample).

    df: DataFrame with 'participant_id', 'device_id', 'timestamp' and a label
        column ('activity_label', 'actitivy_label' or 'activity')
    window_ms: window length, in the unit of 'timestamp'
               (presumably milliseconds, going by the parameter name)
    step_ms: distance between the starts of consecutive windows; must be > 0
    gap_threshold_ms: largest spacing between consecutive samples of one device
                      that is not treated as a gap
    Returns: DataFrame stacking the rows of every accepted window (all devices),
             with a 'window_id' column (one id per window, counted across
             participants) and, when the source columns exist, the list-valued
             columns 'acc_array', 'gyro_array', 'mag_array', 'acc_gyro_array'.
             An empty DataFrame when no window is accepted.
    Raises: ValueError if step_ms is not a positive number.
    """
    # A non-positive step never advances the window cursor, so the generation
    # loop below would never terminate; reject it before doing any work.
    if not step_ms > 0:
        raise ValueError(f"step_ms must be a positive number, got {step_ms!r}")

    print("A preparar dados e detetar gaps...")

    # 1. Basic preparation
    # Sorting makes each (participant, device) slice ascending in time, which
    # np.diff (gap detection) and np.searchsorted (window lookup) rely on.
    df = df.sort_values(by=["participant_id", "device_id", "timestamp"]).reset_index(drop=True)

    # Pick the label column among the spellings seen in this project.
    if 'activity_label' in df.columns:
        lbl_col = 'activity_label'
    elif 'actitivy_label' in df.columns:
        lbl_col = 'actitivy_label'
    else:
        lbl_col = 'activity'

    windows_data = []   # one DataFrame per accepted window
    w_id_global = 0     # window counter shared by all participants

    # 2. Each participant is processed independently
    for _, df_p in df.groupby("participant_id"):

        # Per-device lookups for this participant
        devices_data = {}    # device_id -> {'ts', 'lbl', 'df_slice'}
        device_gaps = {}     # device_id -> list of (gap_start, gap_end)
        device_bounds = []   # (first timestamp, last timestamp) per device

        unique_devices = df_p['device_id'].unique()

        # --- Per-participant pre-processing ---
        valid_participant = True

        for d_id in unique_devices:
            df_d = df_p[df_p['device_id'] == d_id]
            ts = df_d['timestamp'].values

            # A device without samples makes the whole participant unusable.
            if len(ts) == 0:
                valid_participant = False
                break

            device_gaps[d_id] = detect_gaps(ts, gap_threshold_ms)

            # Plain numpy arrays are used for the checks inside the window loop;
            # the pandas slice is kept to extract the rows of accepted windows.
            devices_data[d_id] = {
                'ts': ts,
                'lbl': df_d[lbl_col].values,
                'df_slice': df_d
            }

            device_bounds.append((ts[0], ts[-1]))

        if not valid_participant or not device_bounds:
            continue

        # 3. Common operating span: every device must already have started
        # (largest start) and none may have stopped yet (smallest end).
        start_global = max(b[0] for b in device_bounds)
        end_global = min(b[1] for b in device_bounds)

        # Not even one window fits in the common span: skip the participant.
        if (end_global - start_global) < window_ms:
            continue

        # 4. Window generation loop
        curr_time = start_global

        while curr_time + window_ms <= end_global:
            win_start = curr_time
            win_end = curr_time + window_ms

            window_is_valid = True
            temp_window_chunks = []  # rows of this window, one chunk per device

            # The window must pass every check on every device.
            for d_id in unique_devices:
                # A. No sampling gap inside the window
                if intersects_gap(win_start, win_end, device_gaps[d_id]):
                    window_is_valid = False
                    break

                # B. Positions of the samples with win_start <= ts < win_end
                ts_arr = devices_data[d_id]['ts']
                idx_start = np.searchsorted(ts_arr, win_start, side='left')
                idx_end = np.searchsorted(ts_arr, win_end, side='left')

                # C. Equal positions mean the device has no sample in the window
                if idx_start == idx_end:
                    window_is_valid = False
                    break

                # D. Label purity: more than one distinct label rejects the window
                lbl_arr = devices_data[d_id]['lbl']
                labels_in_window = lbl_arr[idx_start:idx_end]
                if len(np.unique(labels_in_window)) > 1:
                    window_is_valid = False
                    break

                # Positions are relative to the device slice, hence iloc.
                df_slice = devices_data[d_id]['df_slice'].iloc[idx_start:idx_end].copy()
                temp_window_chunks.append(df_slice)

            if window_is_valid:
                # Stack the device chunks vertically under one shared window id.
                win_df = pd.concat(temp_window_chunks)
                win_df['window_id'] = w_id_global
                windows_data.append(win_df)

                w_id_global += 1

            curr_time += step_ms

    print(f"Processamento concluído. {len(windows_data)} janelas geradas.")

    if not windows_data:
        return pd.DataFrame()

    # 5. Final reconstruction
    final_df = pd.concat(windows_data, ignore_index=True)

    # Per-row list columns (one list of axis values per sample) for the
    # multi-axis tsfresh calculators; created only when all axes are present.
    acc_cols = ['accelerometer_x', 'accelerometer_y', 'accelerometer_z']
    gyro_cols = ['gyroscope_x', 'gyroscope_y', 'gyroscope_z']
    mag_cols = ['magnetometer_x', 'magnetometer_y', 'magnetometer_z']

    if all(c in final_df.columns for c in acc_cols):
        final_df['acc_array'] = final_df[acc_cols].values.tolist()
    if all(c in final_df.columns for c in gyro_cols):
        final_df['gyro_array'] = final_df[gyro_cols].values.tolist()
    if all(c in final_df.columns for c in mag_cols):
        final_df['mag_array'] = final_df[mag_cols].values.tolist()
    if all(c in final_df.columns for c in acc_cols + gyro_cols):
        final_df['acc_gyro_array'] = final_df[acc_cols + gyro_cols].values.tolist()

    return final_df


import pandas as pd
import tsfresh
from tsfresh.feature_extraction.feature_calculators import set_property
from tsfresh.feature_extraction import feature_calculators
from tsfresh.utilities.string_manipulation import convert_to_output_format
import numpy as np
from config.constants import ACC_ARRAY_COLUMN, ACC_GYRO_COLUMN, COLUMNS_ACCELEROMETER, COLUMNS_GYROSCOPE, COLUMNS_MAGNETOMETER,DEVICES, GYRO_ARRAY_COLUMN
from tsfresh import extract_features


################################ Physical features #######################################
# =============================================================================
# 1. INTENSIDADE DE MOVIMENTO (MI)
# =============================================================================
@set_property("fctype", "combiner")
def mi_intensity(acc_array,param):
    """
    Movement intensity: mean and variance of the per-sample Euclidean norm.

    acc_array: sequence of per-sample vectors, stacked row-wise with np.vstack
               (expected to give shape (N, 3))
    param: unused
    Returns: [("avg", mean of the norms), ("var", variance of the norms)];
             both are 0.0 when there are no rows
    """
    samples = np.vstack(acc_array).astype(float)
    magnitude = np.linalg.norm(samples, axis=1)
    if not magnitude.size:
        return [("avg", 0.0), ("var", 0.0)]
    return [("avg", float(np.mean(magnitude))), ("var", float(np.var(magnitude)))]

# =============================================================================
# 2. SMA (Signal Magnitude Area)
# =============================================================================

@set_property("fctype", "simple")
def sma(acc_array):
    """
    Signal Magnitude Area: mean over samples of the sum of absolute axis values.

    acc_array: sequence of per-sample vectors, stacked row-wise with np.vstack
    Returns: float, 0.0 when there are no rows
    """
    samples = np.vstack(acc_array).astype(float)
    per_sample_area = np.abs(samples).sum(axis=1)
    if per_sample_area.size == 0:
        return 0.0
    return float(np.mean(per_sample_area))


# =============================================================================
# 3. VALORES PRÓPRIOS EVA
# =============================================================================
@set_property("fctype", "combiner")
def EVA(acc_array,param):
    """
    Smallest and largest eigenvalue of the covariance matrix of the stacked samples.

    acc_array: sequence of per-sample vectors, stacked row-wise with np.vstack
    param: unused
    Returns: [("ver", smallest eigenvalue), ("head", largest eigenvalue)];
             both are 0.0 when fewer than two samples are available
    """
    samples = np.vstack(acc_array).astype(float)
    if samples.shape[0] >= 2:
        # Columns are the variables (rowvar=False); sort ascending so the two
        # extremes sit at the ends.
        covariance = np.cov(samples, rowvar=False)
        eigenvalues = np.sort(np.linalg.eigvalsh(covariance))
        return [("ver", float(eigenvalues[0])), ("head", float(eigenvalues[-1]))]
    return [("ver", 0.0), ("head", 0.0)]


# =============================================================================
# 4. CAGH – Correlação gravidade (aprox eixo X) × heading (norma YZ)
# =============================================================================

@set_property("fctype", "simple")
def cagh(acc_array):
    """
    Correlation between the first axis (taken here as the gravity direction)
    and the discrete derivative of the heading magnitude (norm of axes 1 and 2).

    acc_array: sequence of per-sample vectors, stacked row-wise with np.vstack
    Returns: Pearson correlation as float; 0.0 when there are fewer than three
             columns, fewer than two samples, or either series is constant
    """
    samples = np.vstack(acc_array).astype(float)
    if samples.shape[1] < 3:
        return 0.0

    heading = np.linalg.norm(samples[:, 1:3], axis=1)
    if heading.shape[0] < 2:
        return 0.0

    # np.diff shortens the series by one, so the gravity axis drops its first
    # sample to stay aligned with it.
    heading_step = np.diff(heading)
    gravity_tail = samples[1:, 0]

    # A constant series has no defined correlation.
    if np.std(gravity_tail) == 0 or np.std(heading_step) == 0:
        return 0.0
    return float(np.corrcoef(gravity_tail, heading_step)[0, 1])


# =============================================================================
# 5. AVH – Velocidade média ao longo da direção de avanço (heading)
# =============================================================================

@set_property("fctype", "simple")
def avh(acc_array):
    """
    Mean absolute value of the running sum of the heading magnitude
    (norm of axes 1 and 2), used as a velocity proxy along the heading.

    No time step is applied to the running sum, so the unit is arbitrary.

    acc_array: sequence of per-sample vectors, stacked row-wise with np.vstack
    Returns: float; 0.0 when there are fewer than three columns or no rows
    """
    samples = np.vstack(acc_array).astype(float)
    if samples.shape[1] < 3:
        return 0.0

    running_total = np.cumsum(np.linalg.norm(samples[:, 1:3], axis=1))
    if running_total.size == 0:
        return 0.0
    return float(np.mean(np.abs(running_total)))


# =============================================================================
# 6. AVG – Velocidade média ao longo da direção da gravidade (aprox eixo X)
# =============================================================================

@set_property("fctype", "simple")
def avg(acc_array):
    """
    Mean absolute value of the running sum of the first column, used as a
    velocity proxy along the gravity direction (approximated by the first axis).

    No time step is applied to the running sum, so the unit is arbitrary.

    acc_array: sequence of per-sample values/vectors, stacked row-wise with np.vstack
    Returns: float; 0.0 when there are no rows
    """
    first_axis = np.vstack(acc_array).astype(float)[:, 0]
    running_total = np.cumsum(first_axis)
    if running_total.size == 0:
        return 0.0
    return float(np.mean(np.abs(running_total)))


# =============================================================================
# 7. ARATG – Ângulos médios de rotação (usa gyro_x como rotação no eixo gravidade)
# =============================================================================

@set_property("fctype", "simple")
def aratg(gyro_array):
    """
    Mean absolute value of the first gyroscope column, which this code treats
    as the rotation about the gravity axis.

    No time step is applied, so this is a mean rate rather than an angle.

    gyro_array: sequence of per-sample vectors, stacked row-wise with np.vstack
    Returns: float; 0.0 when there are no rows
    """
    rotation = np.vstack(gyro_array).astype(float)[:, 0]
    rotation_magnitude = np.abs(rotation)
    if rotation_magnitude.size == 0:
        return 0.0
    return float(np.mean(rotation_magnitude))


# =============================================================================
# 8. DF – Dominant Frequency (por eixo individual)
# =============================================================================

@set_property("fctype", "simple")
def dominant_frequency_axis(signal_1d):
    """
    Index of the strongest non-DC bin of the real FFT of the mean-removed signal.

    The bin index is returned as-is; it is not converted to Hz.

    signal_1d: one-dimensional sequence of numbers
    Returns: bin index as float; 0.0 when there are fewer than two samples
    """
    signal = np.asarray(signal_1d, dtype=float)
    if signal.size < 2:
        return 0.0

    spectrum = np.fft.rfft(signal - np.mean(signal))
    power = np.abs(spectrum) ** 2
    power[0] = 0  # the DC bin must never win
    return float(np.argmax(power))


# =============================================================================
# 9. ENERGY – Energia por eixo
# =============================================================================

@set_property("fctype", "simple")
def energy_axis(signal_1d):
    """
    Spectral energy of one axis: sum of the squared magnitudes of the real FFT
    of the mean-removed signal, with the DC bin excluded.

    signal_1d: one-dimensional sequence of numbers
    Returns: float; 0.0 when there are fewer than two samples
    """
    signal = np.asarray(signal_1d, dtype=float)
    if signal.size < 2:
        return 0.0

    centred = signal - np.mean(signal)
    power = np.abs(np.fft.rfft(centred)) ** 2
    power[0] = 0  # drop the DC bin
    return float(np.sum(power))


# =============================================================================
# 10. AAE – Energia média dos 3 eixos do acelerómetro
# =============================================================================

@set_property("fctype", "simple")
def aae(acc_array):
    """
    Average accelerometer energy: mean over samples of the sum of squared axis values.

    acc_array: sequence of per-sample vectors, stacked row-wise with np.vstack
    Returns: float; 0.0 when there are no rows
    """
    samples = np.vstack(acc_array).astype(float)
    per_sample_energy = np.sum(samples ** 2, axis=1)
    if per_sample_energy.size == 0:
        return 0.0
    return float(np.mean(per_sample_energy))


# =============================================================================
# 11. ARE – Energia média dos 3 eixos do giroscópio
# =============================================================================
@set_property("fctype", "simple")
def are(gyro_array):
    """
    Average gyroscope energy: mean over samples of the sum of squared axis values.

    gyro_array: sequence of per-sample vectors, stacked row-wise with np.vstack
    Returns: float; 0.0 when there are no rows
    """
    samples = np.vstack(gyro_array).astype(float)
    per_sample_energy = np.sum(samples ** 2, axis=1)
    if per_sample_energy.size == 0:
        return 0.0
    return float(np.mean(per_sample_energy))

#####################Features estatisticas######################################

@set_property("fctype", "simple")
def mean_crossing_rate(x):
    """
    Number of times the series crosses its own mean.

    Despite the name, the value is a raw count, not normalised by the length.

    x: one-dimensional sequence of numbers
    Returns: int, the number of consecutive pairs lying on opposite sides of the mean
    """
    values = np.asarray(x, dtype=float)
    # True where the sample lies strictly above the mean
    is_above = values > np.mean(values)
    # A crossing is any position where the side differs from the next sample's
    side_changes = is_above[1:] != is_above[:-1]
    return int(np.count_nonzero(side_changes))


@set_property("fctype", "simple")
def spectral_entropy(x: pd.Series) -> float:
    """
    Shannon entropy (base 2) of the normalised power spectrum of the
    mean-removed signal, divided by log2 of the number of spectrum bins.

    x: one-dimensional sequence of numbers
    Returns: float; 0.0 when the spectrum carries no power (constant signal)
    """
    centred = np.array(x, dtype=float)
    centred = centred - centred.mean()

    power = np.abs(np.fft.rfft(centred)) ** 2
    total_power = power.sum()
    if total_power == 0:
        return 0.0

    # Turn the spectrum into a probability distribution; empty bins are dropped
    # because log2(0) is undefined and they contribute nothing.
    weights = power / total_power
    weights = weights[weights > 0]
    entropy_bits = -np.sum(weights * np.log2(weights))
    return float(entropy_bits / np.log2(len(power)))
@set_property("fctype", "simple")
def interq_range(x):
    """
    Interquartile range (IQR = P75 - P25).

    x: one-dimensional sequence of numbers
    Returns: float; 0.0 for an empty input
    """
    values = np.asarray(x, dtype=float)
    if values.size == 0:
        return 0.0

    upper_quartile = np.percentile(values, 75)
    lower_quartile = np.percentile(values, 25)
    return float(upper_quartile - lower_quartile)


@set_property("fctype", "simple")
def averaged_derivative(x: pd.Series,**kwargs) -> float:
    """
    Mean absolute first difference of a 1D signal.

    x: one-dimensional sequence of numbers
    kwargs: accepted and ignored
    Returns: mean of |x[i] - x[i-1]| as float; 0.0 with fewer than two samples
    """
    values = np.asarray(x, dtype=float)
    if values.size >= 2:
        absolute_steps = np.abs(np.diff(values))
        return float(np.mean(absolute_steps))
    return 0.0



@set_property("fctype", "combiner")
def pairwise_correlation(x, param):
    """
    Pearson correlations between every pair of accelerometer / gyroscope axes.

    x: object exposing .tolist() that yields one row per sample; the first three
       values of a row are read as accelerometer axes and the rest as gyroscope
       axes (expected N x 6)
    param: unused
    Returns: list of (name, value) tuples, in this order:
             corr_acc_xy, corr_acc_xz, corr_acc_yz,
             corr_gyro_xy, corr_gyro_xz, corr_gyro_yz,
             then acc_<a>_gyro_<g> for every a, g in x, y, z
    """
    samples = np.array(x.tolist())  # N x 6
    acc, gyro = samples[:, :3], samples[:, 3:]

    axes = ["x", "y", "z"]
    upper_triangle = [(0, 1), (0, 2), (1, 2)]

    features = []

    # Correlations inside each sensor: one matrix per sensor, three distinct pairs
    for sensor_name, sensor_block in (("acc", acc), ("gyro", gyro)):
        corr_matrix = np.corrcoef(sensor_block, rowvar=False)
        for i, j in upper_triangle:
            features.append((f"corr_{sensor_name}_{axes[i]}{axes[j]}", corr_matrix[i, j]))

    # Correlations across sensors: every accelerometer axis against every gyroscope axis
    for i, a in enumerate(axes):
        for j, g in enumerate(axes):
            cross = np.corrcoef(acc[:, i], gyro[:, j])[0, 1]
            features.append((f'acc_{a}_gyro_{g}', cross))

    return features



# Expose the custom calculators as attributes of tsfresh's feature_calculators
# module under the names used as keys in the feature-settings dictionaries.
_CUSTOM_CALCULATORS = {
    # statistical features
    'mean_crossing_rate': mean_crossing_rate,
    'averaged_derivative': averaged_derivative,
    'spectral_entropy': spectral_entropy,
    'pairwise_correlation': pairwise_correlation,
    'interq_range': interq_range,
    # physical features
    'mi_intensity': mi_intensity,
    'sma': sma,
    'EVA': EVA,
    'cagh': cagh,
    'avh': avh,
    'aae': aae,
    'avg': avg,
    'aratg': aratg,
    'are': are,
    'dominant_frequency_axis': dominant_frequency_axis,
    'energy_axis': energy_axis,
}
for _calculator_name, _calculator in _CUSTOM_CALCULATORS.items():
    setattr(tsfresh.feature_extraction.feature_calculators, _calculator_name, _calculator)






def to_long_format(df, label_col='actitivy_label'):
    """
    Reshape the windowed samples into the long (id, time, kind, value) layout
    and extract the label of every window.

    df: windowed DataFrame with 'participant_id', 'device_id', 'window_id',
        the label column and the sensor columns
    label_col: name of the label column
    Returns: (long_df, labels_map) where long_df has the columns
             'id', 'time', 'kind', 'value' and labels_map is a Series indexed
             by id holding the label of the first row of each id
    """
    df = df.copy()

    def _as_int_text(column):
        # float -> int -> str gives a clean "14" instead of "14.0"
        return df[column].astype(float).astype(int).astype(str)

    # 1. Unique series id "<participant>_<device>_<window>", e.g. "14_1_0"
    df['id'] = (
        _as_int_text('participant_id') + "_"
        + _as_int_text('device_id') + "_"
        + _as_int_text('window_id')
    )

    # 2. id -> label. Windows are assumed to hold a single label, so the first
    #    row of each id is representative.
    first_rows = df.drop_duplicates(subset=['id'])
    labels_map = first_rows.set_index('id')[label_col]

    # 3. Time axis: running position 0, 1, 2, ... of the row inside its id
    df['time'] = df.groupby('id').cumcount()

    # 4. Melt only the sensor columns that are actually present
    candidate_kinds = (
        COLUMNS_ACCELEROMETER +
        COLUMNS_GYROSCOPE +
        COLUMNS_MAGNETOMETER +
        [ACC_GYRO_COLUMN, ACC_ARRAY_COLUMN, GYRO_ARRAY_COLUMN]
    )
    kinds_present = [kind for kind in candidate_kinds if kind in df.columns]

    long_df = pd.melt(
        df,
        id_vars=['id', 'time'],
        value_vars=kinds_present,
        var_name='kind',
        value_name='value'
    )

    return long_df, labels_map


def df_to_tsfresh_format(df):
    """
    Stack every value column of df into the long (id, time, kind, value) layout.

    df: DataFrame with 'id' and 'time' columns plus any number of value columns;
        'window_id', 'device_id' and 'participant_id' are skipped as well
    Returns: DataFrame with the columns 'id', 'time', 'kind', 'value', where
             'kind' holds the name of the source column, blocks follow the
             column order of df and the index is renumbered from 0.
             An empty DataFrame when df has no value column.
    """
    skipped = ("window_id", "time", "device_id", "id", "participant_id")

    # Collect one block per value column and concatenate once at the end;
    # growing the frame with pd.concat inside the loop re-copies all previous
    # rows on every iteration.
    blocks = [
        pd.DataFrame({
            "id": df["id"],
            "time": df["time"],
            "kind": col,
            "value": df[col]
        })
        for col in df.columns
        if col not in skipped
    ]

    # pd.concat rejects an empty list, so keep returning an empty frame here.
    if not blocks:
        return pd.DataFrame()

    return pd.concat(blocks, ignore_index=True)


def extract_features_sensors(dataframe, window_size, step_size):
    """
    Window the raw sensor data and extract the statistical and physical
    features of every (participant, device, window) series with tsfresh.

    dataframe: raw samples, as accepted by create_sliding_windows
    window_size: window length, forwarded as window_ms
    step_size: distance between window starts, forwarded as step_ms
    Returns: tsfresh feature DataFrame indexed by the series id
             "<participant>_<device>_<window>", with an extra 'activity'
             column holding the label of each series
    """
    # 1) Build the windows
    print("starting")
    df_windowed = create_sliding_windows(
        df=dataframe,
        window_ms=window_size,
        step_ms=step_size,
    )

    axis_columns = COLUMNS_ACCELEROMETER + COLUMNS_GYROSCOPE

    # 2) Keep the sensor, key and label columns and reshape to long format
    selected = df_windowed[
        axis_columns +
        COLUMNS_MAGNETOMETER +
        ["device_id","window_id","participant_id","actitivy_label", ACC_GYRO_COLUMN, ACC_ARRAY_COLUMN, GYRO_ARRAY_COLUMN]
    ]
    long_df, labels = to_long_format(selected)

    # 3) Feature set applied to every individual accelerometer / gyroscope axis:
    #    the statistical features followed by the two spectral "physical" ones.
    per_axis_features = {
        "mean": None,
        "median": None,
        "standard_deviation": None,
        "variance": None,
        "root_mean_square": None,
        "skewness": None,
        "kurtosis": None,
        "number_crossing_m": [{"m": 0}],   # zero-crossing
        "mean_crossing_rate": None,
        "averaged_derivative": None,
        "spectral_entropy": None,
        "interq_range": None,
        "dominant_frequency_axis": None,
        "energy_axis": None,
    }
    # Each axis gets its own dict so per-axis additions do not leak to the others.
    kind_to_fc_parameters = {axis: dict(per_axis_features) for axis in axis_columns}

    # 4) Combined acc + gyro rows: pairwise correlations
    kind_to_fc_parameters[ACC_GYRO_COLUMN] = {
        "pairwise_correlation": None
    }

    # 5) 3D accelerometer rows: physical features
    kind_to_fc_parameters[ACC_ARRAY_COLUMN] = {
        "mi_intensity": None,
        "sma": None,
        "EVA": None,
        "cagh": None,
        "avh": None,
        "aae": None,
    }

    # gyroscope_x additionally receives `avg`.
    # NOTE(review): the header comment of `avg` describes it as an accelerometer
    # feature, yet it is attached to the scalar gyroscope_x series — confirm intent.
    if "gyroscope_x" in kind_to_fc_parameters:
        kind_to_fc_parameters["gyroscope_x"]["avg"] = None

    # 3D gyroscope rows
    kind_to_fc_parameters[GYRO_ARRAY_COLUMN] = {
        "aratg": None,
        "are": None
    }

    # 6) Final extraction
    print("Extracting features...")
    features = extract_features(
        long_df,
        column_id="id",
        column_sort="time",
        column_kind="kind",
        column_value="value",
        kind_to_fc_parameters=kind_to_fc_parameters,
        n_jobs=0
    )

    # Both objects are indexed by the series id, so the labels align on assignment.
    features['activity'] = labels
    return features

In [7]:
# Run the full windowing + feature extraction (the progress bar in the output
# below reports about 24 minutes for this call).
features = extract_features_sensors(dataset,2000, 1000) ## 2-second window, 1-second overlap

starting
A preparar dados e detetar gaps...
Processamento concluído. 12337 janelas geradas.
Extracting features...


Feature Extraction: 100%|██████████| 740220/740220 [24:29<00:00, 503.85it/s] 


In [8]:
def create_clean_database(features):
    """
    Split the "<participant>_<device>_<window>" id of every row into three
    integer columns placed at the front of the table.

    The id is read from the 'Unnamed: 0' column when it exists (that column is
    then dropped), otherwise from the index. The input frame is not modified.

    features: feature DataFrame identified by the composite id
    Returns: copy of features with 'participant_id', 'device_id', 'window_id'
             as the first three columns and a fresh 0..N-1 index
    """
    table = features.copy()

    if 'Unnamed: 0' in table.columns:
        # pop removes the column from the copy and hands back its values
        raw_ids = table.pop('Unnamed: 0').astype(str)
    else:
        raw_ids = table.index.to_series().astype(str)

    id_parts = raw_ids.str.split('_', expand=True)

    # Part k of the id becomes column k of the result.
    for position, column_name in enumerate(('participant_id', 'device_id', 'window_id')):
        table.insert(position, column_name, pd.to_numeric(id_parts[position]).astype(int))

    return table.reset_index(drop=True)


# Turn the composite series id into participant_id / device_id / window_id columns.
df_clean = create_clean_database(features)


In [9]:
import pandas as pd

def reshape_features_robust(df_clean):
    """
    Pivot the per-device feature rows into one wide row per participant and
    window, with every feature column prefixed by its device ("S<device>_...").

    df_clean: DataFrame with 'participant_id', 'device_id', 'window_id', the
              feature columns and a label column ('activity_label', 'activity'
              or 'actitivy_label')
    Returns: wide DataFrame with 'participant_id', 'window_id' (the window's
             position within its participant/device group), one
             "S<device>_<feature>" column per device and feature, and the
             label column; None when no label column is found
    """
    frame = df_clean.copy()
    key_columns = ['participant_id', 'device_id', 'window_id']

    # 1. Types and ordering. A failed cast is only reported; columns converted
    #    before the failure stay converted.
    try:
        for key in key_columns:
            frame[key] = frame[key].astype(float).astype(int)
    except Exception as e:
        print(f"Aviso tipos: {e}")

    frame = frame.sort_values(by=key_columns)

    # 2. Label column: first known spelling that is present
    lbl_col = next(
        (name for name in ('activity_label', 'activity', 'actitivy_label') if name in frame.columns),
        None,
    )
    if lbl_col is None:
        print("ERRO: Coluna de atividade não encontrada!")
        return None

    # 3. Aligned window id: position of the window inside its
    #    (participant, device) group. This only lines devices up when every
    #    device of a participant has the same number of windows.
    frame['aligned_window_id'] = frame.groupby(['participant_id', 'device_id']).cumcount()
    row_key = ['participant_id', 'aligned_window_id']

    # One label per output row: the first one met (windows are assumed to
    # carry the same label on every device).
    y_labels = frame.groupby(row_key)[lbl_col].first()

    # 4. Everything that is neither a key nor the label is a feature
    non_features = ('participant_id', 'window_id', 'device_id', lbl_col, 'id', 'Unnamed: 0', 'aligned_window_id')
    feature_cols = [column for column in frame.columns if column not in non_features]

    # 5. Pivot: one column per (feature, device)
    wide = frame.pivot(index=row_key, columns='device_id', values=feature_cols)

    # 6. Flatten (feature, device) into "S<device>_<feature>"
    wide.columns = [f"S{int(dev)}_{feat}" for feat, dev in wide.columns]

    # 7. Put the labels back (same index on both sides) and tidy the keys
    wide = wide.join(y_labels).reset_index()
    return wide.rename(columns={'aligned_window_id': 'window_id'})

# Testar
# Build the wide table and sanity-check it
X_final_550 = reshape_features_robust(df_clean)
print(f"Shape Final Obtido: {X_final_550.shape}")

# Preview the keys together with the label column. The label may be named
# 'activity' (as set by extract_features_sensors), so look for every spelling
# reshape_features_robust accepts instead of only 'activity_label'.
label_columns_present = [
    name for name in ('activity_label', 'activity', 'actitivy_label')
    if name in X_final_550.columns
]
if label_columns_present:
    print(X_final_550[['participant_id', 'window_id', label_columns_present[0]]].head())

Shape Final Obtido: (12337, 553)


In [10]:
# Save the wide feature table to the current working directory, without the index column.
X_final_550.to_csv('X_final_550.csv', index=False)