In [5]:
import pandas as pd
import numpy as np
from scipy.stats import entropy
from scipy.signal import find_peaks

# Load the dataset
df = pd.read_csv('Dataset/dataset.csv')

# Function to remove outliers
def remove_outliers(df, columns):
    for column in columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# Remove outliers from specific columns
columns_to_check = ['X', 'Y', 'Z', 'Mixed']
df_cleaned = remove_outliers(df, columns_to_check)

# Define a function to extract statistical features
def extract_statistical_features(data):
    mean = np.mean(data)
    std_dev = np.std(data)
    energy = np.sum(data**2)
    ent = entropy(data)
    peaks, _ = find_peaks(data)
    num_peaks = len(peaks)
    return mean, std_dev, energy, ent, num_peaks

# Group by 'Subject' and apply the feature extraction to each group
statistical_features = df_cleaned.groupby('ClassLabel').apply(
    lambda group: pd.Series({
        'mean_X': group['X'].mean(),
        'std_dev_X': group['X'].std(),
        'energy_X': np.sum(group['X']**2),
        'entropy_X': entropy(group['X']),
        'num_peaks_X': len(find_peaks(group['X'])[0]),
        'mean_Y': group['Y'].mean(),
        'std_dev_Y': group['Y'].std(),
        'energy_Y': np.sum(group['Y']**2),
        'entropy_Y': entropy(group['Y']),
        'num_peaks_Y': len(find_peaks(group['Y'])[0]),
        'mean_Z': group['Z'].mean(),
        'std_dev_Z': group['Z'].std(),
        'energy_Z': np.sum(group['Z']**2),
        'entropy_Z': entropy(group['Z']),
        'num_peaks_Z': len(find_peaks(group['Z'])[0]),
        'mean_Mixed': group['Mixed'].mean(), # If there is a 'Mixed' column
        'std_dev_Mixed': group['Mixed'].std(),
        'energy_Mixed': np.sum(group['Mixed']**2),
        'entropy_Mixed': entropy(group['Mixed']),
        'num_peaks_Mixed': len(find_peaks(group['Mixed'])[0])
    })
).reset_index()

# Save the extracted features to a new CSV file
statistical_features.to_csv('Dataset/statistical_features_by_subject.csv', index=False)

print(statistical_features)


   ClassLabel    mean_X  std_dev_X   energy_X  entropy_X  num_peaks_X  \
0           1  0.010924   0.122839  10.646201       -inf        151.0   
1           2 -0.000676   0.026156   0.637368       -inf        222.0   
2           3 -0.015991   0.128470   9.067586       -inf        144.0   
3           4  0.000201   0.104740  10.334273       -inf        237.0   
4           5  0.001170   0.117740   9.690939       -inf        171.0   

     mean_Y  std_dev_Y   energy_Y  entropy_Y  ...    mean_Z  std_dev_Z  \
0  0.017036   0.170924  20.653846       -inf  ... -0.012713   0.101693   
1 -0.000896   0.061759   3.551695       -inf  ... -0.000516   0.016447   
2  0.016216   0.204910  22.858013       -inf  ...  0.000674   0.103162   
3  0.012393   0.149617  21.231634       -inf  ...  0.001852   0.075815   
4  0.008643   0.194857  26.592696       -inf  ... -0.017182   0.080786   

   energy_Z  entropy_Z  num_peaks_Z  mean_Mixed  std_dev_Mixed  energy_Mixed  \
0  7.352300       -inf        185.0 

  statistical_features = df_cleaned.groupby('ClassLabel').apply(
