In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.special
from sklearn.mixture import GaussianMixture

### load data

In [2]:
df = pd.read_csv('data/cycling 2024-06-07 12-40-37/merged.csv')
df.head()

Unnamed: 0,Time window,Time (s)_acc,Time (s)_gyro,Time (s)_mag,Time (s)_lin_acc,Time (s)_prox,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (rad/s),Gyroscope y (rad/s),Gyroscope z (rad/s),Magnetic field x (µT),Magnetic field y (µT),Magnetic field z (µT),Linear Acceleration x (m/s^2),Linear Acceleration y (m/s^2),Linear Acceleration z (m/s^2),Distance (cm)
0,0.03,,0.052486,,,,,,,-0.003665,-0.595118,0.107356,,,,,,,
1,0.06,0.073536,,,,,-2.012962,3.975974,8.375178,,,,,,,,,,
2,0.09,,0.104155,,0.11792,,,,,-0.117435,-0.919478,0.298399,,,,0.249222,0.435787,0.179174,
3,0.12,0.123536,,0.130751,,,-1.18684,4.208377,9.069396,,,,15.1768,-40.2966,-38.539799,,,,
4,0.15,0.173536,0.154962,,0.16792,,-1.727917,3.549752,8.52114,-0.142786,-0.889394,0.224028,,,,-0.620777,-0.062557,0.038412,


In [3]:
data_cols = [c for c in df.columns if not c.startswith('Time')]

### remove noise

In [4]:
def chauvenet(data_table, col, C):
        mean = data_table[col].mean()
        std = data_table[col].std()
        N = len(data_table.index)
        criterion = 1.0/(C*N)

        deviation = abs(data_table[col] - mean)/std

        low = -deviation/np.sqrt(C)
        high = deviation/np.sqrt(C)
        prob = []
        mask = []

        # Pass all rows in the dataset.
        for i in range(0, len(data_table.index)):
            # Determine the probability of observing the point
            prob.append(
                1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i])))
            # And mark as an outlier when the probability is below our criterion.
            mask.append(prob[i] < criterion)
        data_table[col + '_outlier'] = mask
        return data_table

In [13]:
chauvenets_value = 2 # recommended 1 to 10
for c in data_cols:
    df = chauvenet(df, c, chauvenets_value)
df.head()

Unnamed: 0,Time window,Time (s)_acc,Time (s)_gyro,Time (s)_mag,Time (s)_lin_acc,Time (s)_prox,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (rad/s),...,Gyroscope x (rad/s)_outlier,Gyroscope y (rad/s)_outlier,Gyroscope z (rad/s)_outlier,Magnetic field x (µT)_outlier,Magnetic field y (µT)_outlier,Magnetic field z (µT)_outlier,Linear Acceleration x (m/s^2)_outlier,Linear Acceleration y (m/s^2)_outlier,Linear Acceleration z (m/s^2)_outlier,Distance (cm)_outlier
0,0.03,,0.052486,,,,,,,-0.003665,...,False,False,False,False,False,False,False,False,False,False
1,0.06,0.073536,,,,,-2.012962,3.975974,8.375178,,...,False,False,False,False,False,False,False,False,False,False
2,0.09,,0.104155,,0.11792,,,,,-0.117435,...,False,False,False,False,False,False,False,False,False,False
3,0.12,0.123536,,0.130751,,,-1.18684,4.208377,9.069396,,...,False,False,False,False,False,False,False,False,False,False
4,0.15,0.173536,0.154962,,0.16792,,-1.727917,3.549752,8.52114,-0.142786,...,False,False,False,False,False,False,False,False,False,False


In [14]:
for c in [c for c in df.columns if c.endswith('outlier')]:
    print(c, df[c].sum(), sep='\t')

Acceleration x (m/s^2)_outlier	25
Acceleration y (m/s^2)_outlier	6
Acceleration z (m/s^2)_outlier	8
Gyroscope x (rad/s)_outlier	0
Gyroscope y (rad/s)_outlier	14
Gyroscope z (rad/s)_outlier	8
Magnetic field x (µT)_outlier	0
Magnetic field y (µT)_outlier	27
Magnetic field z (µT)_outlier	29
Linear Acceleration x (m/s^2)_outlier	13
Linear Acceleration y (m/s^2)_outlier	13
Linear Acceleration z (m/s^2)_outlier	2
Distance (cm)_outlier	0


In [16]:
def mixture_model(data_table, col):
        data = data_table[data_table[col].notnull()][col]
        g = GaussianMixture(n_components=3, max_iter=100, n_init=1)
        reshaped_data = np.array(data.values.reshape(-1, 1))
        g.fit(reshaped_data)

        # Predict the probabilities
        probs = g.score_samples(reshaped_data)

        # Create the right data frame and concatenate the two.
        data_probs = pd.DataFrame(
            np.power(10, probs), index=data.index, columns=[col+'_mixture'])

        data_table = pd.concat([data_table, data_probs], axis=1)

        return data_table

In [17]:
for c in data_cols:
    df = mixture_model(df, c)

for c in [c for c in df.columns if c.endswith('mixture')]:
    print(c, df[c].sum(), sep='\t')

Applying mixture models
Applying mixture models
Applying mixture models
Applying mixture models
Applying mixture models
Applying mixture models
Applying mixture models
Applying mixture models
Applying mixture models
Applying mixture models
Applying mixture models
Applying mixture models
Applying mixture models
Acceleration x (m/s^2)_mixture	149.51361671721185
Acceleration y (m/s^2)_mixture	19.148895535592267
Acceleration z (m/s^2)_mixture	93.6459894197704
Gyroscope x (rad/s)_mixture	112.46636255875437
Gyroscope y (rad/s)_mixture	2332.9018934121423
Gyroscope z (rad/s)_mixture	1714.2802787338433
Magnetic field x (µT)_mixture	9.107803416343007
Magnetic field y (µT)_mixture	0.8095648516423108
Magnetic field z (µT)_mixture	7.048569722248847
Linear Acceleration x (m/s^2)_mixture	194.20532806610336
Linear Acceleration y (m/s^2)_mixture	89.29276697622183
Linear Acceleration z (m/s^2)_mixture	62.42239752232537
Distance (cm)_mixture	1138154.6749013015


  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,


### fill nans

In [26]:
for col in data_cols:
    df[col] = df[col].interpolate()
    df[col] = df[col].fillna(method='bfill')