# 01 Data Quality Assessment Series Data

Import of libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

Define paths to data files

In [None]:
# Locations of the two input files, relative to the working directory.
# The events file is read with pd.read_csv, the sensor file with pd.read_parquet.
# NOTE(review): presumably the notebook is expected to be run from its own folder — confirm.
path_events_csv = '../data/train_events.csv'
path_sensor_parquet = '../data/train_series.parquet'

## Length of series

In [None]:
# Load the annotated events and parse their timestamps.
# utc=True converts every timestamp to one common timezone, so the column gets
# a real datetime dtype even if the file mixes UTC offsets.
# NOTE(review): mixed offsets are presumed from the per-element tzinfo handling
# used elsewhere in this notebook — confirm against the data. Durations computed
# from the column are unaffected because they are differences of absolute times.
train_events = pd.read_csv(path_events_csv)
train_events['timestamp'] = pd.to_datetime(train_events['timestamp'], utc=True)

In [None]:
# Discard every event row that contains at least one missing value.
train_events = train_events.dropna()

In [None]:
# First and last event timestamp of every series, via named aggregation so the
# result already carries the flat column names 'timestamp_min' / 'timestamp_max'.
timestamps_per_series = train_events.groupby('series_id')['timestamp']
min_max_timestamp_of_series = timestamps_per_series.agg(timestamp_min='min', timestamp_max='max')

# Span between first and last event, as a timedelta and as whole days.
series_span = min_max_timestamp_of_series['timestamp_max'] - min_max_timestamp_of_series['timestamp_min']
min_max_timestamp_of_series['timestamp_diff'] = series_span
min_max_timestamp_of_series['timestamp_diff_days'] = [span.days for span in series_span]

In [None]:
# Descriptive statistics of the series length (in whole days).
length_in_days = min_max_timestamp_of_series['timestamp_diff_days']

# Compute everything first ...
mean_value = length_in_days.mean()
max_value = length_in_days.max()
min_value = length_in_days.min()
median_value = length_in_days.median()
std_deviation = length_in_days.std()
variance_value = length_in_days.var()
sum_value = length_in_days.sum()

# ... then report in a fixed order.
print(f"Mean: {mean_value}")
print(f"Max: {max_value}")
print(f"Min: {min_value}")
print(f"Median: {median_value}")
print(f"Standard Deviation: {std_deviation}")
print(f"Variance: {variance_value}")
print(f"Sum: {sum_value}")

In [None]:
# Histogram of the series lengths in days.
fig, ax = plt.subplots()
ax.hist(min_max_timestamp_of_series['timestamp_diff_days'], bins=15, color='blue', edgecolor='none')
ax.set(xlabel='Days', ylabel='Amount of series', title='Length of series')

plt.show()

## ENMO and Angle-Z

In [None]:
# Load only the two sensor columns that are analysed below. Passing `columns`
# to the reader avoids materialising every other column of the parquet file
# just to discard it again.
enmo_anglez = pd.read_parquet(path_sensor_parquet, columns=['enmo', 'anglez'])

In [None]:
# Build a ydata-profiling report for the two sensor columns; as the last
# expression of the cell, the report object is what the notebook displays.
ProfileReport(enmo_anglez)

In [None]:
# ENMO distribution: a horizontal box plot on top and a histogram with a
# log-scaled y axis below; both panels share the x axis.
fig, ax = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.3, .7)})
enmo_values = enmo_anglez['enmo']

ax[0].boxplot(enmo_values, vert=False, widths=0.5, patch_artist=True, showfliers=True)

# The box plot is shown without frame and without ticks.
for side in ('top', 'right', 'bottom', 'left'):
    ax[0].spines[side].set_visible(False)

ax[0].set_xticks([])
ax[0].set_yticks([])

ax[0].set_title('ENMO distribution')

# Weight every sample with 100 / N so that the bar heights are genuine
# percentages, matching the y-axis label (the weights used to sum to 1).
percent_weights = np.full(len(enmo_values), 100.0) / len(enmo_values)
ax[1].hist(enmo_values, bins=20, color='blue', edgecolor='none', weights=percent_weights)

ax[1].set_xlabel('ENMO value')
ax[1].set_yscale('log')
ax[1].set_ylabel('Percentage of datapoints')
ax[1].set_xticks(range(0, 13))

plt.show()

In [None]:
# Angle-Z distribution: a horizontal box plot on top and a histogram below;
# both panels share the x axis.
fig, ax = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.3, .7)})
anglez_values = enmo_anglez['anglez']

ax[0].boxplot(anglez_values, vert=False, widths=0.5, patch_artist=True, showfliers=True)

# The box plot is shown without frame and without ticks.
for side in ('top', 'right', 'bottom', 'left'):
    ax[0].spines[side].set_visible(False)

ax[0].set_xticks([])
ax[0].set_yticks([])

ax[0].set_title('Angle-Z distribution')

# Weight every sample with 100 / N so that the bar heights are genuine
# percentages, matching the y-axis label (the weights used to sum to 1).
percent_weights = np.full(len(anglez_values), 100.0) / len(anglez_values)
ax[1].hist(anglez_values, bins=20, color='blue', edgecolor='none', weights=percent_weights)

ax[1].set_xlabel('Angle-Z value')
ax[1].set_ylabel('Percentage of datapoints')
# Ticks every 30 degrees: symmetric around 0 and including both -90 and +90
# (the former step of 25 ended at 85 and was off-centre).
ax[1].set_xticks(range(-90, 91, 30))

plt.show()

In [None]:
# Example series that is inspected in the following plots.
series = '44d8c02b369e'

# The filter is handed to the parquet reader so only this series is loaded.
train_series = pd.read_parquet(path_sensor_parquet, filters=[('series_id', '=', series)])

# Parse the timestamps, then strip the timezone information from every value
# (replace(tzinfo=None) keeps the wall-clock reading unchanged).
parsed_timestamps = pd.to_datetime(train_series['timestamp'])
train_series['timestamp'] = parsed_timestamps.apply(lambda stamp: stamp.replace(tzinfo=None))

# Rows strictly inside the plotting window.
after_start = train_series['timestamp'] > '2018-11-13 15:30:00'
before_end = train_series['timestamp'] < '2018-11-17 15:30:00'
plot_data = train_series[after_start & before_end]

In [None]:
# Events of the example series without incomplete rows. The explicit copy
# makes the frame independent of the query result, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
train_events = (
    pd.read_csv(path_events_csv)
    .query('series_id == @series')
    .dropna()
    .copy()
)

# Parse the timestamps, then strip the timezone information from every value
# (replace(tzinfo=None) keeps the wall-clock reading unchanged).
train_events['timestamp'] = pd.to_datetime(train_events['timestamp'])
train_events['timestamp'] = train_events['timestamp'].apply(lambda x: x.replace(tzinfo=None))

# Keep only events strictly inside the plotting window.
train_events = train_events[(train_events['timestamp'] > '2018-11-13 15:30:00') & (train_events['timestamp'] < '2018-11-17 15:30:00')]

In [None]:
# ENMO signal of the example series inside the selected window.
# NOTE(review): the window used to build plot_data runs from 2018-11-13 to
# 2018-11-17, while the title says "three days" — confirm which is intended.
fig = plt.figure(figsize=(22, 4))
axis = fig.gca()
axis.plot(plot_data['timestamp'], plot_data['enmo'], label='ENMO', linewidth=0.5, color='black')

# Keep the first handle per label so every legend entry appears only once.
first_handle_by_label = {}
for handle, label in zip(*axis.get_legend_handles_labels()):
    first_handle_by_label.setdefault(label, handle)

axis.set_ylim(0, 1)

axis.legend(list(first_handle_by_label.values()), list(first_handle_by_label), fontsize=20)
axis.set_xlabel('Timestamp', fontsize=20, labelpad=15)
axis.set_ylabel('ENMO value', fontsize=20, labelpad=20)
axis.set_title('ENMO value over three days', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

In [None]:
# Angle-Z signal of the example series inside the selected window.
fig = plt.figure(figsize=(22, 4))
axis = fig.gca()
axis.plot(plot_data['timestamp'], plot_data['anglez'], label='Angle-Z', linewidth=0.5, color='black')

# Keep the first handle per label so every legend entry appears only once.
first_handle_by_label = {}
for handle, label in zip(*axis.get_legend_handles_labels()):
    first_handle_by_label.setdefault(label, handle)

axis.set_ylim(-90, 90)

axis.legend(list(first_handle_by_label.values()), list(first_handle_by_label), fontsize=20)
axis.set_xlabel('Timestamp', fontsize=20, labelpad=15)
axis.set_ylabel('Angle-Z value', fontsize=20, labelpad=20)
axis.set_title('Angle-Z value over three days', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

In [None]:
# Angle-Z signal of the example series with the annotated events drawn as
# vertical lines (green: onset, orange: wakeup).
is_onset = train_events['event'] == 'onset'
is_wakeup = train_events['event'] == 'wakeup'
onsets = train_events[is_onset]
wakeups = train_events[is_wakeup]

fig = plt.figure(figsize=(22, 4))
axis = fig.gca()

for event_time in onsets['timestamp']:
    axis.axvline(x=event_time, color='darkgreen', linestyle='-', label='onset event', linewidth=3.5)

for event_time in wakeups['timestamp']:
    axis.axvline(x=event_time, color='darkorange', linestyle='-', label='wakeup event', linewidth=3.5)

axis.plot(plot_data['timestamp'], plot_data['anglez'], label='Angle-Z', linewidth=0.5, color='black')

# Every event line carries the same label; keep the first handle per label so
# each legend entry appears only once.
first_handle_by_label = {}
for handle, label in zip(*axis.get_legend_handles_labels()):
    first_handle_by_label.setdefault(label, handle)

axis.set_ylim(-90, 90)

axis.legend(list(first_handle_by_label.values()), list(first_handle_by_label), fontsize=20)
axis.set_xlabel('Timestamp', fontsize=20, labelpad=15)
axis.set_ylabel('Angle-Z value', fontsize=20, labelpad=20)
axis.set_title('Angle-Z value over three days with events', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

In [None]:
# ENMO signal of the example series with the annotated events drawn as
# vertical lines (green: onset, orange: wakeup).
is_onset = train_events['event'] == 'onset'
is_wakeup = train_events['event'] == 'wakeup'
onsets = train_events[is_onset]
wakeups = train_events[is_wakeup]

fig = plt.figure(figsize=(22, 4))
axis = fig.gca()

for event_time in onsets['timestamp']:
    axis.axvline(x=event_time, color='darkgreen', linestyle='-', label='onset event', linewidth=3.5)

for event_time in wakeups['timestamp']:
    axis.axvline(x=event_time, color='darkorange', linestyle='-', label='wakeup event', linewidth=3.5)

axis.plot(plot_data['timestamp'], plot_data['enmo'], label='ENMO', linewidth=0.5, color='black')

# Every event line carries the same label; keep the first handle per label so
# each legend entry appears only once.
first_handle_by_label = {}
for handle, label in zip(*axis.get_legend_handles_labels()):
    first_handle_by_label.setdefault(label, handle)

axis.set_ylim(0, 1)

axis.legend(list(first_handle_by_label.values()), list(first_handle_by_label), fontsize=20)
axis.set_xlabel('Timestamp', fontsize=20, labelpad=15)
axis.set_ylabel('ENMO value', fontsize=20, labelpad=20)
axis.set_title('ENMO value over three days with events', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

## Distribution of data over time

In [None]:
# Stride used to thin out every series.
# NOTE(review): 17280 rows presumably equal one day at one row per 5 seconds
# (24 * 60 * 60 / 5) — confirm against the data.
steps_per_day = 17280

# Only the two columns needed here are requested from the reader, instead of
# loading the complete file and discarding the rest afterwards.
train_series = pd.read_parquet(path_sensor_parquet, columns=['series_id', 'timestamp'])

# Keep every steps_per_day-th row of each series, starting with its first row.
train_series = train_series.groupby('series_id').apply(lambda x: x.iloc[0::steps_per_day]).reset_index(drop=True)

# Parse the timestamps, then strip the timezone information from every value
# (replace(tzinfo=None) keeps the wall-clock reading unchanged).
train_series['timestamp'] = pd.to_datetime(train_series['timestamp'])
train_series['timestamp'] = train_series['timestamp'].apply(lambda x: x.replace(tzinfo=None))

In [None]:
# Earliest timestamp among the sampled rows of all series.
train_series['timestamp'].min()

In [None]:
# Latest timestamp among the sampled rows of all series.
train_series['timestamp'].max()

In [None]:
# Whole days between the earliest and the latest sampled timestamp.
(train_series['timestamp'].max() - train_series['timestamp'].min()).days

In [None]:
# Availability matrix: one row per series, one column per calendar day between
# the earliest and the latest sampled timestamp, initialised with 0.
#
# Both bounds are normalised to midnight first. Without that, date_range steps
# from the time of day of the earliest timestamp and drops the final calendar
# day whenever the latest timestamp has an earlier time of day.
first_day = train_series['timestamp'].min().normalize()
last_day = train_series['timestamp'].max().normalize()
column_names = pd.date_range(start=first_day, end=last_day, freq='D').strftime('%Y-%m-%d')

# Build the zero-filled frame in one step instead of appending row after row.
df = pd.DataFrame(0, index=train_series['series_id'].unique(), columns=column_names)

In [None]:
# Mark every day inside a series' own interval (first to last sampled row):
#    1 -> at least one sampled record exists on that day
#   -1 -> the day lies inside the interval but has no sampled record
# Days outside the interval keep their initial value.
for serie in train_series['series_id'].unique():
    # Select the timestamps directly; no helper column is written onto a
    # filtered slice, which used to raise pandas' SettingWithCopyWarning.
    serie_timestamps = train_series.loc[train_series['series_id'] == serie, 'timestamp']

    # Set membership replaces a scan of the whole column for every single day.
    recorded_dates = set(serie_timestamps.dt.strftime('%Y-%m-%d'))

    serie_interval = pd.date_range(start=serie_timestamps.min(), end=serie_timestamps.max(), freq='D').strftime('%Y-%m-%d')

    for day in serie_interval:
        df.at[serie, day] = 1 if day in recorded_dates else -1

def set_value(x):
    """Return 1 if *x* is greater than 0.5, otherwise 0."""
    return 1 if x > 0.5 else 0

# Collapse the matrix to 0/1 by applying set_value to every single cell.
df = df.applymap(set_value)

In [None]:
# Two-colour palette (black, white) for the binary availability matrix.
custom_palette = sns.color_palette(["#000000", "#FFFFFF"])

# Rows are clustered, columns keep their order.
clustermap_options = dict(
    col_cluster=False,
    figsize=(15, 8),
    cmap=custom_palette,
    cbar_kws={"ticks": [0.25, 0.75], "drawedges": True},
    yticklabels=10,
    xticklabels=50,
    cbar_pos=(0.05, 0.6, 0.05, 0.18),
)
cg = sns.clustermap(df, **clustermap_options)

# Hide the row dendrogram and label the two colour-bar ticks.
cg.ax_row_dendrogram.set_visible(False)
cg.ax_cbar.set_yticklabels(['No data available', 'Data available'])

# Draw a full frame around the colour bar.
for side in ('top', 'right', 'bottom', 'left'):
    cg.ax_cbar.spines[side].set_visible(True)


## Correlation of data

In [None]:
# Load only the two sensor columns; passing `columns` to the reader avoids
# materialising the rest of the parquet file just to discard it again.
train_series = pd.read_parquet(path_sensor_parquet, columns=['enmo', 'anglez'])

# Pairwise correlation between ENMO and Angle-Z (DataFrame.corr defaults).
correlation = train_series.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)
plt.title('Correlation Matrix Heatmap')
plt.show()

## Repetitive patterns

In [None]:
# Events of the example series without incomplete rows. The explicit copy
# makes the frame independent of the query result, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
train_events = (
    pd.read_csv(path_events_csv)
    .query('series_id == @series')
    .dropna()
    .copy()
)

# Parse the timestamps, then strip the timezone information from every value
# (replace(tzinfo=None) keeps the wall-clock reading unchanged).
train_events['timestamp'] = pd.to_datetime(train_events['timestamp'])
train_events['timestamp'] = train_events['timestamp'].apply(lambda x: x.replace(tzinfo=None))

# Keep only events strictly inside the inspected period.
train_events = train_events[(train_events['timestamp'] > '2018-10-10 15:30:00') & (train_events['timestamp'] < '2018-10-22 15:30:00')]

onsets = train_events[train_events['event'] == 'onset']
wakeups = train_events[train_events['event'] == 'wakeup']

# Sensor data of the same series, restricted to a range of step numbers.
# NOTE(review): the step range is presumably meant to cover the event period
# selected above — confirm.
plot_data = pd.read_parquet(path_sensor_parquet, filters=[('series_id','=',series)])

# Copy the selection: columns are assigned to plot_data here and in later cells,
# which on a bare boolean-mask slice raises SettingWithCopyWarning.
plot_data = plot_data[(plot_data['step'] > 159000) & (plot_data['step'] < 260000)].copy()

plot_data['timestamp'] = pd.to_datetime(plot_data['timestamp'])
plot_data['timestamp'] = plot_data['timestamp'].apply(lambda x: x.replace(tzinfo=None))

In [None]:
# Angle-Z signal for the inspected period with the annotated events drawn as
# vertical lines (green: onset, orange: wakeup).
fig = plt.figure(figsize=(22, 4))
axis = fig.gca()

for event_time in onsets['timestamp']:
    axis.axvline(x=event_time, color='darkgreen', linestyle='-', label='onset event', linewidth=3.5)

for event_time in wakeups['timestamp']:
    axis.axvline(x=event_time, color='darkorange', linestyle='-', label='wakeup event', linewidth=3.5)

axis.plot(plot_data['timestamp'], plot_data['anglez'], label='Angle-Z', linewidth=0.5, color='black')

# Every event line carries the same label; keep the first handle per label so
# each legend entry appears only once.
first_handle_by_label = {}
for handle, label in zip(*axis.get_legend_handles_labels()):
    first_handle_by_label.setdefault(label, handle)

axis.set_ylim(-90, 90)

axis.legend(list(first_handle_by_label.values()), list(first_handle_by_label), fontsize=20)
axis.set_xlabel('Timestamp', fontsize=20, labelpad=15)
axis.set_ylabel('Angle-Z value', fontsize=20, labelpad=20)
axis.set_title('Angle-Z value in regular nights and repetetive pattern', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

In [None]:
# ENMO signal for the inspected period with the annotated events drawn as
# vertical lines (green: onset, orange: wakeup).
fig = plt.figure(figsize=(22, 4))
axis = fig.gca()

for event_time in onsets['timestamp']:
    axis.axvline(x=event_time, color='darkgreen', linestyle='-', label='onset event', linewidth=3.5)

for event_time in wakeups['timestamp']:
    axis.axvline(x=event_time, color='darkorange', linestyle='-', label='wakeup event', linewidth=3.5)

axis.plot(plot_data['timestamp'], plot_data['enmo'], label='ENMO', linewidth=0.5, color='black')

# Every event line carries the same label; keep the first handle per label so
# each legend entry appears only once.
first_handle_by_label = {}
for handle, label in zip(*axis.get_legend_handles_labels()):
    first_handle_by_label.setdefault(label, handle)

axis.set_ylim(0, 1)

axis.legend(list(first_handle_by_label.values()), list(first_handle_by_label), fontsize=20)
axis.set_xlabel('Timestamp', fontsize=20, labelpad=15)
axis.set_ylabel('ENMO value', fontsize=20, labelpad=20)
axis.set_title('ENMO value in regular nights and repetetive pattern', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

## Difference in repetitive nights

In [None]:
# Number of rows that make up 24 hours. This is the same stride the notebook
# uses to pick one row per day when sampling the series.
# NOTE(review): assumes one row every 5 seconds without gaps
# (24 * 60 * 60 / 5 = 17280) — confirm against the data.
steps_per_day = 24 * 60 * 60 // 5

# shift(+n) aligns every row with the value n rows EARLIER, shift(-n) with the
# value n rows LATER. The former code used -60 * 24 for "previous", i.e. it
# looked 1440 rows ahead instead of one day back.
enmo_one_day_earlier = plot_data['enmo'].shift(steps_per_day)
enmo_one_day_later = plot_data['enmo'].shift(-steps_per_day)

# assign() returns a new frame, so no column is written onto a filtered slice.
plot_data = plot_data.assign(
    enmo_diff_previous=(plot_data['enmo'] - enmo_one_day_earlier).abs(),
    enmo_diff_next=(plot_data['enmo'] - enmo_one_day_later).abs(),
)

plt.figure(figsize=(22, 4))

# Annotated events as vertical lines (green: onset, orange: wakeup).
for _, onset in onsets.iterrows():
    plt.axvline(x=onset['timestamp'], color='darkgreen', linestyle='-', label='onset event', linewidth=3.5)

for _, wakeup in wakeups.iterrows():
    plt.axvline(x=wakeup['timestamp'], color='darkorange', linestyle='-', label='wakeup event', linewidth=3.5)

plt.plot(plot_data['timestamp'], plot_data['enmo'], label='ENMO', linewidth=0.5, color='black')
plt.plot(plot_data['timestamp'], plot_data['enmo_diff_previous'], label='ENMO difference to previous 24 hours', linewidth=0.5, color='red')

# Every event line carries the same label; keep the first handle per label so
# each legend entry appears only once.
handles, labels = plt.gca().get_legend_handles_labels()
new_labels, new_handles = [], []
for handle, label in zip(handles, labels):
    if label not in new_labels:
        new_handles.append(handle)
        new_labels.append(label)

plt.ylim(-0.01, 1)

plt.legend(new_handles, new_labels, fontsize=20)
plt.xlabel('Timestamp', fontsize=20, labelpad=15)
plt.ylabel('ENMO value', fontsize=20, labelpad=20)
plt.title('Change in ENMO value compared with 24 hours before in regular nights and repetetive pattern', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

In [None]:

# Number of rows that make up 24 hours. This is the same stride the notebook
# uses to pick one row per day when sampling the series.
# NOTE(review): assumes one row every 5 seconds without gaps
# (24 * 60 * 60 / 5 = 17280) — confirm against the data.
steps_per_day = 24 * 60 * 60 // 5

# shift(+n) aligns every row with the value n rows EARLIER, shift(-n) with the
# value n rows LATER. The former code used -60 * 24 for "previous", i.e. it
# looked 1440 rows ahead instead of one day back.
anglez_one_day_earlier = plot_data['anglez'].shift(steps_per_day)
anglez_one_day_later = plot_data['anglez'].shift(-steps_per_day)

# The Angle-Z differences get their own column names instead of overwriting
# the 'enmo_diff_*' columns; assign() returns a new frame, so no column is
# written onto a filtered slice.
plot_data = plot_data.assign(
    anglez_diff_previous=(plot_data['anglez'] - anglez_one_day_earlier).abs(),
    anglez_diff_next=(plot_data['anglez'] - anglez_one_day_later).abs(),
)

plt.figure(figsize=(22, 4))

# Annotated events as vertical lines (green: onset, orange: wakeup).
for _, onset in onsets.iterrows():
    plt.axvline(x=onset['timestamp'], color='darkgreen', linestyle='-', label='onset event', linewidth=3.5)

for _, wakeup in wakeups.iterrows():
    plt.axvline(x=wakeup['timestamp'], color='darkorange', linestyle='-', label='wakeup event', linewidth=3.5)

plt.plot(plot_data['timestamp'], plot_data['anglez'], label='Angle-Z', linewidth=0.5, color='black')
plt.plot(plot_data['timestamp'], plot_data['anglez_diff_previous'], label='Angle-Z difference to previous 24 hours', linewidth=0.5, color='red')

# Every event line carries the same label; keep the first handle per label so
# each legend entry appears only once.
handles, labels = plt.gca().get_legend_handles_labels()
new_labels, new_handles = [], []
for handle, label in zip(handles, labels):
    if label not in new_labels:
        new_handles.append(handle)
        new_labels.append(label)

plt.ylim(-90, 90)

plt.legend(new_handles, new_labels, fontsize=20, loc='upper right')
plt.xlabel('Timestamp', fontsize=20, labelpad=15)
plt.ylabel('Angle-Z value', fontsize=20, labelpad=20)
plt.title('Change in Angle-Z value compared with 24 hours before in regular nights and repetetive pattern', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)