# Data Quality Assessment: Event Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

In [None]:
# Location of the events CSV, relative to the notebook's working directory.
path_events_csv = '../data/train_events.csv'

In [None]:
# Load the raw event table.
train_events = pd.read_csv(path_events_csv)

# Drop every row that has a missing value in any column.
train_events.dropna(inplace=True)

# Parse timestamps as naive wall-clock times (the UTC offset is discarded, the
# clock reading is kept). The offset suffix is removed from the text *before*
# parsing so that pd.to_datetime never sees mixed UTC offsets: parsing mixed
# offsets without utc=True is deprecated in pandas 2.x and an error in later
# versions, and it also forced a slow element-wise `apply` to drop the tzinfo.
# NOTE(review): assumes the column holds ISO-8601-like strings whose optional
# offset is a trailing 'Z' or '+HHMM' / '+HH:MM' -- confirm against the CSV.
wall_clock_text = train_events['timestamp'].str.replace(
    r'(?:Z|[+-]\d{2}:?\d{2})$', '', regex=True
)
train_events['timestamp'] = pd.to_datetime(wall_clock_text)

In [None]:
# Build a ydata-profiling report over the cleaned events table. As the cell's
# final expression, the report object is what the notebook displays.
ProfileReport(train_events)

In [None]:
# Hour-of-day distribution of onset and wakeup events.

# .copy() gives each subset its own data, so adding the 'hour' column below
# does not write into a slice of train_events (SettingWithCopyWarning).
df_onset = train_events[train_events['event'] == 'onset'].copy()
df_onset['hour'] = df_onset['timestamp'].dt.hour

df_wakeup = train_events[train_events['event'] == 'wakeup'].copy()
df_wakeup['hour'] = df_wakeup['timestamp'].dt.hour

# Explicit bin edges 0, 1, ..., 24 -> exactly one bin per clock hour, shared by
# both histograms. A bare bins=24 splits each series' own min..max range into
# 24 parts, so bins would not line up with whole hours or with each other.
hour_bins = np.arange(25)

# Semi-transparent bars keep the first histogram visible under the second.
plt.hist(df_onset['hour'].dropna(), bins=hour_bins, label='Onset', color='blue', alpha=0.6)
plt.hist(df_wakeup['hour'].dropna(), bins=hour_bins, label='Wakeup', color='orange', alpha=0.6)

plt.xlabel('Hour of Event')
plt.ylabel('Frequency')
# The plot shows both event types, so the title names both.
plt.title('Distribution of Hour of Onset and Wakeup')
plt.legend()

plt.show()

In [None]:
# Profiling report restricted to the 'onset' rows (df_onset is built in an
# earlier cell); displayed by the notebook as the cell's final expression.
ProfileReport(df_onset)

In [None]:
# Profiling report restricted to the 'wakeup' rows (df_wakeup is built in an
# earlier cell); displayed by the notebook as the cell's final expression.
ProfileReport(df_wakeup)

## Duration of sleep and awake windows

In [None]:
# step_diff: absolute distance in 'step' between each event and the NEXT event
# of the same series_id.
#
# A single grouped diff replaces a Python loop that re-scanned the whole frame
# with a boolean mask once per series. diff(-1) is `step - step.shift(-1)`
# computed within each group, so the values are the same as before, and the
# column is now also created when the frame is empty.
# NOTE(review): the gap is measured to the next remaining row whatever its
# event type -- presumably onset/wakeup rows alternate within a series; verify,
# since rows with missing values were dropped when the data was loaded.
gap_to_next_event = train_events.groupby('series_id')['step'].diff(-1).abs()

# The last event of every series has no successor (NaN); store that as 0.
train_events['step_diff'] = gap_to_next_event.fillna(0).astype(int)

train_events

In [None]:
# Histogram of step_diff for 'onset' rows, i.e. the gap from each onset to the
# following event of the same series (plotted as "sleeping windows").
# Rows with a gap of 0 or a gap of 14000 and above are left out.
is_onset_row = train_events['event'] == 'onset'
gap_below_cap = train_events['step_diff'] < 14000
gap_is_positive = train_events['step_diff'] > 0
df_onset = train_events[is_onset_row & gap_below_cap & gap_is_positive]

fig_onset, ax_onset = plt.subplots(figsize=(22, 4))

ax_onset.hist(
    df_onset['step_diff'].dropna(),
    bins=20,
    label='Onset',
    color='blue',
    edgecolor='none',
)

ax_onset.set_xlim(0, 14500)

ax_onset.set_xlabel('Duration of Event')
ax_onset.set_ylabel('Frequency')
ax_onset.set_title('Distribution of duration of sleeping windows')
ax_onset.legend()

plt.show()

In [None]:
# Histogram of step_diff for 'wakeup' rows, i.e. the gap from each wakeup to
# the following event of the same series (plotted as "awake windows").
# Only gaps strictly between 0 and 14000 are kept.
wakeup_selector = (
    (train_events['event'] == 'wakeup')
    & (train_events['step_diff'] < 14000)
    & (train_events['step_diff'] > 0)
)
df_wakeup = train_events.loc[wakeup_selector]

fig_wakeup, ax_wakeup = plt.subplots(figsize=(22, 4))

awake_gaps = df_wakeup['step_diff'].dropna()
ax_wakeup.hist(awake_gaps, bins=20, label='Wakeup', color='orange', edgecolor='none')

ax_wakeup.set_xlim(0, 14500)

ax_wakeup.set_xlabel('Duration of Event')
ax_wakeup.set_ylabel('Frequency')
ax_wakeup.set_title('Distribution of duration of awake windows')
ax_wakeup.legend()

plt.show()

## Correlation of data

In [None]:
# Pairwise correlation between the numeric columns of the events table, drawn
# as an annotated heatmap.
#
# numeric_only=True is required: the frame also holds non-numeric columns (the
# 'event' labels and the parsed 'timestamp'), and since pandas 2.0 a bare
# DataFrame.corr() no longer skips such columns silently -- it raises when it
# cannot convert them to float.
correlation_matrix = train_events.corr(numeric_only=True)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)
plt.title('Correlation Matrix Heatmap')
plt.show()