# EDA for Segment

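Exploratory data analysis of the segment data: per-individual counts of exclusions, recordings and segments, plus the mean recording duration and mean segment duration.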

In [1]:
# Automatically reload imported modules when their source files change
%load_ext autoreload
%autoreload 2

import sys

from pathlib import Path

# Append the parent of the current working directory to the import search
# path so that modules located there can be imported from this notebook.
path = str(Path.cwd().parent)
sys.path.append(path)

In [2]:
import seaborn as sns

from datatype.dataset import Dataset
from matplotlib import pyplot as plt

In [3]:
# Load the data through the project's Dataset wrapper.
# NOTE(review): what the 'ignore' argument selects is defined by Dataset,
# which is not visible from this notebook; confirm against that class.
dataset = Dataset('ignore')
dataframe = dataset.load()

In [4]:
# Distinct values of the `folder` column. The helper functions in this
# notebook iterate over this notebook-level name, and the plots use it as the
# y-axis categories (labelled "Individual").
folders = dataframe.folder.unique()

In [5]:
def get_exclusion_count(dataframe):
    """Sum the ``exclude`` column separately for every folder.

    The folders are taken from the notebook-level ``folders`` sequence, not
    from a parameter. For each entry, the rows of ``dataframe`` whose
    ``folder`` equals that entry are selected and their ``exclude`` values
    are summed.

    Returns:
        A list holding one sum per entry of ``folders``, in the same order.

    Note:
        Columns are read with attribute access, so a dataframe without an
        ``exclude`` column fails with ``AttributeError``.
    """
    return [
        dataframe[dataframe.folder == name].exclude.sum()
        for name in folders
    ]

In [6]:
def get_recording_count(dataframe):
    """Count the distinct ``filename`` values found in every folder.

    The folders are taken from the notebook-level ``folders`` sequence, not
    from a parameter.

    Returns:
        A list holding one count per entry of ``folders``, in the same order.
    """
    counts = []

    for name in folders:
        in_folder = dataframe.folder == name
        filenames = dataframe[in_folder].filename

        # One entry per distinct filename among this folder's rows.
        distinct = filenames.unique()
        counts.append(len(distinct))

    return counts

In [7]:
def get_segment_count(dataframe):
    """Count the rows (segments) that belong to every folder.

    The folders are taken from the notebook-level ``folders`` sequence, not
    from a parameter. For each entry, the occurrences of every ``filename``
    among that folder's rows are tallied and the tallies are added up.

    Returns:
        A list holding one total per entry of ``folders``, in the same order.
    """
    filenames_per_folder = (
        dataframe[dataframe.folder == name].filename
        for name in folders
    )

    # Adding up the per-filename tallies gives the folder's total row count.
    return [
        filenames.value_counts().sum()
        for filenames in filenames_per_folder
    ]

In [8]:
def get_segment_mean_duration(dataframe):
    """Average the ``duration`` column separately for every folder.

    The folders are taken from the notebook-level ``folders`` sequence, not
    from a parameter.

    Returns:
        A list holding one mean per entry of ``folders``, in the same order.
    """
    return [
        dataframe[dataframe.folder == name].duration.mean()
        for name in folders
    ]

In [9]:
def get_recording_mean_duration(dataframe):
    """Average the per-recording signal duration for every folder.

    The folders are taken from the notebook-level ``folders`` sequence, not
    from a parameter. Within a folder the rows are grouped by ``filename``
    and only the first ``signal`` of each group is used, so every recording
    contributes once regardless of how many rows refer to it. The duration
    is read from each signal object's ``duration`` attribute.

    Returns:
        A list holding one mean per entry of ``folders``, in the same order.
    """
    means = []

    for name in folders:
        rows = dataframe[dataframe.folder == name]

        # One row per recording: the first entry of each filename group.
        first_per_file = rows.groupby('filename').first()

        lengths = first_per_file.signal.apply(
            lambda signal: signal.duration
        )

        means.append(
            lengths.to_numpy().mean()
        )

    return means

In [10]:
# Horizontal bar chart: summed `exclude` values, one bar per folder.
exclusions = get_exclusion_count(dataframe)

fig, ax = plt.subplots(figsize=(18, 12))

ax.set(
    title='Segment Exclusion per Individual',
    xlabel='Exclusion',
    ylabel='Individual',
)

# Draw on the axes created above rather than relying on the implicit
# current axes.
sns.barplot(x=exclusions, y=folders, orient='h', ax=ax)

plt.show()
plt.close()

AttributeError: 'DataFrame' object has no attribute 'exclude'

In [None]:
# Horizontal bar chart: number of distinct filenames, one bar per folder.
recordings = get_recording_count(dataframe)

fig, ax = plt.subplots(figsize=(18, 12))

ax.set(
    title='Recordings per Individual',
    xlabel='Recordings',
    ylabel='Individual',
)

# Draw on the axes created above rather than relying on the implicit
# current axes.
sns.barplot(x=recordings, y=folders, orient='h', ax=ax)

plt.show()
plt.close()

In [None]:
# Horizontal bar chart: number of segment rows, one bar per folder.
segments = get_segment_count(dataframe)

fig, ax = plt.subplots(figsize=(18, 12))

ax.set(
    title='Segments per Individual',
    xlabel='Segments',
    ylabel='Individual',
)

# Draw on the axes created above rather than relying on the implicit
# current axes.
sns.barplot(x=segments, y=folders, orient='h', ax=ax)

plt.show()
plt.close()

In [None]:
# Horizontal bar chart: mean per-recording signal duration, one bar per folder.
durations = get_recording_mean_duration(dataframe)

fig, ax = plt.subplots(figsize=(18, 12))

ax.set(
    title='Mean Recording Duration per Individual',
    xlabel='Mean Recording Duration (s)',
    ylabel='Individual',
)

# Draw on the axes created above rather than relying on the implicit
# current axes.
sns.barplot(x=durations, y=folders, orient='h', ax=ax)

plt.show()
plt.close()

In [None]:
# Horizontal bar chart: mean of the `duration` column, one bar per folder.
durations = get_segment_mean_duration(dataframe)

fig, ax = plt.subplots(figsize=(18, 12))

ax.set(
    title='Mean Segment Duration per Individual',
    xlabel='Mean Segment Duration (s)',
    ylabel='Individual',
)

# Draw on the axes created above rather than relying on the implicit
# current axes.
sns.barplot(x=durations, y=folders, orient='h', ax=ax)

plt.show()
plt.close()